-
Notifications
You must be signed in to change notification settings - Fork 61
/
parse.py
155 lines (132 loc) · 5.03 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/python3
import re
import os
import json
import datetime,pytz
import collections
import itertools
# Locate the Google Takeout export on disk. Each history file has a German
# and an English relative path, depending on the account language of the
# export; the first one that exists wins.
missing = []

# NOTE(review): `dir` shadows the builtin, but the name is kept because it is
# an existing module-level variable that later code may rely on.
dir = os.path.join(os.getcwd(), "Takeout/YouTube/")
if not os.path.exists(dir):
    missing.append(dir)


def _first_existing(candidates):
    """Return ``(path, found)`` for the first candidate that exists under `dir`.

    If none of the candidates exists, returns the last candidate tried
    (so it can be reported in the error message) and ``False``.
    """
    path = None
    for rel in candidates:
        path = os.path.join(dir, rel)
        if os.path.exists(path):
            return path, True
    return path, False


watch_history, _ok = _first_existing(
    ("Verlauf/Wiedergabeverlauf.html", "history/watch-history.html"))
if not _ok:
    missing.append(watch_history)

search_history, _ok = _first_existing(
    ("Verlauf/Suchverlauf.html", "history/search-history.html"))
if not _ok:
    missing.append(search_history)

comments_history, _ok = _first_existing(
    ("Meine Kommentare/Meine Kommentare.html", "my-comments/my-comments.html"))
if not _ok:
    missing.append(comments_history)

like_history, _ok = _first_existing(
    ("Playlists/Positive Bewertungen.json", "playlists/likes.json"))
if not _ok:
    missing.append(like_history)
del _ok

if missing:
    # Abort early with every missing path listed, rather than failing one
    # file-open at a time later on.
    raise OSError("Required directories do not exist: %s" % (missing,))
del missing
class HTML:
    """Parser for a Google Takeout YouTube export.

    The export files are read ONCE, at class-definition time, from the
    module-level paths resolved above (``watch_history``, ``search_history``,
    ``comments_history``); the parsed text is held in class attributes shared
    by all instances. Both German- and English-language exports are supported
    (the regexes below come in pairs, one per language).
    """

    # html_watch / html_search / html_comment: raw HTML of the exported
    # watch history, search history, and comment history respectively.
    with open(watch_history, "r", encoding="utf-8") as f:
        html_watch = f.read()
    with open(search_history, "r", encoding="utf-8") as f:
        html_search = f.read()
    try:
        with open(comments_history, "r", encoding="utf-8") as f:
            html_comment = f.read()
    except OSError:
        # Keep the attribute defined so comment_history() degrades gracefully
        # (returns no matches) instead of raising AttributeError later.
        html_comment = ""
        print("Could not parse comments.")

    def find_links(self):
        """Return all watched-video URLs found in the watch-history HTML.

        Tries the English ("Watched ...") and the German ("... angesehen")
        markup in turn and concatenates the results.
        """
        links = []
        # If you want to understand these regexes, go to regex101.com.
        for translation in (
            r"""Watched\xa0<a href=\"([^\"]*)\">[^<]*<\/a>""",
            r"""<a href=\"([^\"]*)\">[^<]*<\/a>\xa0angesehen""",
        ):
            links += self.raw_find_links(translation)
        return links

    def raw_find_links(self, translation):
        """Return the list of string matches of `translation` in the watch HTML.

        Non-string matches (e.g. tuples from multi-group patterns) are
        filtered out so they cannot break downstream processing.
        """
        pattern = re.compile(translation)
        matchList = pattern.findall(str(self.html_watch))
        return [match for match in matchList if isinstance(match, str)]

    def find_times(self):
        """Return timezone-aware datetimes for every watch event.

        Each tuple below is (regex, %-template, strptime format) for one
        export language: English 12-hour timestamps and German 24-hour
        timestamps. The last regex group captures the timezone name.
        """
        times = []
        for translation in (
            (
                r"""<\/a><br><a href=\"[^\"]*\">[^<]*<\/a><br>(\D*) (\d\d?), (\d\d\d\d), (\d\d?):(\d\d?):(\d\d?) (AM|PM) ([^<]*)<\/div>""",
                "%s %s, %s, %s:%s:%s %s",
                "%b %d, %Y, %I:%M:%S %p",
            ),
            (
                r"""\xa0angesehen<br><a href=\"[^\"]*\">[^<]*<\/a><br>(\d\d?)\.(\d\d?)\.(\d\d\d\d), (\d\d?):(\d\d?):(\d\d?) ([^<]*)<\/div>""",
                "%s.%s.%s %s:%s:%s",
                "%d.%m.%Y %H:%M:%S",
            ),
        ):
            times += self.raw_find_times(*translation)
        return times

    def raw_find_times(self, regex, timegex, timegex2):
        """Parse timestamps matched by `regex` into aware datetime objects.

        `timegex` rebuilds a date string from the regex groups (all but the
        last, which is the timezone name); `timegex2` is the strptime format.
        """
        pattern = re.compile(regex)
        matchList = pattern.findall(str(self.html_watch))
        times = []
        for time in matchList:
            naive = datetime.datetime.strptime(timegex % (time[:-1]), timegex2)
            # Localize in the timezone the export itself reports.
            times.append(pytz.timezone(time[-1]).localize(naive))
        return times

    def _find_times(self):
        """
        Find and format times within the HTML file.

        Returns
        -------
        times : List[str]
            e.g. "19 Feb 2013, 11:56:19 UTC Tue"
        """
        # BUGFIX: previously called self._find_times_datetime(), a method
        # that does not exist anywhere in this file; find_times() is the
        # method that returns the datetime objects formatted here.
        times = [
            datetime_obj.strftime("%d %b %Y, %H:%M:%S UTC %a")
            for datetime_obj in self.find_times()
        ]
        return times

    def search_history(self):
        """Return ``(search_raw, search_clean)`` from the search-history HTML.

        search_raw is a list of searches, each split on '+' into words;
        search_clean is the flat word list with URL-escaped ('%'-containing)
        words removed.
        """
        search_raw = []
        search_clean = []
        pattern = re.compile(r"search_query=[^%].*?>")
        match_list = pattern.findall(str(HTML.html_search))
        for match in match_list:
            # Strip the 'search_query=' prefix and the trailing '">'.
            match = match[13:][:-2]
            match = match.split("+")
            search_raw.append(match)
        for word in list(itertools.chain.from_iterable(search_raw)):
            if "%" not in word:
                search_clean.append(word)
        return search_raw, search_clean

    def comment_history(self):
        """Return ``(link, match_list)`` for the last commented-on video.

        Returns None when no comment links are present (e.g. the comments
        file could not be read).
        """
        pattern = re.compile(r"""<a href=['"].*?['"]>""")
        match_list = pattern.findall(str(HTML.html_comment))
        try:
            # Strip the '<a href="' prefix and the trailing '">'.
            link = match_list[-1][9:][:-2]
        except IndexError:
            return None
        return link, match_list

    def like_history(self):
        """Return ``(link, match_list)`` for the most recently liked video.

        Reads the likes JSON and scrapes video IDs with a regex over its
        string representation; the 11-character YouTube ID follows the
        'videoId' key. NOTE(review): this assumes the Takeout JSON layout —
        verify against a current export.
        """
        with open(like_history, "rb") as f:
            data = json.load(f)
        pattern = re.compile(r"videoId.{15}")
        match_list = pattern.findall(str(data))
        link = r"https://www.youtube.com/watch?v=" + match_list[-1][11:]
        return link, match_list

    def dataframe_heatmap(self, day):
        """Count watch events on weekday `day` in twelve two-hour buckets.

        Parameters
        ----------
        day : int
            Weekday as datetime.weekday() (0 = Monday ... 6 = Sunday).

        Returns
        -------
        list[int]
            watchtimes[i] counts events with i*2 <= hour < i*2 + 2.
        """
        times = self.find_times()
        watchtimes = [0] * 12
        for time in times:
            if time.weekday() == day:
                # BUGFIX: was (hour//2) - hour%2, which sent hour 1 to
                # index -1 (wrapping to the last bucket) and shifted every
                # odd hour down one bucket; plain hour//2 is the intended
                # two-hour binning.
                watchtimes[time.hour // 2] += 1
        return watchtimes