Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #https://github.com/A3M4/Personal-YouTube-PDF-Report-Generator/blob/master/parse.py
import collections
import datetime
import itertools
import json
import os
import re
# Base folder of the YouTube export: the "Takeout/YouTube/" directory under
# the current working directory. Plain string concatenation is kept on
# purpose so the resulting path strings are unchanged on every platform.
Dir = "".join([os.getcwd(), "/Takeout/YouTube/"])

# Export files consumed by the parser, all relative to ``Dir``.
watchHistory = Dir + 'history/watch-history.html'
searchHistory = Dir + 'history/search-history.html'
commentHistory = Dir + 'my-comments/my-comments.html'
likeHistory = Dir + 'playlists/likes.json'
# Regular expressions used by HTML's methods, compiled once at import time
# instead of on every call. The pattern texts are unchanged.
_WATCH_LINK_RE = re.compile(r'Watched.<.*?>')
_TIMESTAMP_RE = re.compile(
    r'(?:[A-Za-z]{3}\s\d{1,2}\,\s[0-9]{4}\,|\d{1,2}\s.{9})'
    r'\s\d?\d:\d\d:\d\d\s(?:PM\s|AM\s)?[A-Z]{3,4}'
)
_SEARCH_QUERY_RE = re.compile(r'search_query=[^%].*?>')
_ANCHOR_RE = re.compile(r'<a href=".*?">')
_VIDEO_ID_RE = re.compile(r'videoId.{15}')


def _read_text(path):
    """Return the full UTF-8 text of *path*, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


def _normalise_timestamp(stamp):
    """Convert one timestamp matched by ``_TIMESTAMP_RE`` to a fixed layout.

    Two input layouts are accepted:

    * month first, e.g. ``"Jan 5, 2020, 3:04:05 PM EST"``
    * day first, e.g. ``"5 Jan 2020, 15:04:05 GMT"``

    The result always has a 13-character date prefix, a zero-padded 24-hour
    clock at positions 13-20, the time-zone abbreviation and finally the
    three-letter weekday, e.g. ``"Jan 05 2020, 15:04:05 EST Sun"`` or
    ``"05 Jan 2020, 15:04:05 GMT Sun"``. ``dataframe_heatmap`` relies on the
    hour being at ``[13:15]`` and the weekday being the last three characters.

    Raises:
        ValueError: if the date or clock part cannot be parsed.
    """
    strptime = datetime.datetime.strptime

    if stamp[0].isalpha():
        # Month-first layout. A single-digit day has its comma at index 5,
        # so index 6 is not a comma: zero-pad the day.
        if stamp[6] != ",":
            stamp = stamp[:4] + "0" + stamp[4:]
        weekday = strptime(stamp[:12], '%b %d, %Y').strftime('%a')
        # Drop the comma after the day so both layouts share a 13-char prefix.
        stamp = stamp[:6] + stamp[7:]
    else:
        # Day-first layout. Decide on the day's digit count itself rather
        # than on the total length, which also varies with the zone length
        # (3 or 4 letters) and used to make "5 Jan 2020, 15:04:05 CEST" crash.
        if not stamp[1].isdigit():
            stamp = "0" + stamp
        weekday = strptime(stamp[:11], '%d %b %Y').strftime('%a')

    # Everything after the date prefix: clock, optional AM/PM marker, zone.
    tokens = stamp[13:].split()
    clock, zone = tokens[0], tokens[-1]
    if len(tokens) == 3:
        parsed = strptime(clock + ' ' + tokens[1], "%I:%M:%S %p")
    else:
        # The pattern makes AM/PM optional; without it the clock is 24-hour.
        parsed = strptime(clock, "%H:%M:%S")

    # The full zone token is kept (previously a 4-letter zone was cut to its
    # last three letters in the month-first layout).
    return stamp[:13] + parsed.strftime("%H:%M:%S") + ' ' + zone + ' ' + weekday


class HTML:
    """Extract links, timestamps, searches, comments and likes from the export.

    The watch-history and search-history files are read when the class body
    executes; if either cannot be read the exception propagates. The comments
    file is optional: when it cannot be read, ``htmlComment`` is simply not
    defined and ``commentHistory()`` returns ``None``.

    Note that the methods ``searchHistory``, ``commentHistory`` and
    ``likeHistory`` share their names with the module-level path constants.
    Inside the class body (before the methods are defined) and inside method
    bodies those names still resolve to the module-level path strings.
    """

    htmlWatch = _read_text(watchHistory)
    htmlSearch = _read_text(searchHistory)
    try:
        htmlComment = _read_text(commentHistory)
    except (OSError, ValueError):
        # Best effort: a missing, unreadable or undecodable comments file is
        # tolerated (UnicodeDecodeError is a ValueError).
        pass

    def find_links(self):
        """Return the first double-quoted value of every ``Watched.<...>`` tag.

        Raises:
            IndexError: if a matched tag contains no double-quoted value.
        """
        return [tag.split('"')[1]
                for tag in _WATCH_LINK_RE.findall(str(HTML.htmlWatch))]

    def find_times(self):
        """Return every watch timestamp in normalised form.

        Each entry looks like ``"Jan 05 2020, 15:04:05 EST Sun"`` or
        ``"05 Jan 2020, 15:04:05 GMT Sun"``; see ``_normalise_timestamp``.
        """
        return [_normalise_timestamp(stamp)
                for stamp in _TIMESTAMP_RE.findall(str(HTML.htmlWatch))]

    def searchHistory(self):
        """Return ``(searchRaw, searchClean)`` built from the search history.

        ``searchRaw`` is a list with one list of ``+``-separated words per
        matched query; ``searchClean`` is the flattened list of those words
        that contain no ``%`` character.
        """
        prefix_len = len('search_query=')
        # NOTE(review): three trailing characters are stripped here, while
        # commentHistory() strips two for the closing '">'. If the search
        # links also end in '">', this drops the last character of every
        # query. Behaviour is kept as-is; confirm against a real export.
        searchRaw = [
            match[prefix_len:][:-3].split('+')
            for match in _SEARCH_QUERY_RE.findall(str(HTML.htmlSearch))
        ]
        searchClean = [
            word
            for word in itertools.chain.from_iterable(searchRaw)
            if '%' not in word
        ]
        return searchRaw, searchClean

    def commentHistory(self):
        """Return ``(link, matchList)`` for the comments file, or ``None``.

        ``matchList`` holds every ``<a href="...">`` tag found and ``link`` is
        the href value of the last one. ``None`` is returned when the comments
        file was not loaded or contains no such tag.
        """
        try:
            matchList = _ANCHOR_RE.findall(str(HTML.htmlComment))
            # Strip the leading '<a href="' (9 chars) and the trailing '">'.
            link = matchList[-1][9:][:-2]
        except (AttributeError, IndexError):
            return None
        return link, matchList

    def likeHistory(self):
        """Return ``(link, matchList)`` built from the likes JSON file.

        ``matchList`` holds every ``videoId`` occurrence (plus the 15
        characters that follow it) found in the string form of the parsed
        JSON; ``link`` is a watch URL built from the last occurrence.

        Raises:
            OSError: if the likes file cannot be opened.
            IndexError: if no ``videoId`` occurrence is found.
        """
        # ``likeHistory`` here is the module-level path, not this method.
        with open(likeHistory, 'rb') as f:
            data = json.load(f)
        matchList = _VIDEO_ID_RE.findall(str(data))
        # Skip "videoId" and the 4 characters after it; the rest is appended.
        link = r"https://www.youtube.com/watch?v=" + matchList[-1][11:]
        return link, matchList

    def dataframe_heatmap(self, day):
        """Count watched videos per two-hour slot for one weekday.

        Args:
            day: three-letter weekday abbreviation as found at the end of the
                strings returned by ``find_times()`` (e.g. ``'Mon'``).

        Returns:
            A list of 12 integers: index 0 covers hours 0-1, index 1 hours
            2-3, and so on up to index 11 for hours 22-23.
        """
        # Key = weekday (3 chars) + hour (2 chars), e.g. "Sun15".
        hourly = collections.Counter(
            stamp[-3:] + stamp[13:15] for stamp in self.find_times()
        )
        matching = [(key, count) for key, count in hourly.items()
                    if key[0:3] == day]
        matching.sort(key=lambda item: int(item[0][3:5]))

        # Diagnostic output kept from the original implementation.
        print([key + ' ' + str(count) for key, count in matching])

        slots = [0] * 12
        for key, count in matching:
            # Anything past hour 21 lands in the last slot, as before.
            slots[min(int(key[3:5]) // 2, 11)] += count
        return slots
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement