Advertisement
Guest User

Untitled

a guest
Dec 14th, 2019
207
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.93 KB | None | 0 0
  1. #https://github.com/A3M4/Personal-YouTube-PDF-Report-Generator/blob/master/parse.py
  2. import re
  3. import os
  4. import json
  5. import datetime
  6. import itertools
  7. import collections
  8.  
  9.  
  10. Dir = os.getcwd() + "/Takeout/YouTube/"
  11. watchHistory = Dir + 'history/watch-history.html'
  12. searchHistory = Dir + 'history/search-history.html'
  13. commentHistory = Dir + 'my-comments/my-comments.html'
  14. likeHistory = Dir + 'playlists/likes.json'
  15.  
  16.  
  17.  
  18. class HTML:
  19.  
  20.     htmlWatch = open(watchHistory, 'r', encoding='utf-8').read()
  21.     htmlSearch = open(searchHistory, 'r', encoding='utf-8').read()
  22.     try:
  23.         htmlComment = open(commentHistory, 'r', encoding='utf-8').read()
  24.     except: pass
  25.  
  26.     def find_links(self):
  27.         # search all links based on your personal html file
  28.         links = []
  29.         pattern = re.compile(r'Watched.<.*?>')
  30.         matchList = pattern.findall(str(HTML.htmlWatch))
  31.  
  32.         # save links into list
  33.         for match in matchList:
  34.             match = match.split('"')[1]
  35.             links.append(match)
  36.         return links
  37.  
  38.  
  39.  
  40.     def find_times(self):
  41.         times = []
  42.         pattern = re.compile(r'(?:[A-Za-z]{3}\s\d{1,2}\,\s[0-9]{4}\,|\d{1,2}\s.{9})\s\d?\d:\d\d:\d\d\s(?:PM\s|AM\s)?[A-Z]{3,4}')
  43.         matchList = pattern.findall(str(HTML.htmlWatch))
  44.  
  45.         # add '0' to the beginning of the string to make all string same length
  46.         for time in matchList:
  47.             if time[0].isalpha():
  48.                 if time[6] != ",":
  49.                     time = time[:4] + "0" + time[4:]               
  50.                 dayOfWeek = datetime.datetime.strptime(time[0:12], '%b %d, %Y').strftime('%a')
  51.                 time = time[:6] + time[7:]
  52.                 dt = datetime.datetime.strptime(time[12:24].strip(), "%I:%M:%S %p")
  53.                 times.append(time[:13] + dt.strftime("%H:%M:%S") + ' ' + time[-3:] + ' ' + dayOfWeek)
  54.             else:
  55.                 if len(time) == 24:
  56.                     time = str(0) + time
  57.                     # add the day of week to the end of strings
  58.                     dayOfWeek = datetime.datetime.strptime(time[0:11], '%d %b %Y').strftime('%a')
  59.                     times.append(time + ' ' + dayOfWeek)
  60.                 else:
  61.                     # add the day of week to the end of strings
  62.                     dayOfWeek = datetime.datetime.strptime(time[0:11], '%d %b %Y').strftime('%a')
  63.                     times.append(time + ' ' + dayOfWeek)
  64.         return times
  65.  
  66.  
  67.  
  68.     def searchHistory(self):
  69.         searchRaw = []
  70.         searchClean = []
  71.         pattern = re.compile(r'search_query=[^%].*?>')
  72.         matchList = pattern.findall(str(HTML.htmlSearch))
  73.  
  74.         # save links into list
  75.         for match in matchList:
  76.             match = match[13:][:-3]
  77.             match = match.split('+')
  78.             searchRaw.append(match)
  79.         for word in list(itertools.chain.from_iterable(searchRaw)):
  80.             if '%' not in word:
  81.                 searchClean.append(word)
  82.         return searchRaw, searchClean
  83.  
  84.  
  85.  
  86.     def commentHistory(self):
  87.         try:
  88.             pattern = re.compile(r'<a href=".*?">')
  89.             matchList = pattern.findall(str(HTML.htmlComment))
  90.             link = matchList[-1][9:][:-2]
  91.             return link, matchList
  92.         except:
  93.             pass
  94.  
  95.  
  96.  
  97.     def likeHistory(self):
  98.         with open(likeHistory, 'rb') as f:
  99.             data = json.load(f)
  100.             pattern = re.compile(r'videoId.{15}')
  101.             matchList = pattern.findall(str(data))
  102.             link = r"https://www.youtube.com/watch?v=" + matchList[-1][11:]
  103.             return link, matchList
  104.  
  105.  
  106.  
  107.     def dataframe_heatmap(self,day):
  108.         timeWeeks = []
  109.         daytime = []
  110.         times = self.find_times()
  111.         for time in times:
  112.             timeWeek = time[-3:]+time[13:15]
  113.             timeWeeks.append(timeWeek)
  114.         freq = collections.Counter(timeWeeks)
  115.         for k, v in freq.items():
  116.             if k[0:3] == day:
  117.                 daytime.append(str(k)+' '+str(v))
  118.         daytime.sort(key=lambda x: int(str(x)[3:5]))
  119.         print(daytime)
  120.  
  121.         zero_one = 0
  122.         two_three = 0
  123.         four_five = 0
  124.         six_seven= 0
  125.         eight_nine = 0
  126.         ten_eleven = 0
  127.         twelve_thirteen = 0
  128.         fourteen_fifteen = 0
  129.         sixteen_seventeen = 0
  130.         eighteen_nineteen = 0
  131.         twenty_twentyone = 0
  132.         twentytwo_twentythree = 0
  133.  
  134.         for i in daytime:
  135.             if int(i[3:5]) in range(0, 2):
  136.                 zero_one = zero_one + int(i.split(' ')[1])
  137.             elif int(i[3:5]) in range(2, 4):
  138.                 two_three = two_three + int(i.split(' ')[1])
  139.             elif int(i[3:5]) in range(4, 6):
  140.                 four_five = four_five + int(i.split(' ')[1])
  141.             elif int(i[3:5]) in range(6, 8):
  142.                 six_seven = six_seven + int(i.split(' ')[1])
  143.             elif int(i[3:5]) in range(8, 10):
  144.                 eight_nine = eight_nine + int(i.split(' ')[1])
  145.             elif int(i[3:5]) in range(10, 12):
  146.                 ten_eleven = ten_eleven + int(i.split(' ')[1])
  147.             elif int(i[3:5]) in range(12, 14):
  148.                 twelve_thirteen = twelve_thirteen + int(i.split(' ')[1])
  149.             elif int(i[3:5]) in range(14, 16):
  150.                 fourteen_fifteen = fourteen_fifteen + int(i.split(' ')[1])
  151.             elif int(i[3:5]) in range(16, 18):
  152.                 sixteen_seventeen = sixteen_seventeen + int(i.split(' ')[1])
  153.             elif int(i[3:5]) in range(18, 20):
  154.                 eighteen_nineteen = eighteen_nineteen + int(i.split(' ')[1])
  155.             elif int(i[3:5]) in range(20, 22):
  156.                 twenty_twentyone = twenty_twentyone + int(i.split(' ')[1])
  157.             else:
  158.                 twentytwo_twentythree = twentytwo_twentythree + int(i.split(' ')[1])
  159.  
  160.         return ([zero_one, two_three, four_five, six_seven, eight_nine, ten_eleven, twelve_thirteen, fourteen_fifteen,
  161.                  sixteen_seventeen, eighteen_nineteen, twenty_twentyone, twentytwo_twentythree])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement