Guest User

youtubeChannelCrawler.py

a guest
Oct 23rd, 2018
1,896
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.36 KB | None | 0 0
  1. from selenium import webdriver
  2. import argparse
  3. import json
  4. import re
  5. import time
  6. import os
  7.  
  8.  
  9. # ------------------------------------------------------------------------------------------
  10. # GLOBAL VARS
  11. # ------------------------------------------------------------------------------------------
  12.  
  13.  
# JavaScript snippet executed in the browser to grab YouTube's page-data
# object (ytInitialData), which holds the tab's listings as JSON.
JS_GET_YOUTUBE_DATA_VARIABLE = 'return window["ytInitialData"]'

# Base site URL plus the path suffixes for the two channel tabs we crawl.
URL_YOUTUBE_BASE = 'https://www.youtube.com'
URL_PLAYLISTS_ROOT = "/playlists"
URL_VIDEOS_ROOT = "/videos"

# JSON key searched for in ytInitialData, and the URL prefix used to turn
# each found ID back into a browsable link — one pair per content type.
PLAYLIST_VALUES_KEY = 'playlistId'
PLAYLIST_URL_PARAMETER = '/playlist?list='
VIDEO_VALUES_KEY = 'videoId'
VIDEO_URL_PARAMETER = '/watch?v='

# Discriminator values passed around to select playlist vs. video scraping.
VIEW_TYPE_PLAYLISTS = "VIEW_TYPE_PLAYLISTS"
VIEW_TYPE_VIDEOS = "VIEW_TYPE_VIDEOS"

# Mutable module state, filled in as the script runs:
# url_youtube_channel — '/user/<name>' or '/channel/<id>' part of the input URL
# collected_ids      — full URLs gathered from both tabs
# args               — parsed command-line arguments
# browser            — the Selenium webdriver instance
url_youtube_channel = ''
collected_ids = []
args = None
browser = None
  32.  
  33.  
  34. # ------------------------------------------------------------------------------------------
  35. #  GENERAL HELPER FUNCTIONS
  36. # ------------------------------------------------------------------------------------------
  37.  
  38.  
  39. def setup_arguments():
  40.     global args
  41.     parser = argparse.ArgumentParser()
  42.     parser.add_argument('url', type=str)
  43.     args = parser.parse_args()
  44.  
  45.  
  46. def remove_duplicates_from_list(param_values):
  47.     output = []
  48.     seen = set()
  49.     for value in param_values:
  50.         # If value has not been encountered yet add it to both list and set.
  51.         if value not in seen:
  52.             output.append(value)
  53.             seen.add(value)
  54.     return output
  55.  
  56.  
  57. def find_values_in_json(param_value_id, param_json_repr):
  58.     results = []
  59.  
  60.     def _decode_dict(a_dict):
  61.         try:
  62.             results.append(a_dict[param_value_id])
  63.         except KeyError:
  64.             pass
  65.         return a_dict
  66.     json.loads(param_json_repr, object_hook=_decode_dict)  # Return value ignored.
  67.     return results
  68.  
  69.  
  70. def save_urls_to_file():
  71.     print("Saving URLs to file.")
  72.     with open('urls.txt', 'w') as f:
  73.         for item in collected_ids:
  74.             f.write("%s\n" % item)
  75.  
  76.  
  77. # ------------------------------------------------------------------------------------------
  78. # BROWSER
  79. # ------------------------------------------------------------------------------------------
  80.  
  81.  
  82. def initiate_browser():
  83.     print("Initiating browser.")
  84.     global browser
  85.  
  86.     # Change this to use a different browser.
  87.     # Don't forget to get the right webdriver to go along with it.
  88.     browser = webdriver.Firefox()
  89.  
  90.     if browser:
  91.         # Change this to the biggest size you can get away with for less scrolling.
  92.         browser.set_window_size(1024, 10000)
  93.         print("Browser active.")
  94.  
  95.  
  96. def scroll_down_to_load_additional_content():
  97.     # Youtube only loads additional videos/playlists once you've scrolled to the bottom of the page.
  98.     # That's what we're doing in the following function.
  99.  
  100.     print("Scroll to load additional content.")
  101.  
  102.     js_get_page_height = ("return Math.max(document.body.scrollHeight," +
  103.                                                 "document.body.offsetHeight," +
  104.                                                 "document.documentElement.clientHeight," +
  105.                                                 "document.documentElement.scrollHeight," +
  106.                                                 "document.documentElement.offsetHeight );")
  107.  
  108.     # I didn't bother looking for a clean way to determine when youtube
  109.     # has finished pulling new content, so I just wait 4 seconds. Seems to work fine so far.
  110.     scroll_pause_time = 4
  111.  
  112.     current_page_height = browser.execute_script(js_get_page_height)
  113.  
  114.     while True:
  115.         browser.execute_script("window.scrollTo(0,"+str(current_page_height)+");")
  116.         time.sleep(scroll_pause_time)
  117.         new_page_height = browser.execute_script(js_get_page_height)
  118.         if new_page_height == current_page_height:
  119.             break
  120.         current_page_height = new_page_height
  121.  
  122.  
  123. def close_browser():
  124.     print("Closing browser.")
  125.     browser.quit()
  126.  
  127.  
  128. # ------------------------------------------------------------------------------------------
  129. # SCRAPING
  130. # ------------------------------------------------------------------------------------------
  131.  
  132.  
  133. def open_tab(param_root):
  134.     print("\nOpen tab: "+param_root)
  135.     print('Waiting for page to be fully loaded.')
  136.  
  137.     final_url = URL_YOUTUBE_BASE + url_youtube_channel + param_root
  138.     browser.get(final_url)
  139.  
  140.     data = browser.execute_script(JS_GET_YOUTUBE_DATA_VARIABLE)
  141.     all_urls_temp = find_values_in_json('url', json.dumps(data))
  142.     views = []
  143.     for url in all_urls_temp:
  144.         if "view=" in url:
  145.             view_id = re.search("(?<=view=)(\d+)", url).group(0)
  146.             views.append(url_youtube_channel+param_root+'?view='+view_id+'&flow=grid')
  147.     views = remove_duplicates_from_list(views)
  148.     return views
  149.  
  150.  
  151. def parse_views(param_view_url_list, param_view_type):
  152.     for view in param_view_url_list:
  153.         open_view(view, param_view_type)
  154.  
  155.  
  156. def open_view(param_view_url_list, param_view_type):
  157.     global collected_ids
  158.  
  159.     url_parameter = ""
  160.     find_values_key = ""
  161.  
  162.     if param_view_type == VIEW_TYPE_PLAYLISTS:
  163.         find_values_key = PLAYLIST_VALUES_KEY
  164.         url_parameter = PLAYLIST_URL_PARAMETER
  165.     elif param_view_type == VIEW_TYPE_VIDEOS:
  166.         find_values_key = VIDEO_VALUES_KEY
  167.         url_parameter = VIDEO_URL_PARAMETER
  168.  
  169.     url = URL_YOUTUBE_BASE+param_view_url_list
  170.     print("Opening view: "+url)
  171.     browser.get(url)
  172.  
  173.     scroll_down_to_load_additional_content()
  174.  
  175.     data = browser.execute_script(JS_GET_YOUTUBE_DATA_VARIABLE)
  176.     all_ids = find_values_in_json(find_values_key, json.dumps(data))
  177.     all_ids = remove_duplicates_from_list(all_ids)
  178.  
  179.     for found_id in all_ids:
  180.         collected_ids.append(URL_YOUTUBE_BASE + url_parameter + found_id)
  181.  
  182.  
  183. def clean_channel_url():
  184.     global url_youtube_channel
  185.     url_youtube_channel = re.search('/(user|channel)/[^/]*', args.url).group(0)
  186.  
  187.  
  188. def run_youtube_dl():
  189.     print("Run youtube-dl.")
  190.     os.system('youtube-dl.exe --download-archive downloaded.txt -civ -f bestvideo+bestaudio --merge-output-format mkv -o "%(playlist)s/%(upload_date)s___%(title)s___%(id)s.%(ext)s" --batch-file urls.txt')
  191.  
  192.  
  193. # ------------------------------------------------------------------------------------------
  194. # MAIN
  195. # ------------------------------------------------------------------------------------------
  196.  
  197.  
if __name__ == "__main__":
    # Parse the channel URL from the command line and normalize it.
    setup_arguments()
    clean_channel_url()
    initiate_browser()

    # Collect URLs from the videos tab.
    parse_views(open_tab(URL_VIDEOS_ROOT), VIEW_TYPE_VIDEOS)

    # Collect URLs from the playlists tab.
    parse_views(open_tab(URL_PLAYLISTS_ROOT), VIEW_TYPE_PLAYLISTS)

    # The browser is no longer needed now.
    close_browser()

    # The same item can appear in multiple views, so de-duplicate once more.
    collected_ids = remove_duplicates_from_list(collected_ids)

    print("\n" + str(len(collected_ids)) + " URLs collected:\n" + str(collected_ids) + '\n')

    # Persist the list, then hand it off to youtube-dl for downloading.
    save_urls_to_file()

    run_youtube_dl()
Advertisement
Add Comment
Please, Sign In to add comment