Try95th

Get YouTube Playlist Data [requests+bs4]

Apr 5th, 2023 (edited)
164
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.44 KB | None | 0 0
  1. ## return some data (including a list of videos) from a given YouTube playlist link #####
  2. ### [ screenshot of an example with pandas at https://i.stack.imgur.com/UKjp3.png ] #####
  3. ### [ another example at bottom - with output at https://pastebin.com/wdZWm6q1 ] ########
  4.  
  5. import json
  6. import requests
  7. from bs4 import BeautifulSoup
  8.  
  9. def scrape_playlist_data(playlist_url):
  10.     pSoup = BeautifulSoup((r:=requests.get(playlist_url)).content, 'html.parser')
  11.     rStatus = f'<Response [{r.status_code} {r.reason}]> from {r.url}'
  12.     def extract_vid_val(val):
  13.         if isinstance(val, dict):
  14.             if isinstance(val.get('runs'), list):
  15.                 return ' '.join([str(r.get('text','')) for r in val['runs']])
  16.             return val.get('simpleText', val)
  17.         if isinstance(val, list) and val:
  18.           tKey, adKey = 'thumbnailOverlayTimeStatusRenderer', 'accessibilityData'
  19.           if (x:=getattr(val[0],'get',{}.get)(tKey)):
  20.               try: return x['text']['accessibility'][adKey]['label']
  21.               except: pass
  22.           return [extract_vid_val(v) for v in val]
  23.         return val
  24.  
  25.     jScript = pSoup.select_one('script:-soup-contains("var ytInitialData")')
  26.     pDets, title = {}, pSoup.title.text.strip() if pSoup.title else None
  27.     try:
  28.         jData = json.loads(jScript.string.split('=',1)[-1].strip().rstrip(';'))
  29.         pDets = {'playlistId': jData.get('playlistId'), **{
  30.             k: extract_vid_val(v) for k,v in
  31.             jData['header']['playlistHeaderRenderer'].items() if k in
  32.             [ 'playlistId', 'title', 'numVideosText', 'descriptionText',
  33.               'ownerText', 'viewCountText', 'shareData', 'isEditable', 'privacy' ]
  34.         }}
  35.        
  36.         keysList = [
  37.             'contents', 'twoColumnBrowseResultsRenderer', 'tabs', 0,
  38.             'tabRenderer', 'content', 'sectionListRenderer', 'contents', 0,
  39.             'itemSectionRenderer', 'contents', 0, 'playlistVideoListRenderer'
  40.         ]
  41.         for k in keysList: jData = jData[k]
  42.         for k in ['playlistId', 'isEditable', 'canReorder', 'targetId']:
  43.             pDets.setdefault(k, jData.get(k))
  44.  
  45.         vidsData = [{
  46.             k: extract_vid_val(v) for k,v in vid['playlistVideoRenderer'].items()
  47.         } for vid in jData['contents']]
  48.     except Exception as e: vidsData, rStatus = [], f'{rStatus}\n{e!r}'
  49.  
  50.     # print(len(vidsData), 'results - ', rStatus)
  51.     for i, v in enumerate(vidsData):  
  52.         vidsData[i]['videoLink'] = 'https://www.youtube.com/watch?v='+v['videoId']
  53.     total_seconds = sum([ int(str(v['lengthSeconds'])) for v in vidsData
  54.                           if str(v.get('lengthSeconds')).isdigit()        ])
  55.  
  56.     return {  'inputUrl':playlist_url, 'title': title, **pDets, 'videos': vidsData,
  57.               'vidoesCount': len(vidsData), 'total_seconds': total_seconds  }
  58. #########################################################################################
  59.  
  60.  
  61. ######################################## EXAMPLE ########################################
  62. if __name__ == "__main__":                                                        #######
  63.     url = 'https://www.youtube.com/playlist?list=PL5A4nPQbUF8CJ-l7eiAAyoaVrbaM4p2zU' ####
  64.     data = scrape_playlist_data(url)                                               ######
  65.     with open('paste_UYJUqtXF_op.json', 'w') as f: json.dump(f, data)              ######
  66. #########################################################################################
Advertisement
Add Comment
Please, Sign In to add comment