Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## return some data (including a list of videos) from a given YouTube playlist link #####
- ### [ screenshot of an example with pandas at https://i.stack.imgur.com/UKjp3.png ] #####
- ### [ another example at bottom - with output at https://pastebin.com/wdZWm6q1 ] ########
- import json
- import requests
- from bs4 import BeautifulSoup
def scrape_playlist_data(playlist_url, timeout=30):
    """Scrape playlist details and the video list from a YouTube playlist page.

    The page is downloaded with ``requests`` and the JSON assigned to
    ``var ytInitialData`` inside one of its ``<script>`` tags is parsed.
    Parsing is best-effort: if the script tag is missing or the JSON does not
    have the expected layout, the problem is logged and an empty video list is
    returned instead of raising.

    Args:
        playlist_url: URL of the playlist page to fetch.
        timeout: Seconds passed to ``requests.get`` so the call cannot hang
            forever.

    Returns:
        A dict with ``inputUrl``, ``title`` (the page ``<title>``, replaced by
        the playlist header's ``title`` when that one is found), whichever
        playlist detail keys were found, ``videos`` (a list of flattened
        per-video dicts, each given a ``videoLink`` when it has a
        ``videoId``), ``vidoesCount`` and ``total_seconds``.

    Raises:
        requests.RequestException: The download itself is not guarded, so
            connection errors and timeouts propagate to the caller.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    header_keys = (
        'playlistId', 'title', 'numVideosText', 'descriptionText',
        'ownerText', 'viewCountText', 'shareData', 'isEditable', 'privacy',
    )
    # Path from the root of ytInitialData down to the playlist's video list.
    list_renderer_path = (
        'contents', 'twoColumnBrowseResultsRenderer', 'tabs', 0,
        'tabRenderer', 'content', 'sectionListRenderer', 'contents', 0,
        'itemSectionRenderer', 'contents', 0, 'playlistVideoListRenderer',
    )

    def extract_vid_val(val):
        """Collapse a nested renderer value into plain text where possible."""
        if isinstance(val, dict):
            runs = val.get('runs')
            if isinstance(runs, list):
                return ' '.join(str(run.get('text', '')) for run in runs)
            return val.get('simpleText', val)
        if isinstance(val, list) and val:
            first = val[0]
            overlay = None
            if isinstance(first, dict):
                overlay = first.get('thumbnailOverlayTimeStatusRenderer')
            if overlay:
                # Prefer the overlay's accessibility label; if the expected
                # nesting is absent, fall through to the generic list handling.
                try:
                    return overlay['text']['accessibility']['accessibilityData']['label']
                except (LookupError, TypeError):
                    pass
            return [extract_vid_val(item) for item in val]
        return val

    resp = requests.get(playlist_url, timeout=timeout)
    status = f'<Response [{resp.status_code} {resp.reason}]> from {resp.url}'
    soup = BeautifulSoup(resp.content, 'html.parser')
    title = soup.title.text.strip() if soup.title else None
    script = soup.select_one('script:-soup-contains("var ytInitialData")')

    details, videos = {}, []
    try:
        # Keep everything after the first '=' and drop the trailing ';'.
        data = json.loads(script.string.split('=', 1)[-1].strip().rstrip(';'))
        header = data['header']['playlistHeaderRenderer']
        details = {
            'playlistId': data.get('playlistId'),
            **{k: extract_vid_val(v) for k, v in header.items() if k in header_keys},
        }

        node = data
        for key in list_renderer_path:
            node = node[key]

        # Fill keys that are absent *or* still None; a plain setdefault would
        # keep a None stored by the step above.
        for key in ('playlistId', 'isEditable', 'canReorder', 'targetId'):
            if details.get(key) is None:
                details[key] = node.get(key)

        # Skip entries that are not videos (presumably continuation
        # placeholders on long playlists) instead of letting a single
        # KeyError throw away the whole list.
        videos = [
            {k: extract_vid_val(v) for k, v in entry['playlistVideoRenderer'].items()}
            for entry in node['contents']
            if isinstance(entry, dict) and 'playlistVideoRenderer' in entry
        ]
    except Exception as err:  # deliberate best-effort boundary: log, don't raise
        videos = []
        log.warning('%s\n%r', status, err)
    else:
        log.debug('%d results - %s', len(videos), status)

    for video in videos:
        video_id = video.get('videoId')
        if video_id:
            video['videoLink'] = 'https://www.youtube.com/watch?v=' + str(video_id)

    # isdecimal() only accepts characters that int() can actually parse.
    lengths = (str(video.get('lengthSeconds')) for video in videos)
    total_seconds = sum(int(secs) for secs in lengths if secs.isdecimal())

    return {
        'inputUrl': playlist_url,
        'title': title,
        **details,
        'videos': videos,
        # NOTE: the misspelled key is kept so existing consumers keep working.
        'vidoesCount': len(videos),
        'total_seconds': total_seconds,
    }
- #########################################################################################
- ######################################## EXAMPLE ########################################
if __name__ == "__main__":
    # Example run: scrape one playlist and save the result as JSON.
    url = 'https://www.youtube.com/playlist?list=PL5A4nPQbUF8CJ-l7eiAAyoaVrbaM4p2zU'
    data = scrape_playlist_data(url)
    # json.dump takes the object first and the open file second.
    with open('paste_UYJUqtXF_op.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)
- #########################################################################################
Advertisement
Add Comment
Please, Sign In to add comment