Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import unicode_literals
- from bs4 import BeautifulSoup
- import requests
- import os
- import youtube_dl
- import datetime
# Course page to mirror.  Other pages this script has been pointed at:
#   https://www.safaribooksonline.com/library/view/ccna-routing-and/9780134580715/
#   https://www.safaribooksonline.com/library/view/learning-python-web/9781785280351/
#   https://www.safaribooksonline.com/library/view/web-scraping-in/200000006A0423/
#   https://www.safaribooksonline.com/library/view/python-design-patterns/9781786460677/
#   https://www.safaribooksonline.com/library/view/introduction-to-python/9781491904794/
url = 'https://www.safaribooksonline.com/library/view/master-the-fundamentals/200000006A0422/'
domain = 'https://www.safaribooksonline.com'
output_folder = 'output'
# NOTE(review): username/password are not referenced anywhere else in this
# script; they are kept only so the module-level names stay available.
# Real credentials should not be stored in the source file.
username = 'username'
password = 'SuperSecretPassword'

# Seconds to wait for the table-of-contents page before giving up.
REQUEST_TIMEOUT = 30
BANNER = "*" * 100


def safe_name(text, fallback='undefined'):
    """Return *text* reduced to something usable as one path component.

    Runs of whitespace (including newlines) are collapsed to single spaces
    and path separators are replaced with ``-`` so that a title can never
    create extra directory levels.  ``None`` or an empty/blank title yields
    *fallback*.
    """
    cleaned = ' '.join((text or '').split())
    for separator in ('/', '\\'):
        cleaned = cleaned.replace(separator, '-')
    return cleaned or fallback


def detect_category(page_text):
    """Classify the page as ``'book'``, ``'video'`` or ``'undefined'``.

    The decision is based only on which description heading appears in the
    page text; the book heading wins when both are present.
    """
    if 'Book Description' in page_text:
        return 'book'
    if 'Video Description' in page_text:
        return 'video'
    return 'undefined'


def is_module_heading(lesson_name):
    """Return True when a top-level entry opens a new module folder."""
    return lesson_name.startswith('Module') and 'Summary' not in lesson_name


def parse_duration(text):
    """Parse ``H:MM:SS``, ``MM:SS`` or ``SS`` into a ``datetime.timedelta``.

    Returns ``None`` when *text* does not have that shape, so a single odd
    value cannot abort the final statistics.
    """
    try:
        parts = [int(piece) for piece in text.strip().split(':')]
    except ValueError:
        return None
    if not 1 <= len(parts) <= 3:
        return None
    # Left-pad with zeros so shorter forms line up as (hours, minutes, seconds).
    hours, minutes, seconds = [0] * (3 - len(parts)) + parts
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)


def total_duration(durations):
    """Sum an iterable of duration strings, skipping unparseable entries."""
    total = datetime.timedelta()
    for text in durations:
        parsed = parse_duration(text)
        if parsed is not None:
            total += parsed
    return total


def download_lesson(lesson, target_dir, ydl_opts, failed, durations):
    """Download every video linked from one table-of-contents entry.

    lesson     -- the ``<li>`` element of the entry; its nested ``<ol>`` holds
                  the video links.
    target_dir -- directory the videos are written to (created if missing).
    ydl_opts   -- youtube-dl options dict; its ``outtmpl`` key is overwritten.
    failed     -- list that receives the title of each video whose download
                  raised.
    durations  -- dict that receives ``{title: duration text}`` for each
                  listed item that has both a link and a ``<span>``.
    """
    os.makedirs(target_dir, exist_ok=True)
    listing = lesson.ol
    if listing is None:
        # Entry without a nested list: nothing to download or to time.
        return

    ydl_opts['outtmpl'] = os.path.join(target_dir, '%(title)s-%(id)s.%(ext)s')
    for index, video in enumerate(listing.find_all('a')):
        href = video.get('href')
        if not href:
            continue
        video_name = str(index) + ' - ' + video.text
        video_url = domain + href
        video_out = os.path.join(target_dir, video_name + '.mp4')
        print("youtube-dl --output '{}' {}".format(video_out, video_url))
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            print("Downloads:", video_name)
            try:
                ydl.download([video_url])
            except Exception as e:
                # Best effort: remember the failure and keep going with the
                # remaining videos.
                failed.append(video.text)
                print(e, e.args)

    for item in listing.find_all('li'):
        link = item.find('a')
        span = item.find('span')
        if link is not None and span is not None and link.text and span.text:
            durations[link.text] = span.text


def main():
    """Fetch the course page, download its videos and print a summary.

    Videos are stored under
    ``<script dir>/<output_folder>/<category>/<title>/<module>[/<lesson>]``.
    Relies on the file-level imports of ``requests``, ``BeautifulSoup`` and
    ``youtube_dl``.
    """
    print(BANNER)
    print(f"Let's go hacking following resources :))) {url}")
    print(BANNER)

    failed = []
    durations = {}
    script_dir = os.path.dirname(os.path.abspath(__file__))

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    lessons = soup.find_all('li', class_='toc-level-1')

    source_category = detect_category(soup.text)
    title_block = soup.find('h1', class_='t-title')
    source_name = title_block.text if title_block else None

    # safe_name() supplies a fallback folder when the page has no title.
    output_dir = os.path.join(
        script_dir, output_folder, source_category, safe_name(source_name)
    )
    os.makedirs(output_dir, exist_ok=True)

    # Lessons that appear before the first module heading land in "Module 0".
    module_name = 'Module 0'
    ydl_opts = {}
    for lesson in lessons:
        if lesson.a is None:
            continue
        lesson_name = safe_name(lesson.a.text)
        if is_module_heading(lesson_name):
            module_name = lesson_name
            target_dir = os.path.join(output_dir, module_name)
        else:
            target_dir = os.path.join(output_dir, module_name, lesson_name)
        download_lesson(lesson, target_dir, ydl_opts, failed, durations)

    failed_titles = set(failed)
    downloaded = {
        title: length
        for title, length in durations.items()
        if title not in failed_titles
    }

    print(BANNER)
    print("The END :)))"*8)
    print(BANNER)
    print("Statistic:")
    print('\n')
    print("All lessons is: ", len(lessons))
    print("Your source category is:", source_category)
    print("Your source name is:", source_name)
    print('\n')
    print(f"Number of videos that were available for download is {len(durations)}.")
    print(f"The amount of video that has been able to download is {len(downloaded)}.")
    print(f"VIDEOS JSON : {durations}")

    if failed:
        print('\n')
        print("<---------- You have the following Exception !!! -------------->")
        print('\n')
        print(f"Not downloads {len(failed)} resources: ")
        for title in failed:
            if durations.get(title):
                print({title: durations.get(title)})

    sum_available = total_duration(durations.values())
    sum_downloads = total_duration(downloaded.values())
    print('\n')
    print("<++++++++++ Information about time watch videos ++++++++++++++>")
    print('\n')
    print(f"Summary time watch video (available) is : {sum_available}")
    print(f"Summary time watch video (downloads) is : {sum_downloads}")


if __name__ == '__main__':
    main()
- """
- def my_hook(d):
- if d['status'] == 'finished':
- print('Done downloading, now converting ...')
- ydl_opts = {
- 'format': 'bestaudio/best',
- 'outtmpl': '%(id)s',
- 'noplaylist' : True,
- 'progress_hooks': [my_hook],
- }
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
- ydl.download(['https://www.youtube.com/watch?v=pwp1CH5R-w4'])
- """
- """
- timeList = [ '0:00:00', '0:00:15', '9:30:56' ]
- totalSecs = 0
- for tm in timeList:
- timeParts = [int(s) for s in tm.split(':')]
- totalSecs += (timeParts[0] * 60 + timeParts[1]) * 60 + timeParts[2]
- totalSecs, sec = divmod(totalSecs, 60)
- hr, min = divmod(totalSecs, 60)
- print "%d:%02d:%02d" % (hr, min, sec)
- import datetime
- timeList = ['0:00:00', '0:00:15', '9:30:56']
- sum = datetime.timedelta()
- for i in timeList:
- (h, m, s) = i.split(':')
- d = datetime.timedelta(hours=int(h), minutes=int(m), seconds=int(s))
- sum += d
- print(str(sum))
- timeList = [ '0:00:00', '0:00:15', '9:30:56' ]
- ttt = [map(int,i.split()[-1].split(':')) for i in timeList]
- seconds=reduce(lambda x,y:x+y[0]*3600+y[1]*60+y[2],ttt,0)
- #seconds == 34271
- This one looks horrible too ->
- zero_time = datetime.datetime.strptime('0:0:0', '%H:%M:%S')
- ttt=[datetime.datetime.strptime(i, '%H:%M:%S')-zero_time for i in timeList]
- delta=sum(ttt,zero_time)-zero_time
- # delta==datetime.timedelta(0, 34271)
- # str(delta)=='9:31:11' # this seems good, but
- # if we have more than 1 day we get for example str(delta)=='1 day, 1:05:22'
- Really frustrating is also this ->
- sum(ttt,zero_time).strftime('%H:%M:%S') # it is only "modulo" 24 :(
- I really like one-liners, so I tried to write one in Python 3 :P (correct result, but it looks horrible)
- import functools
- timeList = ['0:00:00','0:00:15','9:30:56','21:00:00'] # notice additional 21 hours!
- sum_fnc=lambda ttt:(lambda a:'%02d:%02d:%02d' % (divmod(divmod(a,60)[0],60)+(divmod(a,60)[1],)))((lambda a:functools.reduce(lambda x,y:x+y[0]*3600+y[1]*60+y[2],a,0))((lambda a:[list(map(int,i.split()[-1].split(':'))) for i in a])(ttt)))
- # sum_fnc(timeList) -> '30:40:11'
- lines = ["0:00:00", "0:00:15", "9:30:56"]
- total = 0
- for line in lines:
- h, m, s = map(int, line.split(":"))
- total += 3600*h + 60*m + s
- print "%02d:%02d:%02d" % (total / 3600, total / 60 % 60, total % 60)
- """
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement