Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- import requests
- import re
- from time import sleep
# Directory that downloaded videos are written into (see get_video).
download_root = r'C:\Users\Surface\Desktop\scrapy'

# Single shared browser session used by every helper in this script.
# NOTE(review): the chromedriver path is passed positionally; recent Selenium 4
# releases appear to expect a Service object instead - confirm against the
# installed Selenium version before changing.
driver = webdriver.Chrome(
    r'C:\Users\Surface\Desktop\scrapy\chromedriver.exe'
)
def entry(email=None, password=None):
    """Open the site, switch it to Traditional Chinese and submit the login form.

    Works on the module-level ``driver`` and returns nothing.

    Args:
        email: Account e-mail. When omitted, the ``WENGUITAR_EMAIL``
            environment variable is used, and failing that the value that
            was originally hard-coded in this script.
        password: Account password. When omitted, the ``WENGUITAR_PASSWORD``
            environment variable is used, and failing that the originally
            hard-coded value.
    """
    import os  # local import: nothing else in this function's file needs it

    # NOTE(review): the literal fallbacks below keep `entry()` working exactly
    # as before, but credentials should not live in source - supply them via
    # the arguments or the environment and delete the literals.
    if email is None:
        email = os.environ.get("WENGUITAR_EMAIL", "jamesxxx1997@gmail.com")
    if password is None:
        password = os.environ.get("WENGUITAR_PASSWORD", "WTF FUCK YOU")

    driver.get('http://www.wenguitar.com//')

    # Pick the Traditional Chinese UI first; the later lookups rely on the
    # Chinese link text of the login link.
    driver.find_element(By.XPATH, ".//*[text()='繁體中文']").click()
    driver.find_element(By.XPATH, ".//*[text()='登入']").click()

    # Fill in and submit the login form.
    driver.find_element(By.XPATH, ".//*[@name='inputEmail']").send_keys(email)
    driver.find_element(
        By.XPATH, ".//*[@name='inputPassword']"
    ).send_keys(password)
    driver.find_element(By.XPATH, ".//*[@class='btn']").click()
def count_main_course():
    """Return how many main-course buttons the current page shows.

    A main-course button is any element whose class attribute is exactly
    ``btn btn-primary``.
    """
    return len(
        driver.find_elements(By.XPATH, ".//*[@class='btn btn-primary']")
    )
def click_main_course(number):
    """Click the main-course button at 0-based position ``number``.

    The buttons are looked up afresh on every call, so the index refers to
    the page as it is right now. Raises IndexError if there is no such button.
    """
    buttons = driver.find_elements(By.XPATH, ".//*[@class='btn btn-primary']")
    target = buttons[number]
    target.click()
def get_main_course_name(number):
    """Return the visible text of the main-course button at index ``number``.

    Raises IndexError if the current page has no button at that position.
    """
    buttons = driver.find_elements(By.XPATH, ".//*[@class='btn btn-primary']")
    target = buttons[number]
    return target.text
def count_sub_course():
    """Return how many sub-course links the current page shows.

    A sub-course link is any ``<a>`` nested inside an element whose class
    attribute is exactly ``t1``.
    """
    return len(driver.find_elements(By.XPATH, "(.//*[@class='t1'])//a"))
def click_sub_course(number):
    """Click the sub-course link at 0-based position ``number``.

    Links are looked up afresh on every call. Raises IndexError if the
    current page has no link at that position.
    """
    links = driver.find_elements(By.XPATH, "(.//*[@class='t1'])//a")
    target = links[number]
    target.click()
def get_sub_course_name(number):
    """Return the visible text of the sub-course link at index ``number``.

    Raises IndexError if the current page has no link at that position.
    """
    links = driver.find_elements(By.XPATH, "(.//*[@class='t1'])//a")
    target = links[number]
    return target.text
- ####################################################################################
def get_page_source():
    """Return the HTML source of the first frame of the current page.

    The driver is switched into frame index 0, its source is read, and the
    driver is always switched back to the top-level document - even when
    reading the source fails - so later element lookups are not left
    stranded inside the frame.
    """
    driver.switch_to.frame(0)
    try:
        return driver.page_source
    finally:
        # Restore the top-level browsing context no matter what happened.
        driver.switch_to.default_content()
def check(video_urls):
    """Return the text between the first pair of double quotes of the first
    element of ``video_urls`` that starts with a double quote.

    ``video_urls`` is an iterable of strings (in this script, one tuple of
    regex groups). Elements that do not begin with ``"`` are skipped because
    the pattern is anchored at the start of each element. Returns None when
    no element qualifies.
    """
    quoted = re.compile('"(.*?)"')
    hits = (quoted.match(candidate) for candidate in video_urls)
    return next((hit.group(1) for hit in hits if hit), None)
def clean_amp(dirty_video_url):
    """Return ``dirty_video_url`` with every ``amp;`` substring removed.

    This turns the HTML-escaped ``&amp;`` found in page source back into a
    plain ``&``. Note that any other occurrence of ``amp;`` is removed too.
    """
    pieces = dirty_video_url.split('amp;')
    return ''.join(pieces)
def get_video_url(page_source):
    """Extract the last .mp4 URL found in ``page_source``.

    Two double-quoted URL shapes are recognised:
      1. a signed URL:  "https:....mp4?expires=<digits>&token=<...>"
      2. a plain URL:   "https://....mp4"

    Args:
        page_source: HTML text to search.

    Returns:
        The URL of the last match with ``amp;`` escapes removed, or None when
        the page contains no matching URL.
    """
    # Raw string: the pattern contains backslash escapes (\. \? \d \/) that
    # are not valid escapes in an ordinary string literal.
    regex = r'("(https:.*?\.mp4\?expires=\d*?&token=.*?)"|"(https:\/\/.*?[.]mp4)")'
    matches = re.compile(regex).findall(page_source)
    if not matches:
        # Nothing on the page looks like a video URL; signal it with None
        # instead of failing on an empty-list index.
        return None

    # Each match is a tuple of the pattern's groups; the last one on the page
    # is the one used.
    dirty_video_url = check(matches[-1])
    if dirty_video_url is None:
        return None
    return clean_amp(dirty_video_url)
def get_video(main_course_name, sub_course_name, clean_url):
    """Download ``clean_url`` into ``download_root`` as an .mp4 file.

    The file is named ``<main_course_name><sub_course_name>.mp4``; characters
    that Windows does not allow in file names are replaced with ``_``.

    Args:
        main_course_name: Text used as the first part of the file name.
        sub_course_name: Text used as the second part of the file name.
        clean_url: URL of the video to fetch.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved as if it were a video.
    """
    import os  # local import: only needed for building the target path

    # Course titles come from page text and may contain characters that are
    # illegal in Windows file names.
    base_name = re.sub(r'[\\/:*?"<>|]', '_', main_course_name + sub_course_name)
    filename = os.path.join(download_root, base_name + '.mp4')

    # The context manager releases the connection even if the download fails
    # midway; the timeout keeps a stalled server from hanging the script.
    with requests.get(clean_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(filename, "wb") as fout:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if chunk:  # skip keep-alive chunks
                    fout.write(chunk)
def video(main_course_name, sub_course_name):
    """Find the video URL on the current page, download it and print the URL.

    Args:
        main_course_name: First part of the downloaded file's name.
        sub_course_name: Second part of the downloaded file's name.

    A page for which ``get_video_url`` yields None (the "no URL" signal that
    ``test_match`` also checks for) is reported and skipped rather than being
    handed to the downloader.
    """
    page_source = get_page_source()
    clean_url = get_video_url(page_source)
    if clean_url is None:
        print('no video url found for', main_course_name, sub_course_name)
        return
    get_video(main_course_name, sub_course_name, clean_url)
    print(clean_url)
- ###########################################################################
def __main__():
    """Log in, then visit each main course / sub-course pair and download it.

    For every main-course button from index 1 onward, the button is clicked
    and each sub-course link is opened in turn; the video on that page is
    downloaded and the browser navigates back before the next link.
    """
    entry()

    total_main = count_main_course()
    # NOTE(review): the sub-course count is taken once, before any main course
    # is opened, and reused for every main course - confirm that every course
    # really lists the same number of sub-courses.
    total_sub = count_sub_course()

    # Index 0 is deliberately not visited (the range starts at 1).
    for main_idx in range(1, total_main):
        main_course_name = get_main_course_name(main_idx)
        click_main_course(main_idx)

        for sub_idx in range(total_sub):
            sub_course_name = get_sub_course_name(sub_idx)
            click_sub_course(sub_idx)
            sleep(2)  # fixed pause before reading the page
            video(main_course_name, sub_course_name)
            sleep(2)
            driver.back()  # return to the sub-course list
            sleep(2)
def test_match():
    """Smoke-check URL extraction on a small, fixed slice of the site.

    Logs in, opens main courses 1 and 2 and, for each, sub-courses 0 and 1,
    counting the pages on which no video URL could be extracted.

    Returns:
        A ``(count, count_er)`` tuple: the number of main courses opened and
        the number of sub-course pages for which ``get_video_url`` gave None.
    """
    main_courses_number = 3
    sub_courses_number = 2
    count = 0
    count_er = 0

    entry()
    for x in range(1, main_courses_number):
        click_main_course(x)
        count += 1
        for y in range(sub_courses_number):
            click_sub_course(y)
            if get_video_url(get_page_source()) is None:
                count_er += 1

    # Hand the tallies back; previously they were computed and discarded.
    return count, count_er
# Start the scrape only when this file is executed as a script, so importing
# the module (e.g. to reuse a helper) does not kick off a full download run.
if __name__ == '__main__':
    __main__()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement