Advertisement
Guest User

Untitled

a guest
Jan 28th, 2018
971
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.00 KB | None | 0 0
  1. from selenium import webdriver
  2. from selenium.webdriver.common.by import By
  3. import requests
  4. import re
  5. from time import sleep
  # Folder that receives the downloaded .mp4 files (read by get_video).
  download_root = r'C:\Users\Surface\Desktop\scrapy'

  # Single browser session shared by every helper below; the argument is the
  # path of the chromedriver executable handed to webdriver.Chrome.
  driver = webdriver.Chrome(
      r'C:\Users\Surface\Desktop\scrapy\chromedriver.exe'
  )
  8.  
  def entry(email="jamesxxx1997@gmail.com", password="WTF FUCK YOU"):
      """Open the site, switch the language and submit the login form.

      Args:
          email: Text typed into the element named ``inputEmail``.
          password: Text typed into the element named ``inputPassword``.

      The defaults are the values that were previously hard-coded in this
      function, so a bare ``entry()`` call behaves exactly as before.
      NOTE(review): real credentials should be passed in by the caller
      rather than kept in source.
      """
      driver.get('http://www.wenguitar.com//')

      # Click the "Traditional Chinese" language link, then the "log in" link.
      driver.find_element(By.XPATH, ".//*[text()='繁體中文']").click()
      driver.find_element(By.XPATH, ".//*[text()='登入']").click()

      # Fill in the form fields in the same order as before, then submit.
      email_field = driver.find_element(By.XPATH, ".//*[@name='inputEmail']")
      email_field.send_keys(email)
      password_field = driver.find_element(By.XPATH, ".//*[@name='inputPassword']")
      password_field.send_keys(password)
      driver.find_element(By.XPATH, ".//*[@class='btn']").click()
  21.    
  22.    
  def count_main_course():
      """Return how many elements on the current page have class ``btn btn-primary``."""
      return len(driver.find_elements(By.XPATH, ".//*[@class='btn btn-primary']"))
  26.    
  27.    
  def click_main_course(number):
      """Click the element at index ``number`` among the ``btn btn-primary`` elements."""
      buttons = driver.find_elements(By.XPATH, ".//*[@class='btn btn-primary']")
      target = buttons[number]
      target.click()
  31.    
  def get_main_course_name(number):
      """Return the visible text of the ``btn btn-primary`` element at index ``number``."""
      buttons = driver.find_elements(By.XPATH, ".//*[@class='btn btn-primary']")
      chosen = buttons[number]
      return chosen.text
  35.    
  36.    
  def count_sub_course():
      """Return how many ``<a>`` elements sit below elements of class ``t1``."""
      return len(driver.find_elements(By.XPATH, "(.//*[@class='t1'])//a"))
  40.    
  41.    
  def click_sub_course(number):
      """Click the link at index ``number`` among the ``<a>`` elements below class ``t1``."""
      links = driver.find_elements(By.XPATH, "(.//*[@class='t1'])//a")
      target = links[number]
      target.click()
  45.  
  def get_sub_course_name(number):
      """Return the visible text of the link at index ``number`` below class ``t1``."""
      links = driver.find_elements(By.XPATH, "(.//*[@class='t1'])//a")
      chosen = links[number]
      return chosen.text
  49.  
  50. ####################################################################################   
  51.    
  def get_page_source():
      """Return the page source of the first frame (index 0) of the current page.

      The driver is switched into frame 0 to read its source and is always
      switched back to the top-level document afterwards, even when reading
      the source raises, so later element lookups are not left inside the
      frame.
      """
      driver.switch_to.frame(0)
      try:
          return driver.page_source
      finally:
          driver.switch_to.default_content()
  58.  
  59.  
  60.    
  def check(video_urls):
      """Return the quoted text at the start of the first matching candidate.

      Args:
          video_urls: Iterable of strings (for example one tuple produced by
              ``re.findall`` with several groups).

      Returns:
          The text between the leading double quote and the next double
          quote of the first candidate that begins with ``"``; ``None`` when
          no candidate begins with a quoted string.
      """
      quoted = re.compile('"(.*?)"')
      for video_url in video_urls:
          # Match once and reuse the result instead of matching twice.
          found = quoted.match(video_url)
          if found:
              return found.group(1)
      return None
  67.            
  def clean_amp(dirty_video_url):
      """Return ``dirty_video_url`` with every ``&amp;`` entity turned into ``&``.

      Only the complete ``&amp;`` entity is rewritten; a literal ``amp;``
      that is not preceded by ``&`` is left alone, so such text elsewhere in
      the URL is no longer silently removed.
      """
      return dirty_video_url.replace('&amp;', '&')
  71.  
  def get_video_url(page_source):
      """Extract the last quoted .mp4 URL found in ``page_source``.

      Args:
          page_source: Text to search.

      Returns:
          The URL of the last match, with its quotes removed by ``check`` and
          passed through ``clean_amp``; ``None`` when the text contains no
          match or the match holds no quoted URL.
      """
      # Two accepted shapes, both wrapped in double quotes:
      #   1. "https:...mp4?expires=<digits>&token=..."
      #   2. "https://...mp4"
      # Raw string so the regex escapes are not read as string escapes.
      regex = r'("(https:.*?\.mp4\?expires=\d*?&token=.*?)"|"(https:\/\/.*?[.]mp4)")'
      matches = re.findall(regex, page_source)
      if not matches:
          # Nothing matched: report that as None instead of an IndexError.
          return None

      dirty_video_url = check(matches[-1])
      if dirty_video_url is None:
          return None
      return clean_amp(dirty_video_url)
  82.            
  def get_video(main_course_name, sub_course_name, clean_url):
      """Download ``clean_url`` into ``download_root`` as an .mp4 file.

      The file name is ``main_course_name + sub_course_name + '.mp4'``, with
      characters that Windows does not allow in file names replaced by ``_``.

      Args:
          main_course_name: First part of the file name.
          sub_course_name: Second part of the file name.
          clean_url: URL to fetch.

      Raises:
          requests.HTTPError: If the server answers with an error status, so
              an error page is never saved as a video.
      """
      import os
      from contextlib import closing

      # \ / : * ? " < > | cannot appear in a Windows file name.
      basename = re.sub(r'[\\/:*?"<>|]', '_', main_course_name + sub_course_name)
      filename = os.path.join(download_root, basename + '.mp4')

      # closing() releases the streamed connection even if writing fails.
      with closing(requests.get(clean_url, stream=True)) as r:
          r.raise_for_status()
          with open(filename, "wb") as fout:
              for chunk in r.iter_content(chunk_size=1024):
                  if chunk:
                      fout.write(chunk)
  90.    
  91.    
  92.  
  93.            
  def video(main_course_name, sub_course_name):
      """Find the video URL on the current page, download it and print the URL.

      Args:
          main_course_name: Passed to ``get_video`` for the file name.
          sub_course_name: Passed to ``get_video`` for the file name.

      When no URL is found the download is skipped and a message is printed
      instead of handing ``None`` to the downloader.
      """
      clean_url = get_video_url(get_page_source())
      if clean_url is None:
          print('no video url found for %s %s' % (main_course_name, sub_course_name))
          return
      get_video(main_course_name, sub_course_name, clean_url)
      print(clean_url)
  99.  
  100. ###########################################################################
  101.    
  def __main__():
      """Log in and download the video of every sub-course of each main course.

      Main-course indices start at 1, as in the original loop.
      NOTE(review): index 0 is skipped; presumably that button is not a
      course — confirm.
      """
      entry()
      main_courses_number = count_main_course()
      for x in range(1, main_courses_number):
          main_course_name = get_main_course_name(x)
          click_main_course(x)
          # Count the sub-courses only after this main course was opened, so
          # the inner loop matches the links that are actually on the page.
          sub_courses_number = count_sub_course()
          for y in range(sub_courses_number):
              sub_course_name = get_sub_course_name(y)
              click_sub_course(y)
              sleep(2)
              video(main_course_name, sub_course_name)
              sleep(2)
              # Go back to the page the sub-course link was clicked on.
              driver.back()
              sleep(2)
  118.             sleep(2)
  119.            
  120.            
  121.            
  122.            
  123.            
  def test_match():
      """Smoke-check URL extraction on a fixed sample of courses.

      Logs in, opens main courses 1 and 2 and, for each, sub-courses 0 and 1,
      and tallies how often no video URL could be extracted.

      Returns:
          A ``(count, count_er)`` tuple: the number of main courses opened
          and the number of sub-course pages without a video URL.
      """
      main_courses_number = 3
      sub_courses_number = 2
      count = 0
      count_er = 0
      entry()
      for x in range(1, main_courses_number):
          click_main_course(x)
          count += 1
          for y in range(sub_courses_number):
              click_sub_course(y)
              try:
                  url = get_video_url(get_page_source())
              except IndexError:
                  # An empty match list counts as "no URL", same as None.
                  url = None
              if url is None:
                  count_er += 1
      return count, count_er
  137.    
  138.    
  # Run the scrape only when executed as a script, not when imported.
  if __name__ == "__main__":
      __main__()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement