Advertisement
Guest User

selenium問題

a guest
Dec 15th, 2022
409
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.15 KB | Software | 0 0
  1. import urllib.request as req
  2. import requests
  3. import selenium
  4. import schedule
  5. import time
  6. import json
  7. from time import sleep
  8. import json
  9. import openpyxl
  10. import random
  11. from selenium.webdriver.common.by import By
  12. from selenium.webdriver.common.action_chains import ActionChains
  13. from selenium.webdriver.common.keys import Keys
  14. from selenium import webdriver
  15. from selenium.webdriver.chrome.options import Options
  16. from selenium.webdriver.support import expected_conditions as EC
  17. import bs4
  18.  
  19.  
  20.  
  21.  
  22. pttWeb = openpyxl.load_workbook('pttweb.xlsx')
  23. ws = pttWeb.active
  24. i = 1
  25.  
  26.  
  27. scroll_time = int(input("scroll_Times"))
  28. options = Options()
  29. options.chrome_executable_path = "C:\chromedriver_win32\chromedriver.exe"
  30. driver = webdriver.Chrome(options = options)
  31. sleep(3)
  32. driver.get('https://www.pttweb.cc/hot/all/today')
  33. sleep(5)
  34.  
  35. prev_ele = None
  36. for now_time in range(1, scroll_time+1):
  37.     sleep(2)
  38.     eles = driver.find_elements(by=By.CLASS_NAME,value='e7-right.ml-2')
  39.     # 若串列中存在上一次的最後一個元素,則擷取上一次的最後一個元素到當前最後一個元素進行爬取
  40.     try:
  41.         # print(eles)
  42.         # print(prev_ele)
  43.         eles = eles[eles.index(prev_ele):]
  44.     except:
  45.         pass
  46.     for ele in eles:
  47.         try:
  48.             titleInfo = ele.find_element(by=By.CLASS_NAME, value = "e7-article-default")
  49.             title = titleInfo.text
  50.             href = titleInfo.get_attribute('href')
  51.             ws.cell(i,1,i)
  52.             ws.cell(i,2,title)
  53.             ws.cell(i,3,href)
  54.             sleep(3)
  55.  
  56.             inner =req.Request(href, headers ={
  57.                     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
  58.                 })
  59.             with req.urlopen(inner) as innerRespomse:
  60.                 articleData = innerRespomse.read().decode("utf-8")
  61.                
  62.                 articleRoot = bs4.BeautifulSoup(articleData, "html.parser")
  63.                 main_content = articleRoot.find("div", itemprop="articleBody")
  64.                 boardInfo= articleRoot.find("span", class_="e7-board-name-standalone")
  65.                 authorInfo = articleRoot.find("span", itemprop="name")
  66.                 timeInfo = articleRoot.find("time", itemprop="datePublished")
  67.                 countInfo = articleRoot.find_all("span", class_="e7-head-content")
  68.                 board = boardInfo.text
  69.                 author = authorInfo.text
  70.                 Time = timeInfo.text
  71.                 count = countInfo[4].text
  72.                 allContent = main_content.text
  73.                 pre_text = allContent.split('--')[0]
  74.  
  75.                 ws.cell(i,4,board)
  76.                 ws.cell(i,5,author)
  77.                 ws.cell(i,6,Time)
  78.                 ws.cell(i,7,count)
  79.                 ws.cell(i,8,pre_text)
  80.                 pttWeb.save('pttweb.xlsx')
  81.                 sleep(random.uniform(5,20))
  82.                 i = i+1
  83.         except:
  84.             pass
  85.     prev_ele = eles[-1]
  86.  
  87.     print(f"now scroll {now_time}/{scroll_time}")
  88.     js = "window.scrollTo(0, document.body.scrollHeight);"
  89.     driver.execute_script(js)
  90.     sleep(40)
  91.  
  92.  
  93.  
  94. driver.quit()
Tags: python
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement