Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib.request as req
- import requests
- import selenium
- import schedule
- import time
- import json
- from time import sleep
- import json
- import openpyxl
- import random
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.support import expected_conditions as EC
- import bs4
# --- Setup: output workbook, user input, and browser session -----------------

# Existing workbook that receives the scraped rows; it must already exist on
# disk because load_workbook() opens it rather than creating it.
pttWeb = openpyxl.load_workbook('pttweb.xlsx')
ws = pttWeb.active

# Next worksheet row to fill; the same number is also written to column 1.
i = 1

# Number of scroll-and-scrape passes to perform (raises ValueError when the
# typed text is not an integer).
scroll_time = int(input("scroll_Times"))

options = Options()
# Raw string: the original non-raw literal contained "\c", an invalid escape
# sequence that newer Python versions warn about. The resulting value is
# unchanged.
# NOTE(review): Selenium's Options does not appear to define
# `chrome_executable_path`, so this assignment probably has no effect and the
# driver is located some other way -- confirm, and pass the path through
# selenium.webdriver.chrome.service.Service if it is actually required.
options.chrome_executable_path = r"C:\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(options=options)
sleep(3)

driver.get('https://www.pttweb.cc/hot/all/today')
sleep(5)  # fixed pause after navigation before the page is queried

# Last listing element handled by the previous pass; None before the first.
prev_ele = None
# --- Scrape loop --------------------------------------------------------------
# NOTE: the pasted source lost its indentation; the nesting below (including
# the 40-second pause sitting inside the scroll loop) is reconstructed from
# the logic of the statements.

# Browser-like User-Agent sent with every article request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
)


def _elements_after(elements, marker):
    """Return the items of *elements* that come after *marker*.

    If the list contains the last element handled on the previous pass, only
    the elements following it are returned, so that element is not scraped a
    second time. When *marker* is None or is not in the list, the whole list
    is returned.
    """
    try:
        return elements[elements.index(marker) + 1:]
    except ValueError:
        return elements


def _fetch_article_fields(href):
    """Download one article page and return (board, author, time, count, text).

    Raises whatever urllib raises on network failure, and AttributeError /
    IndexError when an expected tag is missing from the page.
    """
    request = req.Request(href, headers={"User-Agent": _USER_AGENT})
    # A timeout keeps one stalled connection from hanging the whole run.
    with req.urlopen(request, timeout=30) as response:
        html = response.read().decode("utf-8")

    root = bs4.BeautifulSoup(html, "html.parser")
    board = root.find("span", class_="e7-board-name-standalone").text
    author = root.find("span", itemprop="name").text
    published = root.find("time", itemprop="datePublished").text
    # The fifth "e7-head-content" span is the one stored as the count column.
    count = root.find_all("span", class_="e7-head-content")[4].text
    body = root.find("div", itemprop="articleBody").text
    # Keep only the text in front of the first "--" separator.
    return board, author, published, count, body.split('--')[0]


try:
    for now_time in range(1, scroll_time + 1):
        sleep(2)
        listed = driver.find_elements(by=By.CLASS_NAME, value='e7-right.ml-2')

        for ele in _elements_after(listed, prev_ele):
            try:
                title_info = ele.find_element(
                    by=By.CLASS_NAME, value="e7-article-default"
                )
                title = title_info.text
                href = title_info.get_attribute('href')
                sleep(3)
                fields = _fetch_article_fields(href)

                # Columns 1-8: running number, title, link, board, author,
                # publish time, count, article text.
                row = (i, title, href, *fields)
                for column, value in enumerate(row, start=1):
                    ws.cell(i, column, value)
                pttWeb.save('pttweb.xlsx')
                sleep(random.uniform(5, 20))
                i = i + 1
            except Exception as exc:
                # Best effort: report the failure and move on. The row number
                # is not advanced, so the next article reuses this row.
                print(f"skipped one article: {exc!r}")

        # Remember where this pass ended; an empty listing leaves the marker
        # untouched instead of raising IndexError.
        if listed:
            prev_ele = listed[-1]

        print(f"now scroll {now_time}/{scroll_time}")
        js = "window.scrollTo(0, document.body.scrollHeight);"
        driver.execute_script(js)
        sleep(40)
finally:
    # Always close the browser, even when the loop is interrupted or fails.
    driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement