Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from seleniumwire import webdriver
- from seleniumwire.utils import decode
- from selenium.webdriver.chrome.service import Service as ChromeSerive
- from subprocess import CREATE_NO_WINDOW
- import json
- import time
# ---------------------------------------------------------------------------
# Shared browser setup: one headless Chrome instance, created at import time
# and reused by every scrape_ruten.scrape() call through the global `driver`.
# ---------------------------------------------------------------------------

# The chromedriver executable is resolved by name (from PATH or the working
# directory) by the Service object.
chrome_service = ChromeSerive('chromedriver')
# CREATE_NO_WINDOW is a Windows-only subprocess flag; it keeps the
# chromedriver console window from popping up.
chrome_service.creation_flags = CREATE_NO_WINDOW

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-3d-apis")
chrome_options.add_argument('--log-level=3')

# Pass the driver location only through `service` and the browser flags only
# through `options`. The positional executable path and the `chrome_options`
# keyword are deprecated in Selenium 4 and rejected by later 4.x releases.
# NOTE(review): Selenium Wire appears to read the `options` keyword to inject
# its capture proxy, so `chrome_options=` may bypass it -- confirm against the
# installed version.
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
class scrape_ruten(object):
    """Scrape product listings from ruten.com.tw search results.

    Typical use: build the result-page URLs with ``get_url()``, call
    ``scrape(url)`` for each one, feed the collected rows to
    ``data_frame()``, then write them out with ``save_to_xlsx()``.

    Relies on the module-level ``driver`` (a Selenium Wire Chrome instance)
    to load pages and capture the product API responses they trigger.
    """

    # Captured requests whose URL starts with this prefix carry the product
    # data (as JSON) for the search page that was just loaded.
    _PRODUCT_API_PREFIX = 'https://rtapi.ruten.com.tw/api/prod/v2/index.php/prod?id='

    def __init__(self, keyword, pages):
        """Store the search keyword and the number of result pages to fetch.

        Args:
            keyword: Search term; also used in the output file name.
            pages: Number of result pages; anything ``int()`` accepts.
        """
        self.keyword = keyword
        self.pages = int(pages)

    def get_url(self):
        """Populate ``self.urls`` with one search URL per requested page.

        Pages are numbered from 1. The keyword is percent-encoded so that
        characters such as ``&``, ``#``, ``+`` or spaces cannot corrupt the
        query string.
        """
        from urllib.parse import quote

        encoded_keyword = quote(str(self.keyword), safe='')
        self.urls = [
            f'https://www.ruten.com.tw/find/?q={encoded_keyword}&p={page}'
            for page in range(1, self.pages + 1)
        ]

    def scrape(self, url, wait_seconds=5):
        """Load ``url`` and return the products reported by the product API.

        Args:
            url: Search-result page to open in the shared browser.
            wait_seconds: Fixed delay after navigation that gives the page
                time to issue its background API calls.

        Returns:
            A list of ``(product_name, price_min, price_max, sold_qty)``
            tuples. The list is empty when no product API response was
            captured. If several matching responses were captured, the
            last one wins.
        """
        # Selenium Wire keeps every request captured since the browser
        # started. Drop the old ones so a page that fails to load cannot
        # return the previous page's products.
        del driver.requests

        driver.get(url)
        time.sleep(wait_seconds)

        # Initialised up front so that "no matching response" yields an
        # empty list instead of an UnboundLocalError at the return.
        data = []
        for request in driver.requests:
            response = request.response
            if not response:
                continue
            if not request.url.startswith(self._PRODUCT_API_PREFIX):
                continue

            # The body may be compressed; decode it per Content-Encoding
            # before interpreting it as UTF-8 JSON.
            body = decode(response.body, response.headers.get('Content-Encoding', 'Identity'))
            products = json.loads(body.decode('utf8'))

            data = [
                (
                    product['ProdName'],        # product title
                    product['PriceRange'][0],   # lowest price
                    product['PriceRange'][1],   # highest price
                    product['SoldQty'],         # units sold
                )
                for product in products
            ]
        return data

    def data_frame(self, data):
        """Build ``self.df`` from rows as returned by ``scrape()``.

        Columns, in order: product title, lowest price, highest price,
        units sold.
        """
        self.df = pd.DataFrame(
            data,
            columns=['Product Title', 'Price Min', 'Price Max', 'Historical Sold'],
        )

    def save_to_xlsx(self):
        """Write ``self.df`` to ``Ruten_<keyword>.xlsx`` and return that name.

        The file is written relative to the current working directory,
        without the DataFrame index column.
        """
        file_name = f'Ruten_{self.keyword}'
        self.df.to_excel(f'{file_name}.xlsx', index=False)
        return f'{file_name}.xlsx'
Advertisement
Add Comment
Please, Sign In to add comment