Guest User

scrape_ruten_controller.py

a guest
Dec 27th, 2022
445
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.92 KB | None | 0 0
  1. import pandas as pd
  2. from seleniumwire import webdriver
  3. from seleniumwire.utils import decode
  4. from selenium.webdriver.chrome.service import Service as ChromeSerive
  5. from subprocess import CREATE_NO_WINDOW
  6. import json
  7. import time
  8.  
  # Module-level browser setup: one shared headless Chrome session, created at
  # import time and used by every scrape call in this file.

  # Service object that points at the `chromedriver` executable (resolved via PATH).
  chrome_service = ChromeSerive('chromedriver')
  # CREATE_NO_WINDOW is a Windows-only subprocess flag that suppresses the
  # console window of the spawned process.
  # NOTE(review): the attribute Selenium reads for this differs between
  # releases (`creationflags` vs `creation_flags`) - confirm against the
  # installed Selenium version.
  chrome_service.creation_flags = CREATE_NO_WINDOW

  chrome_options = webdriver.ChromeOptions()
  chrome_options.add_argument('--headless')
  chrome_options.add_argument('--no-sandbox')
  chrome_options.add_argument('--disable-dev-shm-usage')
  chrome_options.add_argument("--disable-3d-apis")
  chrome_options.add_argument('--log-level=3')

  # The driver path is already carried by `chrome_service`, so it is not passed
  # a second time positionally; on Selenium >= 4.10 the first positional
  # parameter is `options`, and the extra argument would raise TypeError.
  # `options=` replaces the deprecated `chrome_options=` keyword.
  driver = webdriver.Chrome(options=chrome_options, service=chrome_service)
  19.  
  class scrape_ruten(object):
      """Collect product data from Ruten search result pages and export it to Excel.

      Typical flow: ``get_url()`` builds the search page URLs, ``scrape(url)``
      loads one page in the module-level ``driver`` and extracts product rows
      from the captured API traffic, ``data_frame(rows)`` wraps the rows in a
      DataFrame and ``save_to_xlsx()`` writes that DataFrame to disk.
      """

      # URL prefix of the product API call whose JSON response carries the
      # product details for a search result page.
      PRODUCT_API_PREFIX = 'https://rtapi.ruten.com.tw/api/prod/v2/index.php/prod?id='

      def __init__(self, keyword, pages):
          """Store the search keyword and the number of result pages to visit.

          Args:
              keyword: Search term; also used in the output file name.
              pages: Number of result pages; anything ``int()`` accepts.
          """
          self.keyword = keyword
          self.pages = int(pages)

      def get_url(self):
          """Build the search page URLs and store them in ``self.urls``.

          The keyword is percent-encoded so that characters such as ``&``,
          ``#`` or spaces cannot break the query string.

          Returns:
              The list of URLs (the same object as ``self.urls``), one per
              page, numbered from 1.
          """
          from urllib.parse import quote

          encoded_keyword = quote(str(self.keyword), safe='')
          self.urls = [
              f'https://www.ruten.com.tw/find/?q={encoded_keyword}&p={page}'
              for page in range(1, self.pages + 1)
          ]
          return self.urls

      @staticmethod
      def _parse_products(products):
          """Convert decoded product records into (name, min, max, sold) tuples.

          Each record is expected to provide ``ProdName``, ``PriceRange``
          (indexable, at least two items) and ``SoldQty``.
          """
          return [
              (
                  product['ProdName'],        # product title
                  product['PriceRange'][0],   # lowest price
                  product['PriceRange'][1],   # highest price
                  product['SoldQty'],         # units sold
              )
              for product in products
          ]

      def scrape(self, url, wait_seconds=5):
          """Load ``url`` in the shared driver and extract product rows.

          Args:
              url: Search result page to open.
              wait_seconds: How long to wait after navigation so the page can
                  issue its API requests.

          Returns:
              A list of ``(product_name, price_min, price_max, historical_sold)``
              tuples gathered from every captured product-API response of this
              page load; an empty list when no such response was captured.
          """
          # Discard traffic captured during earlier page loads so that only
          # responses triggered by this navigation are inspected.
          del driver.requests

          driver.get(url)
          time.sleep(wait_seconds)

          data = []
          for request in driver.requests:
              response = request.response
              if not response:
                  continue
              if not request.url.startswith(self.PRODUCT_API_PREFIX):
                  continue
              # Undo any transport compression before decoding the JSON text.
              raw_body = decode(
                  response.body,
                  response.headers.get('Content-Encoding', 'Identity'),
              )
              products = json.loads(raw_body.decode('utf8'))
              data.extend(self._parse_products(products))
          return data

      def data_frame(self, data):
          """Wrap scraped rows in a DataFrame stored as ``self.df``.

          Args:
              data: Iterable of 4-item rows as produced by ``scrape``.

          Returns:
              The DataFrame (the same object as ``self.df``).
          """
          # Columns: product title, lowest price, highest price, units sold.
          self.df = pd.DataFrame(
              data,
              columns=['Product Title', 'Price Min', 'Price Max', 'Historical Sold'],
          )
          return self.df

      def save_to_xlsx(self):
          """Write ``self.df`` to ``Ruten_<keyword>.xlsx`` in the working directory.

          Requires ``data_frame`` to have been called first.

          Returns:
              The name of the file that was written.
          """
          file_name = f'Ruten_{self.keyword}.xlsx'
          self.df.to_excel(file_name, index=False)
          return file_name
Advertisement
Add Comment
Please, Sign In to add comment