Guest User

scrape_shopee_controller.py

a guest
Dec 27th, 2022
417
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.44 KB | None | 0 0
  1. import pandas as pd
  2. from seleniumwire import webdriver
  3. from seleniumwire.utils import decode
  4. from selenium.webdriver.chrome.service import Service as ChromeSerive
  5. from subprocess import CREATE_NO_WINDOW
  6. import json
  7. import time
  8.  
  9. from scrape_ui import Ui_MainWindow
  10. from PyQt5 import QtWidgets
  11.  
  12. chrome_service = ChromeSerive('chromedriver')
  13. chrome_service.creation_flags = CREATE_NO_WINDOW
  14. chrome_options = webdriver.ChromeOptions()
  15. chrome_options.add_argument('--headless')
  16. chrome_options.add_argument('--no-sandbox')
  17. chrome_options.add_argument('--disable-dev-shm-usage')
  18. chrome_options.add_argument("--disable-3d-apis")
  19. chrome_options.add_argument('--log-level=3')
  20.  
  21. driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options, service=chrome_service)
  22.  
  23. class scrape_shopee(QtWidgets.QMainWindow):
  24. def __init__(self, keyword, pages):
  25. super().__init__()
  26. self.ui = Ui_MainWindow()
  27. self.ui.setupUi(self)
  28. self.keyword = keyword
  29. self.pages = int(pages)
  30.  
  31. # 將要抓取的頁面連結存到urls[]裡
  32. def get_url(self):
  33. self.urls = []
  34. if self.pages == 1:
  35. url = f'https://shopee.tw/search?keyword={self.keyword}&page=0'
  36. self.urls.append(url)
  37. else:
  38. for i in range(0, self.pages - 1): # 蝦皮頁面是從page=0開始算,所以這邊做-1
  39. url = f'https://shopee.tw/search?keyword={self.keyword}&page={i}'
  40. self.urls.append(url)
  41.  
  42. # 抓取資料
  43. def scrape(self, url):
  44. driver.get(url) # 瀏覽器取得網頁連結
  45. time.sleep(5)
  46. for request in driver.requests:
  47. if request.response:
  48. if request.url.startswith('https://shopee.tw/api/v4/search/search_items?by=relevancy&keyword='): # 若網頁成功跳轉到目標頁面才開始執行
  49. response = request.response
  50. body = decode(response.body, response.headers.get('Content-Encoding', 'Identity'))
  51. decode_body = body.decode('utf8')
  52. json_data = json.loads(decode_body) # 將網頁資料全部存進json_data裡
  53.  
  54. data = []
  55. rows = json_data['items'] # 總共獲取幾筆資料
  56. for i in range(0, len(rows)): # 遍歷每一筆商品
  57. product_name = json_data['items'][i]['item_basic']['name'] # 商品標題
  58. price_min = str(json_data['items'][i]['item_basic']['price_min'])[:-5] # 商品最低價
  59. price_max = str(json_data['items'][i]['item_basic']['price_max'])[:-5] # 商品最高價
  60. historical_sold = json_data['items'][i]['item_basic']['historical_sold'] # 已售出
  61.  
  62. data.append(
  63. (product_name, price_min, price_max, historical_sold)
  64. )
  65. return data
  66.  
  67. # 建一個dataframe將資料存進去
  68. def data_frame(self, data):
  69. # '商品標題', '商品最低價', '商品最高價', '已售出'
  70. self.df = pd.DataFrame(data, columns = ['Product Title', 'Price Min', 'Price Max', 'Historical Sold'])
  71.  
  72. # 將抓取到的資料存進excel檔
  73. def save_to_xlsx(self):
  74. # excel檔名: Shopee_關鍵字名稱.xlsx
  75. file_name = f'Shopee_{self.keyword}'
  76. self.df.to_excel(f'{file_name}.xlsx', index = False)
  77.  
  78. return f'{file_name}.xlsx'
Advertisement
Add Comment
Please, Sign In to add comment