Guest User

scrape_shopee_controller.py

a guest
Dec 26th, 2022
450
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.02 KB | None | 0 0
  1. import pandas as pd
  2. from seleniumwire import webdriver
  3. from seleniumwire.utils import decode
  4. from selenium.webdriver.chrome.service import Service as ChromeSerive
  5. from subprocess import CREATE_NO_WINDOW
  6. import json
  7. import time
  8.  
  9. from scrape_ui import Ui_MainWindow
  10. from PyQt5 import QtWidgets
  11.  
  # Module-level browser setup: one headless Chrome instance, driven through
  # Selenium Wire so the network requests made by each page can be inspected.
  chrome_service = ChromeSerive('chromedriver')
  # Windows-only subprocess flag; presumably keeps the chromedriver console
  # window from popping up next to the GUI.
  chrome_service.creation_flags = CREATE_NO_WINDOW

  chrome_options = webdriver.ChromeOptions()
  for _argument in (
      '--headless',
      '--no-sandbox',
      '--disable-dev-shm-usage',
      '--disable-3d-apis',
      '--log-level=3',
  ):
      chrome_options.add_argument(_argument)

  # The driver path is already carried by ``chrome_service``. The positional
  # ``executable_path`` and the ``chrome_options=`` keyword are deprecated in
  # Selenium 4 and rejected by newer releases, so only ``options=`` and
  # ``service=`` are passed.
  driver = webdriver.Chrome(options=chrome_options, service=chrome_service)
  22.  
  class scrape_shopee(QtWidgets.QMainWindow):
      """Main window that scrapes Shopee search results into an Excel file.

      Constructing an instance runs the whole pipeline: build the search-page
      URLs for ``keyword``, load each page in the module-level ``driver``,
      collect the products found in the captured search API response, put them
      in a DataFrame and save it as ``Shopee_<keyword>.xlsx``.

      Args:
          keyword: Search keyword; also used in the output file name.
          pages: Number of result pages to scrape (anything ``int()`` accepts).
      """

      # Only captured requests whose URL starts with this prefix are parsed.
      SEARCH_API_PREFIX = 'https://shopee.tw/api/v4/search/search_items?by=relevancy&keyword='

      def __init__(self, keyword, pages):
          super().__init__()
          self.ui = Ui_MainWindow()
          self.ui.setupUi(self)
          self.keyword = keyword
          self.pages = int(pages)

          self.get_url()  # build the list of search-page URLs to visit
          dt_all = []  # product rows accumulated over all pages
          for url in self.urls:
              dt_all.extend(self.scrape(url))

          self.data_frame(dt_all)  # build the DataFrame
          self.save_to_xlsx()  # write the DataFrame to an Excel file

      def get_url(self):
          """Store one search-page URL per requested page in ``self.urls``.

          Page numbers in the URL start at 0, so ``pages`` pages are numbered
          ``0 .. pages - 1``. A page count of zero or less yields an empty list.
          """
          # Function-scope import: keeps this block self-contained without
          # touching the file's import section.
          from urllib.parse import quote

          self.ui.disply_text('獲取連結中')
          # Percent-encode the keyword so characters such as '&', '#' or spaces
          # cannot corrupt the query string.
          keyword = quote(str(self.keyword), safe='')
          self.urls = [
              f'https://shopee.tw/search?keyword={keyword}&page={page}'
              for page in range(self.pages)
          ]

      def scrape(self, url):
          """Load ``url`` and return the products from its search API response.

          Args:
              url: Search-page URL to open in the module-level ``driver``.

          Returns:
              A list of ``(name, price_min, price_max, historical_sold)`` tuples
              taken from the first captured response whose request URL starts
              with ``SEARCH_API_PREFIX``; an empty list when no such response
              was captured or it carries no items.
          """
          self.ui.disply_text('抓取資料中')
          # Selenium Wire keeps every request captured so far; clear them so the
          # loop below cannot pick up the response of a previously loaded page.
          del driver.requests
          driver.get(url)
          time.sleep(5)  # give the page time to issue its search API request

          for request in driver.requests:
              response = request.response
              if not response or not request.url.startswith(self.SEARCH_API_PREFIX):
                  continue

              body = decode(response.body, response.headers.get('Content-Encoding', 'Identity'))
              json_data = json.loads(body.decode('utf8'))

              data = []
              # A missing or null 'items' entry is treated as "no products".
              for item in json_data.get('items') or []:
                  basic = item['item_basic']
                  data.append(
                      (
                          basic['name'],  # product title
                          # The last five digits are dropped from both prices;
                          # presumably the API reports them scaled by 100000.
                          str(basic['price_min'])[:-5],  # lowest price
                          str(basic['price_max'])[:-5],  # highest price
                          basic['historical_sold'],  # units sold
                      )
                  )
              return data

          # No search response was captured for this page.
          return []

      def data_frame(self, data):
          """Build ``self.df`` from the scraped product tuples.

          Args:
              data: Iterable of ``(title, price_min, price_max, sold)`` rows.
          """
          self.ui.disply_text('建立資料表格')
          columns = ['Product Title', 'Price Min', 'Price Max', 'Historical Sold']
          self.df = pd.DataFrame(data, columns=columns)

      def save_to_xlsx(self):
          """Write ``self.df`` to ``Shopee_<keyword>.xlsx`` and report it in the UI."""
          file_name = f'Shopee_{self.keyword}'
          self.df.to_excel(f'{file_name}.xlsx', index=False)
          self.ui.disply_text(f'儲存完畢,檔案名稱為"{file_name}.xlsx"')
Advertisement
Add Comment
Please, Sign In to add comment