Try95th

[just corrections] so_q_74511414

Nov 21st, 2022 (edited)
157
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.37 KB | None | 0 0
  1. import requests
  2. import pandas as pd
  3. from bs4 import BeautifulSoup
  4. import json
  5. import csv
  6. from datetime import datetime
  7. from datetime import date
  8.  
  9. now = datetime.now()
  10. today = date.today()
  11.  
  12.  
  13. class PetBarnProdScraper:
  14.  
  15.     all_info = []
  16.  
  17.     def fetch(self, url):
  18.         print(f"HTTP GET request to URL: {url}", end="")
  19.         res = requests.get(url)
  20.         print(f" | Status Code: {res.status_code}")
  21.  
  22.         return res
  23.  
  24.     def parse(self, response):
  25.         soup = BeautifulSoup(response.text, "html.parser")
  26.         product_urls = [a.get("href") for a in soup.select("a.product-item-link")]
  27.         product_ids = [
  28.             pid.get("id").split("-")[-1] for pid in soup.select("div.product-item-info")
  29.         ]
  30.         titles = [
  31.             a.text.replace("\n", "").strip() for a in soup.select("a.product-item-link")
  32.         ]
  33.         old_price = [
  34.             p.select_one("span.price").text for p in soup.select("span.old-price")
  35.         ]
  36.         ratings = [r.get("title") for r in soup.select("div.rating-result")]
  37.         no_of_reviews = [review.text for review in soup.select("a.action.view")]
  38.         data = (
  39.             soup.select('script[type="text/x-magento-init"]')[3]
  40.             .text.replace("\n", "")
  41.             .strip()
  42.         )
  43.         data_json = json.loads(data)
  44.         data_j = json.loads(
  45.             data_json["*"]["Overdose_AdobeAnalytics/js/view/datalayer"]["datalayer"][0]
  46.         )
  47.  
  48.         for idx in range(len(titles)):
  49.             try:
  50.                 ratings_count = ratings[idx]
  51.                 reviews_count = no_of_reviews[idx]
  52.                 last_price = old_price[idx]
  53.             except:
  54.                 ratings_count = "N/A"
  55.                 reviews_count = "N/A"
  56.                 last_price = "N/A"
  57.             d = {
  58.                 "Scraped_Date": now.strftime("%m/%d/%Y, %H:%M:%S").split(",")[0],
  59.                 "Scraped_Time": now.strftime("%m/%d/%Y, %H:%M:%S").split(",")[1],
  60.                 "product_name": titles[idx],
  61.                 "price": data_j["PLP"]["products"][idx]["productPrice"],
  62.                 "old_price": last_price,
  63.                 "ratings": ratings_count,
  64.                 "number_of_reviews": reviews_count,
  65.                 "productSKU": data_j["PLP"]["products"][idx]["productSKU"],
  66.                 "productSize": data_j["PLP"]["products"][idx]["productSize"],
  67.                 "priceWithoutTax": data_j["PLP"]["products"][idx][
  68.                     "productPriceLessTax"
  69.                 ],
  70.                 "lifeStage": data_j["PLP"]["products"][idx]["lifeStage"],
  71.             }
  72.  
  73.             #!# unlooped below
  74.             prod_id = product_ids[idx]
  75.             details = soup.select_one(
  76.                 f"script:-soup-contains('[data-role=swatch-option-{prod_id}]')"
  77.             )
  78.             if details:
  79.                 json_details = json.loads(details.text.replace("\n", "").strip())
  80.                 dataJC = json_details[f"[data-role=swatch-option-{prod_id}]"][
  81.                     "Magento_Swatches/js/swatch-renderer"
  82.                 ]["jsonConfig"]
  83.                 productId = dataJC["productId"]
  84.                 jcInfs = [
  85.                     {
  86.                         "productId": productId,
  87.                         "optionKey": k,
  88.                         "sku": "?",
  89.                         "index": v["1299"] if "1299" in v else None,
  90.                     }
  91.                     for k, v in dataJC["index"].items()
  92.                 ]
  93.                 orInfs = [
  94.                     ("optionPrices", "amount", "reverseNest"),
  95.                     ("dynamic", "value", "nest1"),
  96.                     ("labels", "", "reverseNest"),
  97.                     ("hasEndDate", "", "noNesting"),
  98.                 ]
  99.                 relevInfs = []
  100.                 for kk, vk, nt in orInfs:
  101.                     if kk not in dataJC:
  102.                         continue
  103.                     if nt == "noNesting":
  104.                         relevInfs += [(kk, vk, dataJC[kk])]
  105.                         continue
  106.                     if nt == "nest1":
  107.                         relevInfs += [(kk, vk, vd) for kk, vd in dataJC[kk].items()]
  108.                         continue
  109.                     if nt != "reverseNest":
  110.                         ## can put a default action here
  111.                         continue
  112.                     ## nt == 'reverseNest'
  113.                     orInf = {}
  114.                     for pk, po in dataJC[kk].items():
  115.                         for kpo, vpo in po.items():
  116.                             if kpo not in orInf:
  117.                                 orInf[kpo] = {}
  118.                             orInf[kpo][pk] = vpo
  119.  
  120.                     relevInfs += [(kk, vk, vi) for kk, vi in orInf.items()]
  121.  
  122.                 for i, j in enumerate(jcInfs):
  123.                     for kk, vk, vd in relevInfs:
  124.                         if j["optionKey"] not in vd:
  125.                             continue
  126.                         relevInf = vd[j["optionKey"]]
  127.                         if type(relevInf) != dict:
  128.                             j[kk] = relevInf
  129.                         elif vk in relevInf and relevInf[vk]:
  130.                             j[kk] = relevInf[vk]
  131.                     # combine with main variation
  132.                     jcInfs[i] = {
  133.                         k: v
  134.                         for k, v in (
  135.                             list(d.items())
  136.                             + [(jk, jv) for jk, jv in j.items() if jk not in d]
  137.                         )
  138.                     }
  139.                 for j in jcInfs:
  140.                     self.all_info.append(j)
  141.  
  142.             # else: self.all_info.append(d) #!# if you want to not repeat any prod/variant
  143.             #!# unlooped above
  144.        
  145.             #!# is this intentional?    
  146.             self.all_info.append(d)
  147.             #!# didn't remove but this will create additional row per product
  148.             #!# unless wrapped into else block from if details
  149.  
  150.     def to_csv(self):
  151.  
  152.         df = pd.DataFrame(self.all_info).fillna("")
  153.  
  154.         df.to_csv(f"{today}_petbarn.csv", index=False)
  155.  
  156.         print(f'Stored results to "{today}_petbarn.csv"') #!# just
  157.  
  158.     def run(self):
  159.         for i in range(1, 2):  # total_number of pages
  160.             url = f"https://www.petbarn.com.au/dogs/dog-food/dry-dog-food?p={i}"
  161.  
  162.             response = self.fetch(url)
  163.  
  164.             self.parse(response)
  165.  
  166.         self.to_csv()
  167.  
  168. if __name__ == "__main__":
  169.     scraper = PetBarnProdScraper()
  170.     scraper.run()
Advertisement
Add Comment
Please, Sign In to add comment