Try95th

[with suggestions] so_q_74511414

Nov 21st, 2022
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
from datetime import datetime, date

now = datetime.now()
today = date.today()


class PetBarnProdScraper:

    all_info = []  # class-level list shared by all instances

    def fetch(self, url):
        print(f"HTTP GET request to URL: {url}", end="")
        res = requests.get(url)
        print(f" | Status Code: {res.status_code}")

        return res

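    #!# [optional, untested sketch] fetch() above could fail fast on slow or broken
    #!# pages instead of handing an error page to parse(); e.g.:
    # res = requests.get(url, timeout=30)
    # res.raise_for_status()  # raises on 4xx/5xx responses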
    def parse(self, response):
        soup = BeautifulSoup(response.text, "html.parser")

        #!# [below] can be outside loop - doesn't change inside loop
        data = (
            soup.select('script[type="text/x-magento-init"]')[3]
            .text.replace("\n", "")
            .strip()
        )
        data_json = json.loads(data)
        data_j = json.loads(
            data_json["*"][
                "Overdose_AdobeAnalytics/js/view/datalayer"
            ]["datalayer"][0]
        )
        #!# [above] can be outside loop - doesn't change inside loop
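        #!# [optional, untested sketch] the hard-coded [3] above is position-dependent;
        #!# matching the datalayer script directly would be less brittle, e.g.:
        # data = soup.select_one(
        #     'script[type="text/x-magento-init"]:-soup-contains("datalayer")'
        # ).text.replace("\n", "").strip()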

        prodConts = soup.select('div.product-item-info[id^="product-id-"]')
        prodsJ = data_j["PLP"]["products"]

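        #!# note: zip() in the loop below stops at the shorter list, so if the HTML
        #!# cards and the datalayer products ever get out of sync, the extras are
        #!# silently dropped; uncomment the check to catch that early
        # assert len(prodConts) == len(prodsJ), (
        #     f"{len(prodConts)} product cards vs {len(prodsJ)} datalayer products"
        # )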
        for prodCont, prodJ in zip(prodConts, prodsJ):
            #!# [below] moved inside loop
            prod_id = prodCont.get("id").split("-")[-1]

            prodLink = prodCont.select_one("a.product-item-link")
            if prodLink:
                product_url = prodLink.get("href")
                title = " ".join(prodLink.text.split())  #!# reduce whitespace
            else:
                product_url = title = "N/A"

            old_price = prodCont.select_one("span.old-price")
            last_price = (  #!# reduce whitespace
                " ".join(old_price.text.split()) if old_price else "N/A"
            )
            ratings = prodCont.select_one("div.rating-result[title]")
            ratings_count = ratings.get("title") if ratings else "N/A"
            no_of_reviews = prodCont.select_one("a.action.view")
            reviews_count = (  #!# reduce whitespace
                " ".join(no_of_reviews.text.split()) if no_of_reviews else "N/A"
            )
            #!# [above] moved inside loop

            # for idx in range(len(titles)): #!# loops merged
            # try: ... # except... #!# no longer needed, already fixed above
            d = {
                "Scraped_Date": now.strftime("%m/%d/%Y"),
                "Scraped_Time": now.strftime("%H:%M:%S"),
                "product_id": prod_id,  #!# added just bc
                "product_name": title,  # titles[idx], #!# loops merged
                "price": prodJ["productPrice"],  #!# lists zipped
                "old_price": last_price,  #!# loops merged
                "ratings": ratings_count,  #!# loops merged
                "number_of_reviews": reviews_count,  #!# loops merged
                "productSKU": prodJ["productSKU"],  #!# lists zipped
                "productSize": prodJ["productSize"],  #!# lists zipped
                "priceWithoutTax": prodJ["productPriceLessTax"],  #!# lists zipped
                "lifeStage": prodJ["lifeStage"],  #!# lists zipped
            }
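            #!# note: product_url is collected above but never stored;
            #!# uncomment below if you want it in the output as well
            # d["product_url"] = product_url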

            #!# [below] unlooped
            details = soup.select_one(
                f"script:-soup-contains('[data-role=swatch-option-{prod_id}]')"
            )
            if details:
                json_details = json.loads(details.text.replace("\n", "").strip())
                dataJC = json_details[f"[data-role=swatch-option-{prod_id}]"][
                    "Magento_Swatches/js/swatch-renderer"
                ]["jsonConfig"]
                productId = dataJC["productId"]
                jcInfs = [
                    {
                        "productId": productId,
                        "optionKey": k,
                        "sku": "?",
                        "index": v.get("1299"),
                    }
                    for k, v in dataJC["index"].items()
                ]
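                #!# note: "1299" above looks like a site-specific swatch attribute id
                #!# (presumably the size option); if the site changes it, "index"
                #!# simply comes back as None rather than raising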
                orInfs = [
                    ("optionPrices", "amount", "reverseNest"),
                    ("dynamic", "value", "nest1"),
                    ("labels", "", "reverseNest"),
                    ("hasEndDate", "", "noNesting"),
                ]
                relevInfs = []
                for kk, vk, nt in orInfs:
                    if kk not in dataJC:
                        continue
                    if nt == "noNesting":
                        relevInfs += [(kk, vk, dataJC[kk])]
                        continue
                    if nt == "nest1":
                        # the comprehension's kk (the inner key) becomes the column name
                        relevInfs += [(kk, vk, vd) for kk, vd in dataJC[kk].items()]
                        continue
                    if nt != "reverseNest":
                        ## can put a default action here
                        continue
                    ## nt == 'reverseNest'
                    orInf = {}
                    for pk, po in dataJC[kk].items():
                        for kpo, vpo in po.items():
                            if kpo not in orInf:
                                orInf[kpo] = {}
                            orInf[kpo][pk] = vpo

                    relevInfs += [(kk, vk, vi) for kk, vi in orInf.items()]

                # relevInfs holds (column_name, preferred_subkey, {optionKey: value}) tuples
                for i, j in enumerate(jcInfs):
                    for kk, vk, vd in relevInfs:
                        if j["optionKey"] not in vd:
                            continue
                        relevInf = vd[j["optionKey"]]
                        if not isinstance(relevInf, dict):
                            j[kk] = relevInf
                        elif vk in relevInf and relevInf[vk]:
                            j[kk] = relevInf[vk]
                    # combine with the main variation's dict
                    jcInfs[i] = {**d, **{jk: jv for jk, jv in j.items() if jk not in d}}
                for j in jcInfs:
                    self.all_info.append(j)
            else:
                self.all_info.append(d)
                #!# if you append outside the else as well, you get a repeated row
            #!# [above] unlooped

            #!# [remove last else block if you uncomment below]
            # self.all_info.append(d)  #!# uncomment if you WANT a repeated row
            #!# ? extra row for main variant ?

    def to_csv(self):

        df = pd.DataFrame(self.all_info).fillna("")

        #!# if you want to push some columns to the left
        firstCols = [  #!# adjust as you like
            'Scraped_Date', 'Scraped_Time', 'product_id', 'productSKU', 'sku',
            'product_name', 'price', 'finalPrice', 'old_price', 'oldPrice',
            'ratings', 'number_of_reviews'
        ]  #!# adjust as you like
        lfc = len(firstCols)
        oldCols = [(
            firstCols.index(c) if c in firstCols else lfc, i, c
        ) for i, c in enumerate(df.columns)]
        newCols = [oc[2] for oc in sorted(oldCols, key=lambda l: (l[0], l[1]))]
        df = df[newCols]
        #!# remove the above if you don't want to re-order columns
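        #!# [optional, untested sketch] a rough one-liner equivalent of the re-ordering above:
        # df = df[[c for c in firstCols if c in df.columns]
        #         + [c for c in df.columns if c not in firstCols]]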

        df.to_csv(f"{today}_petbarn.csv", index=False)

        print(f'Stored results to "{today}_petbarn.csv"')  #!# just

    def run(self):
        for i in range(1, 2):  # total_number of pages
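            #!# note: range(1, 2) only ever fetches page 1; the upper bound is
            #!# exclusive, so e.g. range(1, 6) would cover the first 5 pages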
            url = f"https://www.petbarn.com.au/dogs/dog-food/dry-dog-food?p={i}"

            response = self.fetch(url)

            self.parse(response)

        self.to_csv()


if __name__ == "__main__":
    scraper = PetBarnProdScraper()
    scraper.run()