Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import requests
- import pandas as pd
- from bs4 import BeautifulSoup
- import json
- import csv
- from datetime import datetime
- from datetime import date
# Timestamps for this run, captured once at import time and reused for
# every scraped row and for the output file name.
now = datetime.now()
today = date.today()
class PetBarnProdScraper:
    """Scrape Petbarn dry-dog-food listing pages and export the rows to CSV."""

    def __init__(self):
        # Per-instance accumulator. The original class-level `all_info = []`
        # was shared by every instance, so rows from one scraper object leaked
        # into any other created in the same process.
        self.all_info = []
- def fetch(self, url):
- print(f"HTTP GET request to URL: {url}", end="")
- res = requests.get(url)
- print(f" | Status Code: {res.status_code}")
- return res
- def parse(self, response):
- soup = BeautifulSoup(response.text, "html.parser")
- #!# [below] can be outside loop - doesn't change inside loop
- data = (
- soup.select('script[type="text/x-magento-init"]')[3]
- .text.replace("\n", "")
- .strip()
- )
- data_json = json.loads(data)
- data_j = json.loads(
- data_json["*"][
- "Overdose_AdobeAnalytics/js/view/datalayer"
- ]["datalayer"][0]
- )
- #!# [above] can be outside loop - doesn't change inside loop
- prodConts = soup.select('div.product-item-info[id^="product-id-"]')
- prodsJ = data_j["PLP"]["products"]
- for prodCont, prodJ in zip(prodConts, prodsJ):
- #!# [below] moved inside loop
- prod_id = prodCont.get("id").split("-")[-1]
- prodLink = prodCont.select_one("a.product-item-link")
- if prodLink:
- product_url = prodLink.get("href")
- title = ' '.join( #!# reduce whitescpace
- [w for w in prodLink.text.split() if w])
- else: product_url = title = 'N/A'
- old_price = prodCont.select_one("span.old-price")
- last_price = ' '.join( #!# reduce whitescpace
- [w for w in old_price.text.split() if w]
- ) if old_price else 'N/A'
- ratings = prodCont.select_one("div.rating-result[title]")
- ratings_count = ratings.get("title") if ratings else 'N/A'
- no_of_reviews = prodCont.select_one("a.action.view")
- reviews_count = ' '.join( #!# reduce whitescpace
- [w for w in no_of_reviews.text.split() if w]
- ) if no_of_reviews else 'N/A'
- #!# [above] moved inside loop
- # for idx in range(len(titles)): #!# loops merged
- # try: ... # except... #!# no longer needed, already fixed above
- d = {
- "Scraped_Date": now.strftime("%m/%d/%Y, %H:%M:%S").split(",")[0],
- "Scraped_Time": now.strftime("%m/%d/%Y, %H:%M:%S").split(",")[1],
- "product_id": prod_id, #!# added just bc
- "product_name": title, #titles[idx], #!# loops merged
- "price": prodJ["productPrice"], #!# lists zipped
- "old_price": last_price, #!# loops merged
- "ratings": ratings_count, #!# loops merged
- "number_of_reviews": reviews_count, #!# loops merged
- "productSKU": prodJ["productSKU"], #!# lists zipped
- "productSize": prodJ["productSize"], #!# lists zipped
- "priceWithoutTax": prodJ["productPriceLessTax"], #!# lists zipped
- "lifeStage": prodJ["lifeStage"], #!# lists zipped
- }
- #!# [below] unlooped
- details = soup.select_one(
- f"script:-soup-contains('[data-role=swatch-option-{prod_id}]')"
- )
- if details:
- json_details = json.loads(details.text.replace("\n", "").strip())
- dataJC = json_details[f"[data-role=swatch-option-{prod_id}]"][
- "Magento_Swatches/js/swatch-renderer"
- ]["jsonConfig"]
- productId = dataJC["productId"]
- jcInfs = [
- {
- "productId": productId,
- "optionKey": k,
- "sku": "?",
- "index": v["1299"] if "1299" in v else None,
- }
- for k, v in dataJC["index"].items()
- ]
- orInfs = [
- ("optionPrices", "amount", "reverseNest"),
- ("dynamic", "value", "nest1"),
- ("labels", "", "reverseNest"),
- ("hasEndDate", "", "noNesting"),
- ]
- relevInfs = []
- for kk, vk, nt in orInfs:
- if kk not in dataJC:
- continue
- if nt == "noNesting":
- relevInfs += [(kk, vk, dataJC[kk])]
- continue
- if nt == "nest1":
- relevInfs += [(kk, vk, vd) for kk, vd in dataJC[kk].items()]
- continue
- if nt != "reverseNest":
- ## can put a default action here
- continue
- ## nt == 'reverseNest'
- orInf = {}
- for pk, po in dataJC[kk].items():
- for kpo, vpo in po.items():
- if kpo not in orInf:
- orInf[kpo] = {}
- orInf[kpo][pk] = vpo
- relevInfs += [(kk, vk, vi) for kk, vi in orInf.items()]
- for i, j in enumerate(jcInfs):
- for kk, vk, vd in relevInfs:
- if j["optionKey"] not in vd:
- continue
- relevInf = vd[j["optionKey"]]
- if type(relevInf) != dict:
- j[kk] = relevInf
- elif vk in relevInf and relevInf[vk]:
- j[kk] = relevInf[vk]
- # combine with main variation
- jcInfs[i] = {
- k: v
- for k, v in (
- list(d.items())
- + [(jk, jv) for jk, jv in j.items() if jk not in d]
- )
- }
- for j in jcInfs:
- self.all_info.append(j)
- else:
- self.all_info.append(d)
- #!# if you do it outside else, you can have a repeated row
- #!# [above] unlooped
- #!# [remove last else block if you uncomment below]
- # self.all_info.append(d) #!# uncomment if you WANT a repeated row
- #!# ? extra row for main variant ?
- def to_csv(self):
- df = pd.DataFrame(self.all_info).fillna("")
- #!# if you want to push some columns to the left
- firstCols = [ #!# adjust as you like
- 'Scraped_Date', 'Scraped_Time', 'product_id', 'productSKU', 'sku',
- 'product_name', 'price', 'finalPrice', 'old_price', 'oldPrice',
- 'ratings', 'number_of_reviews'
- ] #!# adjust as you like
- lfc = len(firstCols)
- oldCols = [(
- firstCols.index(c) if c in firstCols else lfc, i, c
- ) for i, c in enumerate(list(df.columns.values))]
- newCols = [oc[2] for oc in sorted(oldCols, key=lambda l: (l[0], l[1]))]
- df = df[newCols]
- #!# remove the above if you don't want to re-order columns
- df.to_csv(f"{today}_petbarn.csv", index=False)
- print(f'Stored results to "{today}_petbarn.csv"') #!# just
- def run(self):
- for i in range(1, 2): # total_number of pages
- url = f"https://www.petbarn.com.au/dogs/dog-food/dry-dog-food?p={i}"
- response = self.fetch(url)
- self.parse(response)
- self.to_csv()
if __name__ == "__main__":
    # Entry point: build a scraper and run the full fetch -> parse -> CSV pipeline.
    PetBarnProdScraper().run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement