Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
# Standard library
import csv
import json
from datetime import date, datetime  # merged the two separate datetime imports

# Third party
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Timestamp of this scrape run, captured once at import time. `now` feeds the
# Scraped_Date / Scraped_Time columns; `today` names the output CSV file.
# NOTE(review): naive local time (no timezone) — confirm that is intended.
now = datetime.now()
today = date.today()
class PetBarnProdScraper:
    """Scrape product listings (name, prices, ratings, swatch variants) from
    petbarn.com.au dry-dog-food category pages and dump them to a CSV file.
    """

    def __init__(self):
        # Accumulated product/variant rows. Kept per-instance: the original
        # used a shared class-level list, which leaked rows between instances.
        self.all_info = []

    def fetch(self, url):
        """HTTP GET *url* and return the ``requests.Response``, logging the
        request and its status code."""
        print(f"HTTP GET request to URL: {url}", end="")
        # timeout added so a stalled connection cannot hang the scraper forever
        res = requests.get(url, timeout=30)
        print(f" | Status Code: {res.status_code}")
        return res

    def parse(self, response):
        """Extract product rows (plus one row per swatch variant, when
        present) from one category-listing page and append them to
        ``self.all_info``.

        :param response: ``requests.Response`` for a product-listing page.
        """
        soup = BeautifulSoup(response.text, "html.parser")

        # Parallel lists indexed by product position on the page. Not every
        # product has an old price / rating / review link, so some of these
        # lists are shorter than `titles` — guarded per-list below.
        product_ids = [
            pid.get("id").split("-")[-1] for pid in soup.select("div.product-item-info")
        ]
        titles = [
            a.text.replace("\n", "").strip() for a in soup.select("a.product-item-link")
        ]
        old_price = [
            p.select_one("span.price").text for p in soup.select("span.old-price")
        ]
        ratings = [r.get("title") for r in soup.select("div.rating-result")]
        no_of_reviews = [review.text for review in soup.select("a.action.view")]

        # The 4th x-magento-init script carries the analytics datalayer with
        # structured product data. NOTE(review): the hard-coded index 3 is
        # position-dependent and will break if the page layout changes.
        data = (
            soup.select('script[type="text/x-magento-init"]')[3]
            .text.replace("\n", "")
            .strip()
        )
        data_json = json.loads(data)
        # The datalayer payload is itself a JSON-encoded string.
        data_j = json.loads(
            data_json["*"]["Overdose_AdobeAnalytics/js/view/datalayer"]["datalayer"][0]
        )

        for idx, title in enumerate(titles):
            # Per-list bounds checks (the original guarded all three with one
            # bare except, so a missing old price also wiped a valid rating).
            # NOTE(review): `old_price` holds only discounted products, so its
            # positions may not align with `titles` — verify against the page.
            ratings_count = ratings[idx] if idx < len(ratings) else "N/A"
            reviews_count = no_of_reviews[idx] if idx < len(no_of_reviews) else "N/A"
            last_price = old_price[idx] if idx < len(old_price) else "N/A"

            prod = data_j["PLP"]["products"][idx]
            d = {
                "Scraped_Date": now.strftime("%m/%d/%Y"),
                # leading space preserved from the original "date, time" split
                "Scraped_Time": now.strftime(" %H:%M:%S"),
                "product_name": title,
                "price": prod["productPrice"],
                "old_price": last_price,
                "ratings": ratings_count,
                "number_of_reviews": reviews_count,
                "productSKU": prod["productSKU"],
                "productSize": prod["productSize"],
                "priceWithoutTax": prod["productPriceLessTax"],
                "lifeStage": prod["lifeStage"],
            }

            # ---- per-product swatch (variant) data -------------------------
            prod_id = product_ids[idx]
            details = soup.select_one(
                f"script:-soup-contains('[data-role=swatch-option-{prod_id}]')"
            )
            if details:
                json_details = json.loads(details.text.replace("\n", "").strip())
                dataJC = json_details[f"[data-role=swatch-option-{prod_id}]"][
                    "Magento_Swatches/js/swatch-renderer"
                ]["jsonConfig"]
                productId = dataJC["productId"]
                # One record per variant option key.
                # NOTE(review): "1299" is presumably the Magento attribute id
                # for the size option — confirm against the site config.
                jcInfs = [
                    {
                        "productId": productId,
                        "optionKey": k,
                        "sku": "?",
                        "index": v["1299"] if "1299" in v else None,
                    }
                    for k, v in dataJC["index"].items()
                ]
                # (jsonConfig key, value sub-key, nesting shape) describing
                # how each extra jsonConfig section maps option keys to values.
                orInfs = [
                    ("optionPrices", "amount", "reverseNest"),
                    ("dynamic", "value", "nest1"),
                    ("labels", "", "reverseNest"),
                    ("hasEndDate", "", "noNesting"),
                ]
                relevInfs = []
                for kk, vk, nt in orInfs:
                    if kk not in dataJC:
                        continue
                    if nt == "noNesting":
                        # Flat value: keep the section as-is.
                        relevInfs += [(kk, vk, dataJC[kk])]
                        continue
                    if nt == "nest1":
                        # One nesting level: {subKey: {optionKey: value}}.
                        relevInfs += [
                            (sub_key, vk, sub_val)
                            for sub_key, sub_val in dataJC[kk].items()
                        ]
                        continue
                    if nt != "reverseNest":
                        # Unknown nesting type: skip (default action slot).
                        continue
                    # reverseNest: dataJC[kk] is {optionKey: {prop: value}};
                    # invert it to {prop: {optionKey: value}} so lookups below
                    # can go option-key-first.
                    orInf = {}
                    for pk, po in dataJC[kk].items():
                        for kpo, vpo in po.items():
                            if kpo not in orInf:
                                orInf[kpo] = {}
                            orInf[kpo][pk] = vpo
                    relevInfs += [
                        (inv_key, vk, inv_val) for inv_key, inv_val in orInf.items()
                    ]
                for i, j in enumerate(jcInfs):
                    # Pull every relevant section's value for this option key.
                    for kk, vk, vd in relevInfs:
                        if j["optionKey"] not in vd:
                            continue
                        relevInf = vd[j["optionKey"]]
                        if not isinstance(relevInf, dict):
                            j[kk] = relevInf
                        elif vk in relevInf and relevInf[vk]:
                            j[kk] = relevInf[vk]
                    # Merge variant fields into a copy of the main product
                    # row; main-row values win on key collisions.
                    jcInfs[i] = {
                        **d,
                        **{jk: jv for jk, jv in j.items() if jk not in d},
                    }
                for j in jcInfs:
                    self.all_info.append(j)

            # NOTE(review): the base row is appended even when variant rows
            # were added above, so swatch products get one extra row. Kept to
            # preserve the original output; wrap this in an `else` of
            # `if details:` to de-duplicate.
            self.all_info.append(d)

    def to_csv(self):
        """Write all accumulated rows to ``<date>_petbarn.csv`` (blank cells
        for fields a row is missing)."""
        df = pd.DataFrame(self.all_info).fillna("")
        df.to_csv(f"{today}_petbarn.csv", index=False)
        print(f'Stored results to "{today}_petbarn.csv"')

    def run(self):
        """Scrape each listing page in turn, then write the combined CSV."""
        for page in range(1, 2):  # total number of pages to scrape
            url = f"https://www.petbarn.com.au/dogs/dog-food/dry-dog-food?p={page}"
            response = self.fetch(url)
            self.parse(response)
        self.to_csv()
# Script entry point: build a scraper and run the full fetch/parse/export cycle.
if __name__ == "__main__":
    PetBarnProdScraper().run()
Advertisement
Add Comment
Please sign in to add a comment.