Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # scrape prices from adafruit.com — aka "why can't I sort by price?"
- # we love adafruit, but having no price sorting is nonsense.
- import csv, sys, re, concurrent.futures as cf
- from pathlib import Path
- import requests
- from bs4 import BeautifulSoup
- # the original process was partly bash, but that's not very portable
- import subprocess
# ---------------------------------------------------------------------------
# Phase 1: discover which product URLs are live.
# The id range below is only relevant to adafruit (four-digit product ids).
START = 1000
END = 9999
BASE = "https://www.adafruit.com/product"
OUTPUT = "working_links.txt"
def check_url(id_, base, file_handle):
    """Probe one candidate product URL and record it if it responds OK.

    Fix: the original shelled out to an external ``curl`` binary, which
    defeats the script's own portability goal and requires curl to be
    installed; this version uses ``requests`` (already a dependency of
    this script) with the same semantics as the old
    ``curl -sSfL --connect-timeout 5 --max-time 15 --retry 2`` call.

    Parameters:
        id_:          numeric product id appended to the base URL
        base:         base URL, e.g. "https://www.adafruit.com/product"
        file_handle:  writable text file; live URLs are appended one per line

    Side effects: prints the URL and appends it to ``file_handle`` on
    success; failures are silent (matching the original curl behavior).
    """
    url = f"{base}/{id_}"
    # curl --retry 2 means up to 3 attempts total; mirror that here.
    for _attempt in range(3):
        try:
            r = requests.get(
                url,
                timeout=(5, 15),        # (connect, read) like --connect-timeout/--max-time
                allow_redirects=True,   # like -L
                stream=True,            # headers are enough to judge liveness...
            )
            r.close()                   # ...so don't download the body (old: -o /dev/null)
            if r.ok:                    # like -f: status < 400 counts as success
                print(url)
                file_handle.write(url + "\n")
            return
        except requests.RequestException:
            continue  # transient failure: retry, then give up silently
# Open the results file once in append mode so earlier runs are preserved,
# then probe every candidate product id in the scan range.
with open(OUTPUT, "a") as link_file:
    for product_id in range(START, END + 1):
        check_url(product_id, BASE, link_file)
- #################### Parse the return data
- INPUT = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("working_links.txt")
- OUT = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("products.csv")
- CONCURRENCY = int(sys.argv[3]) if len(sys.argv) > 3 else 16
- session = requests.Session()
- session.headers.update({
- "User-Agent": "Mozilla/5.0 (compatible; pricer/1.0)"
- })
- timeout = (5, 15) # connect, read
- ######## NEW PRODUCTS since the last run?
- '''
- TODO: checksum working_links.txt? On a re-run, have the init step regenerate
- the working links, then compare the old checksum against the new checksum to
- detect newly added products.
- '''
def clean_money(s: str) -> str:
    """Reduce a price string like ``"$1,234.56"`` to bare ``"1234.56"``.

    Strips everything except digits and dots ($, commas, currency words,
    whitespace).  If more than one dot survives (malformed input such as
    ``"1.2.3"``), only the first dot is kept and the remaining digits are
    concatenated after it (``"1.23"``).

    Fix: the original comment claimed the result was normalized "to at
    most 2 decimals", which the code never did — decimals are preserved
    as-is.  Also dropped the redundant ``strip()`` (the regex already
    removes whitespace).

    Returns "" when the input contains no digits or dots at all.
    """
    s = re.sub(r"[^\d.]", "", s)
    if s.count(".") > 1:
        # malformed: keep the first dot only, fuse the rest of the digits
        head, _, tail = s.partition(".")
        s = head + "." + tail.replace(".", "")
    return s
def parse_one(url: str):
    """Fetch one product page and return a ``(url, title, price, error)`` row.

    On success the fourth field is ""; on any failure title/price come back
    empty and the fourth field carries "ExceptionName: message" so bad rows
    are easy to spot in the CSV.
    """
    try:
        resp = session.get(url, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()
        page = BeautifulSoup(resp.text, "html.parser")

        # Title: prefer the product-name heading, fall back to any <h1>;
        # collapse runs of whitespace into single spaces.
        heading = page.select_one("h1.products_name") or page.select_one("h1")
        title = " ".join(heading.get_text(" ", strip=True).split()) if heading else ""

        # Price: look for itemprop="price" markup, scoped to #prod-price first.
        # Prefer the structured "content" attribute over the visible text.
        price = ""
        tag = page.select_one('#prod-price [itemprop="price"]') or page.select_one('[itemprop="price"]')
        if tag:
            attr_val = tag["content"] if tag.has_attr("content") else ""
            if attr_val.strip():
                price = clean_money(attr_val)
            else:
                price = clean_money(tag.get_text(" ", strip=True))

        return (url, title, price, "")
    except Exception as exc:
        # Keep the row but flag it; an empty price marks it as suspect.
        return (url, "", "", f"{type(exc).__name__}: {exc}")
def main():
    """Read the link list, scrape every product concurrently, write the CSV."""
    urls = [line.strip() for line in INPUT.read_text().splitlines() if line.strip()]
    with OUT.open("w", newline="", encoding="utf-8") as out_fh:
        writer = csv.writer(out_fh)
        # keep the error column so failed fetches stay visible for debugging
        writer.writerow(["url", "title", "price", "error"])
        with cf.ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
            writer.writerows(pool.map(parse_one, urls, chunksize=20))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment