j0h

adafruitPrices.py

j0h
Aug 24th, 2025
229
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.57 KB | None | 0 0
#!/usr/bin/env python3
# Scrape prices off adafruit.com -- the store offers no sort-by-price,
# so we build our own list.  (We love Adafruit, but no price sorting is nonsense.)

import csv, sys, re, concurrent.futures as cf
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# The original workflow was partly bash; subprocess kept it portable-ish.
import subprocess
################### Stage 1: discover the live product URLs ###################
# Product-id range to probe; only relevant to adafruit.com's catalogue.
START = 1000
END = 9999
BASE="https://www.adafruit.com/product"
OUTPUT = "working_links.txt"     # live URLs are appended here, one per line
  21. def check_url(id_, base, file_handle):
  22.     url = f"{base}/{id_}"
  23.     cp = subprocess.run(
  24.         ["curl", "-sSfL", "--connect-timeout", "5", "--max-time", "15", "--retry", "2", "-o", "/dev/null",
  25.             url,
  26.         ],
  27.         check=False,
  28.         stdout=subprocess.DEVNULL,
  29.         stderr=subprocess.DEVNULL
  30.     )
  31.     if cp.returncode == 0:        # only on success
  32.         print(url)
  33.         file_handle.write(url + "\n")
  34.  
  35. # open the output file once, append mode
  36. with open(OUTPUT, "a") as f:
  37.     for ID in range(START, END + 1):
  38.         check_url(ID, BASE, f)
  39.        
#################### Stage 2: parse the discovered pages ####################
# CLI: script.py [links_file] [csv_out] [concurrency]
INPUT = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("working_links.txt")  # list of live URLs
OUT   = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("products.csv")       # CSV destination
CONCURRENCY = int(sys.argv[3]) if len(sys.argv) > 3 else 16                    # worker-thread count

# One shared Session gives connection pooling and a stable User-Agent.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; pricer/1.0)"
})
timeout = (5, 15)  # (connect, read) seconds; passed to every session.get()

######## TODO: detect NEW products since the last run?
# (unimplemented idea, kept verbatim below)
'''
hcksum working_links.txt? maybe on re-run make a checksum, then init script generate working links then chck old_chksum against new_chksum?
'''
  56.  
  57. def clean_money(s: str) -> str:
  58.     # keep digits and dot; remove $ and commas and odd whitespace
  59.     s = s.strip()
  60.     s = re.sub(r"[^\d.]", "", s)
  61.     # normalize to at most 2 decimals if it looks like money
  62.     if s.count(".") > 1:
  63.         # weird cases; keep first dot only
  64.         parts = s.split(".")
  65.         s = parts[0] + "." + "".join(parts[1:])
  66.     return s
  67.  
  68. def parse_one(url: str):
  69.     try:
  70.         r = session.get(url, timeout=timeout, allow_redirects=True)
  71.         r.raise_for_status()
  72.         soup = BeautifulSoup(r.text, "html.parser")
  73.  
  74.         # Title/description
  75.         title = ""
  76.         h1 = soup.select_one("h1.products_name") or soup.select_one("h1")
  77.         if h1:
  78.             title = " ".join(h1.get_text(" ", strip=True).split())
  79.  
  80.         # Price
  81.         price = ""
  82.         price_el = soup.select_one('#prod-price [itemprop="price"]') or soup.select_one('[itemprop="price"]')
  83.         if price_el:
  84.             # Prefer structured "content" attribute
  85.             if price_el.has_attr("content") and price_el["content"].strip():
  86.                 price = clean_money(price_el["content"])
  87.             else:
  88.                 price = clean_money(price_el.get_text(" ", strip=True))
  89.  
  90.         return (url, title, price, "")
  91.     except Exception as e:
  92.         # Return with an error message; price empty so you can spot issues
  93.         return (url, "", "", f"{type(e).__name__}: {e}")
  94.  
  95. def main():
  96.     urls = [u.strip() for u in INPUT.read_text().splitlines() if u.strip()]
  97.     with OUT.open("w", newline="", encoding="utf-8") as f:
  98.         w = csv.writer(f)
  99.         w.writerow(["url", "title", "price", "error"])  # keep error column for debugging
  100.         with cf.ThreadPoolExecutor(max_workers=CONCURRENCY) as ex:
  101.             for row in ex.map(parse_one, urls, chunksize=20):
  102.                 w.writerow(row)
  103.  
  104. if __name__ == "__main__":
  105.     main()
  106.  
Advertisement
Add Comment
Please, Sign In to add comment