Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # scrape prices from adafruit.com — aka "why can't I sort by price?"
- # we love adafruit, but having no price sorting is nonsense.
- import csv, sys, re, concurrent.futures as cf
- from pathlib import Path
- import requests
- from bs4 import BeautifulSoup
- # the original process was partly bash, but that's not very portable
- import subprocess
# ---------------------------------------------------------------------------
# Phase 1: discover which product URLs are live.
# The id range below is only relevant to adafruit (four-digit product ids).
START = 1000
END = 9999
BASE = "https://www.adafruit.com/product"
OUTPUT = "working_links.txt"
def check_url(id_, base, file_handle):
    """Probe one candidate product URL and record it if it responds OK.

    Fix: the original shelled out to an external ``curl`` binary, which
    defeats the script's own portability goal and requires curl to be
    installed; this version uses ``requests`` (already a dependency of
    this script) with the same semantics as the old
    ``curl -sSfL --connect-timeout 5 --max-time 15 --retry 2`` call.

    Parameters:
        id_:          numeric product id appended to the base URL
        base:         base URL, e.g. "https://www.adafruit.com/product"
        file_handle:  writable text file; live URLs are appended one per line

    Side effects: prints the URL and appends it to ``file_handle`` on
    success; failures are silent (matching the original curl behavior).
    """
    url = f"{base}/{id_}"
    # curl --retry 2 means up to 3 attempts total; mirror that here.
    for _attempt in range(3):
        try:
            r = requests.get(
                url,
                timeout=(5, 15),        # (connect, read) like --connect-timeout/--max-time
                allow_redirects=True,   # like -L
                stream=True,            # headers are enough to judge liveness...
            )
            r.close()                   # ...so don't download the body (old: -o /dev/null)
            if r.ok:                    # like -f: status < 400 counts as success
                print(url)
                file_handle.write(url + "\n")
            return
        except requests.RequestException:
            continue  # transient failure: retry, then give up silently
# Open the results file once in append mode so earlier runs are preserved,
# then probe every candidate product id in the scan range.
with open(OUTPUT, "a") as link_file:
    for product_id in range(START, END + 1):
        check_url(product_id, BASE, link_file)
- #################### Parse the return data
- INPUT = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("working_links.txt")
- OUT = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("products.csv")
- CONCURRENCY = int(sys.argv[3]) if len(sys.argv) > 3 else 16
- session = requests.Session()
- session.headers.update({
- "User-Agent": "Mozilla/5.0 (compatible; pricer/1.0)"
- })
- timeout = (5, 15) # connect, read
- ######## NEW PRODUCTS since the last run?
- '''
- TODO: checksum working_links.txt? On a re-run, have the init step regenerate
- the working links, then compare the old checksum against the new checksum to
- detect newly added products.
- '''
def clean_money(s: str) -> str:
    """Reduce a price string like ``"$1,234.56"`` to bare ``"1234.56"``.

    Strips everything except digits and dots ($, commas, currency words,
    whitespace).  If more than one dot survives (malformed input such as
    ``"1.2.3"``), only the first dot is kept and the remaining digits are
    concatenated after it (``"1.23"``).

    Fix: the original comment claimed the result was normalized "to at
    most 2 decimals", which the code never did — decimals are preserved
    as-is.  Also dropped the redundant ``strip()`` (the regex already
    removes whitespace).

    Returns "" when the input contains no digits or dots at all.
    """
    s = re.sub(r"[^\d.]", "", s)
    if s.count(".") > 1:
        # malformed: keep the first dot only, fuse the rest of the digits
        head, _, tail = s.partition(".")
        s = head + "." + tail.replace(".", "")
    return s
def parse_one(url: str):
    """Fetch one product page and return a ``(url, title, price, error)`` row.

    On success the fourth field is ""; on any failure title/price come back
    empty and the fourth field carries "ExceptionName: message" so bad rows
    are easy to spot in the CSV.
    """
    try:
        resp = session.get(url, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()
        page = BeautifulSoup(resp.text, "html.parser")

        # Title: prefer the product-name heading, fall back to any <h1>;
        # collapse runs of whitespace into single spaces.
        heading = page.select_one("h1.products_name") or page.select_one("h1")
        title = " ".join(heading.get_text(" ", strip=True).split()) if heading else ""

        # Price: look for itemprop="price" markup, scoped to #prod-price first.
        # Prefer the structured "content" attribute over the visible text.
        price = ""
        tag = page.select_one('#prod-price [itemprop="price"]') or page.select_one('[itemprop="price"]')
        if tag:
            attr_val = tag["content"] if tag.has_attr("content") else ""
            if attr_val.strip():
                price = clean_money(attr_val)
            else:
                price = clean_money(tag.get_text(" ", strip=True))

        return (url, title, price, "")
    except Exception as exc:
        # Keep the row but flag it; an empty price marks it as suspect.
        return (url, "", "", f"{type(exc).__name__}: {exc}")
def main():
    """Read the link list, scrape every product concurrently, write the CSV."""
    urls = [line.strip() for line in INPUT.read_text().splitlines() if line.strip()]
    with OUT.open("w", newline="", encoding="utf-8") as out_fh:
        writer = csv.writer(out_fh)
        # keep the error column so failed fetches stay visible for debugging
        writer.writerow(["url", "title", "price", "error"])
        with cf.ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
            writer.writerows(pool.map(parse_one, urls, chunksize=20))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment