"""Extract product information from a Shopify site."""

import requests
import re
import json
from bs4 import BeautifulSoup

def get_html(url, timeout=10):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without a timeout a stalled
            server would block the script indefinitely.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (connection problem, timeout, or an error status reported by
        ``raise_for_status``). Failures are printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the same
        # error path as connection failures.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def _fetch_product_description(product_url, max_length=400):
    """Return the description text of a product page, cut to *max_length* chars.

    Falls back to ``"Description not available"`` when the page cannot be
    downloaded or does not contain the expected description ``div``.
    """
    product_page_html = get_html(product_url)
    if not product_page_html:
        return "Description not available"

    soup = BeautifulSoup(product_page_html, 'html.parser')
    description_div = soup.find("div", class_="product__description rte quick-add-hidden")
    if not description_div:
        return "Description not available"
    return description_div.get_text(strip=True)[:max_length]


def extract_product_info_from_script(script_content, base_url="https://2men.it"):
    """Build a list of product records from a "collection_viewed" script.

    The script text is expected to contain a call of the form
    ``webPixelsManagerAPI.publish("collection_viewed", {"collection": ...});``.
    The JSON object passed to that call is parsed, and one record is produced
    for every entry of ``collection.productVariants``. Each product page is
    downloaded to obtain its description.

    Args:
        script_content: Text of the inline script to scan.
        base_url: Origin used to turn the relative product URLs and
            protocol-relative image URLs found in the JSON into absolute ones.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"``, ``"image_url"``
        and ``"description"``. ``"image_url"`` is ``None`` for a variant that
        has no image. The list is empty when the publish call is not found,
        its JSON cannot be parsed, or it lists no variants.
    """
    from urllib.parse import urljoin

    # Only locate where the JSON starts; the lookahead keeps the original
    # requirement that the payload begins with {"collection".
    marker = re.search(
        r'webPixelsManagerAPI\.publish\("collection_viewed", (?={"collection")',
        script_content,
    )
    if not marker:
        return []

    # Let the JSON decoder find the end of the object. A regex such as
    # `.*?}\);` stops at the first "});" even when that sequence sits inside
    # a string value, and cannot cross line breaks.
    try:
        data, _ = json.JSONDecoder().raw_decode(script_content[marker.end():])
    except json.JSONDecodeError as e:
        print(f"Error parsing collection data: {e}")
        return []

    collection = data.get("collection") or {}
    product_info = []
    for variant in collection.get("productVariants") or []:
        # A variant may carry no image; report None instead of crashing.
        image_src = (variant.get("image") or {}).get("src")
        image_url = urljoin(base_url, image_src) if image_src else None
        product_url = urljoin(base_url, variant['product']['url'])

        product_info.append({
            "title": variant['product']['title'],
            "url": product_url,
            "image_url": image_url,
            "description": _fetch_product_description(product_url),
        })
    return product_info

# URL of the category page
category_url = "https://2men.it/collections/loafers"

# Fetch the HTML content of the category page (None when the request failed)
category_page_html = get_html(category_url)

# Extract the content of the inline "web-pixels-manager-setup" script.
# The search is skipped when the download failed, because re.search raises
# TypeError when given None instead of a string.
script_content_match = (
    re.search(r'<script id="web-pixels-manager-setup">(.+?)</script>', category_page_html, re.DOTALL)
    if category_page_html
    else None
)
script_content = script_content_match.group(1) if script_content_match else ""

# Extract product info from the script (empty list when nothing was found)
product_info = extract_product_info_from_script(script_content)

# Print one record per line
for info in product_info:
    print(info)