Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import re
- import json
- from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` waits indefinitely on a stalled
            connection, which would hang the whole scraper.

    Returns:
        The response text, or ``None`` when the request fails (connection
        error, timeout, or an HTTP error status). The failure is printed
        rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text
def _fetch_product_description(product_url):
    """Return at most 400 characters of description text from a product page.

    Falls back to ``"Description not available"`` when the page cannot be
    downloaded or contains no description block.
    """
    fallback = "Description not available"
    product_page_html = get_html(product_url)
    if not product_page_html:
        return fallback
    soup = BeautifulSoup(product_page_html, 'html.parser')
    description_div = soup.find("div", class_="product__description rte quick-add-hidden")
    if description_div is None:
        return fallback
    return description_div.get_text(strip=True)[:400]


def extract_product_info_from_script(script_content):
    """Build a list of product records from a "collection_viewed" payload.

    The function looks for the JSON argument of the
    ``webPixelsManagerAPI.publish("collection_viewed", ...)`` call inside
    *script_content*, then emits one record per entry of
    ``collection.productVariants``.

    Args:
        script_content: Text of the script that contains the publish call.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"``,
        ``"image_url"`` and ``"description"``. The list is empty when the
        publish call is not found or its payload is not valid JSON.
    """
    # Find the JSON data in the script
    json_data_match = re.search(r'webPixelsManagerAPI\.publish\("collection_viewed", ({"collection".*?})\);', script_content)
    if not json_data_match:
        return []

    try:
        data = json.loads(json_data_match.group(1))
    except json.JSONDecodeError as e:
        # A payload the regex captured but that does not parse is treated
        # like a missing payload instead of crashing the scraper.
        print(f"Error parsing collection data: {e}")
        return []

    # Several variants normally point at the same product page; remember
    # each page's description (including a failed lookup) so every product
    # URL is downloaded at most once.
    descriptions = {}
    product_info = []
    for variant in data["collection"]["productVariants"]:
        image_url = f"https:{variant['image']['src']}"
        product_url = f"https://2men.it{variant['product']['url']}"
        product_title = variant['product']['title']

        if product_url not in descriptions:
            descriptions[product_url] = _fetch_product_description(product_url)

        product_info.append({
            "title": product_title,
            "url": product_url,
            "image_url": image_url,
            "description": descriptions[product_url],
        })
    return product_info
# URL of the category page
category_url = "https://2men.it/collections/loafers"

# Fetch the HTML content of the category page
category_page_html = get_html(category_url)

# Extract the script content. get_html returns None when the download
# fails; searching an empty string in that case avoids a TypeError from
# re.search and lets the script finish with no products.
script_content_match = re.search(r'<script id="web-pixels-manager-setup">(.+?)</script>', category_page_html or "", re.DOTALL)
script_content = script_content_match.group(1) if script_content_match else ""

# Extract product info from the script
product_info = extract_product_info_from_script(script_content)
for info in product_info:
    print(info)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement