Advertisement
incomestreamsurfer

Extract product information from a Shopify site

Nov 30th, 2023
797
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.26 KB | None | 0 0
  1. import requests
  2. import re
  3. import json
  4. from bs4 import BeautifulSoup
  5.  
  6. def get_html(url):
  7. try:
  8. response = requests.get(url)
  9. response.raise_for_status()
  10. return response.text
  11. except requests.RequestException as e:
  12. print(f"Error fetching {url}: {e}")
  13. return None
  14.  
  15. def extract_product_info_from_script(script_content):
  16. product_info = []
  17.  
  18. # Find the JSON data in the script
  19. json_data_match = re.search(r'webPixelsManagerAPI\.publish\("collection_viewed", ({"collection".*?})\);', script_content)
  20. if json_data_match:
  21. json_data = json_data_match.group(1)
  22. data = json.loads(json_data)
  23.  
  24. # Extract product details
  25. for variant in data["collection"]["productVariants"]:
  26. image_url = f"https:{variant['image']['src']}"
  27. product_url = f"https://2men.it{variant['product']['url']}"
  28. product_title = variant['product']['title']
  29.  
  30. # Extracting product description
  31. product_page_html = get_html(product_url)
  32. if product_page_html:
  33. soup = BeautifulSoup(product_page_html, 'html.parser')
  34. description_div = soup.find("div", class_="product__description rte quick-add-hidden")
  35. product_description = description_div.get_text(strip=True)[:400] if description_div else "Description not available"
  36. else:
  37. product_description = "Description not available"
  38.  
  39. product_info.append({
  40. "title": product_title,
  41. "url": product_url,
  42. "image_url": image_url,
  43. "description": product_description
  44. })
  45. return product_info
  46.  
  47. # URL of the category page
  48. category_url = "https://2men.it/collections/loafers"
  49.  
  50. # Fetch the HTML content of the category page
  51. category_page_html = get_html(category_url)
  52.  
  53. # Extract the script content
  54. script_content_match = re.search(r'<script id="web-pixels-manager-setup">(.+?)</script>', category_page_html, re.DOTALL)
  55. script_content = script_content_match.group(1) if script_content_match else ""
  56.  
  57. # Extract product info from the script
  58. product_info = extract_product_info_from_script(script_content)
  59.  
  60. for info in product_info:
  61. print(info)
  62.  
  63.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement