Advertisement
incomestreamsurfer

scraper shopify sitemap

Nov 24th, 2023
656
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.38 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2. import random
  3. import requests
  4. from bs4 import BeautifulSoup
  5. from tqdm import tqdm
  6.  
  7. def extract_product_info(url):
  8. try:
  9. response = requests.get(url, timeout=10)
  10. response.raise_for_status()
  11. soup = BeautifulSoup(response.text, 'html.parser')
  12.  
  13. title_tag = soup.find('h1')
  14. title = title_tag.get_text(strip=True) if title_tag else 'No title available'
  15.  
  16. description_tag = soup.find('div', {'class': 'product__description rte quick-add-hidden'})
  17. description = description_tag.get_text(strip=True) if description_tag else 'No description available'
  18.  
  19. price_tag = soup.find('span', {'class': 'price-item--regular'})
  20. price = price_tag.get_text(strip=True) if price_tag else 'No price available'
  21.  
  22. return title, description, price
  23. except Exception as e:
  24. return 'Failed to retrieve product info'
  25.  
  26. def extract_sitemap_data(xml_file_path, search_terms, num_urls=30):
  27. tree = ET.parse(xml_file_path)
  28. root = tree.getroot()
  29.  
  30. namespaces = {
  31. 'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
  32. 'image': 'http://www.google.com/schemas/sitemap-image/1.1'
  33. }
  34.  
  35. products = []
  36. for url in tqdm(root.findall('ns:url', namespaces), desc="Processing URLs"):
  37. if len(products) >= num_urls:
  38. break # Stop processing once 20 valid URLs are found
  39.  
  40. loc = url.find('ns:loc', namespaces).text
  41. image = url.find('image:image', namespaces)
  42. if image is not None:
  43. image_loc = image.find('image:loc', namespaces).text
  44. image_title = image.find('image:title', namespaces).text.lower()
  45. if any(term.lower() in image_title for term in search_terms):
  46. title, description, price = extract_product_info(loc)
  47. products.append((loc, image_loc, title, description, price))
  48.  
  49. return products
  50.  
  51. def main():
  52. xml_file_path = 'sitemap_products_20.xml' # Replace with your XML file path
  53. search_terms = ['Jeans'] # Add your search terms here
  54. random_entries = extract_sitemap_data(xml_file_path, search_terms)
  55.  
  56. for entry in tqdm(random_entries, desc="Displaying Products"):
  57. print(f"URL: {entry[0]}\nImage URL: {entry[1]}\nTitle: {entry[2]}\nDescription: {entry[3]}\nPrice: {entry[4]}\n")
  58.  
  59. if __name__ == "__main__":
  60. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement