Advertisement
incomestreamsurfer

sitemap product image scraper

Nov 18th, 2023
1,273
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.26 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2. import random
  3.  
  4. def extract_sitemap_data(xml_file_path):
  5. # Parse the XML file
  6. tree = ET.parse(xml_file_path)
  7. root = tree.getroot()
  8.  
  9. # Namespace handling
  10. namespaces = {
  11. 'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
  12. 'image': 'http://www.google.com/schemas/sitemap-image/1.1'
  13. }
  14.  
  15. # Extracting product information
  16. products = []
  17. for url in root.findall('ns:url', namespaces):
  18. loc = url.find('ns:loc', namespaces).text
  19. image = url.find('image:image', namespaces)
  20. if image is not None:
  21. image_loc = image.find('image:loc', namespaces).text
  22. image_title = image.find('image:title', namespaces).text
  23. products.append((loc, image_loc, image_title))
  24.  
  25. # Select 30 random entries
  26. selected_products = random.sample(products, min(30, len(products)))
  27. return selected_products
  28.  
  29. def main():
  30. xml_file_path = 'sitemap_products_1.xml' # Replace with your XML file path
  31. random_entries = extract_sitemap_data(xml_file_path)
  32.  
  33. # Display the selected entries
  34. for entry in random_entries:
  35. print(f"URL: {entry[0]}\nImage URL: {entry[1]}\nTitle: {entry[2]}\n")
  36.  
  37. if __name__ == "__main__":
  38. main()
  39.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement