Advertisement
incomestreamsurfer

video

Jan 3rd, 2024
902
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.59 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2. from tqdm import tqdm
  3.  
  def _child_text(parent, path, namespaces):
      """Return the text of the first child matching *path*, or None if absent or empty."""
      node = parent.find(path, namespaces)
      if node is None or not node.text:
          return None
      return node.text


  def extract_sitemap_data(xml_file_path, include_terms, exclude_terms, num_urls=30):
      """Collect page/image URL pairs from a sitemap, filtered by image title.

      Each ``<url>`` entry is matched on the title of its first ``<image:image>``
      child. An entry is kept when the lowercased title contains at least one of
      *include_terms* and none of *exclude_terms* (case-insensitive substring
      match). Scanning stops as soon as *num_urls* matches have been collected.

      Entries without an image, or missing a ``<loc>``, ``<image:title>`` or
      ``<image:loc>`` value, are skipped instead of aborting the scan.

      Args:
          xml_file_path: Path (or file object) of the sitemap XML to parse.
          include_terms: Terms of which at least one must appear in the title.
              An empty collection matches nothing.
          exclude_terms: Terms of which none may appear in the title.
          num_urls: Maximum number of matches to return.

      Returns:
          A list of ``(page_url, image_url)`` tuples in document order.

      Raises:
          OSError: If the file cannot be opened.
          xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
      """
      root = ET.parse(xml_file_path).getroot()

      namespaces = {
          'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
          'image': 'http://www.google.com/schemas/sitemap-image/1.1',
      }

      # Normalise the terms once rather than once per URL.
      wanted = [term.lower() for term in include_terms]
      unwanted = [term.lower() for term in exclude_terms]

      filtered_urls = []
      for url in tqdm(root.findall('ns:url', namespaces), desc="Processing URLs"):
          if len(filtered_urls) >= num_urls:
              break  # Stop processing once the desired number of URLs are found

          image = url.find('image:image', namespaces)
          if image is None:
              continue

          loc = _child_text(url, 'ns:loc', namespaces)
          title = _child_text(image, 'image:title', namespaces)
          if loc is None or title is None:
              # The title is optional in image sitemaps; an untitled image
              # simply cannot match and must not crash the scan.
              continue

          image_title = title.lower()
          if not any(term in image_title for term in wanted):
              continue
          if any(term in image_title for term in unwanted):
              continue

          image_loc = _child_text(image, 'image:loc', namespaces)
          if image_loc is None:
              continue
          filtered_urls.append((loc, image_loc))

      return filtered_urls
  28.  
  def main():
      """Filter the configured sitemap file and print each matching URL pair.

      The sitemap path and the include/exclude terms are hard-coded below;
      edit them to suit your data before running.
      """
      sitemap_path = 'sitemap_products_1 (18).xml'  # Replace with your XML file path
      must_contain = ['sneakers']  # Terms to include
      must_not_contain = ['faux', 't-shirt', 'short']  # Terms to exclude

      matches = extract_sitemap_data(sitemap_path, must_contain, must_not_contain)

      for page_url, image_url in tqdm(matches, desc="Displaying Filtered URLs"):
          print(f"URL: {page_url}\nImage URL: {image_url}\n")
  37.  
  # Run the filter only when this file is executed as a script, not on import.
  if __name__ == "__main__":
      main()
  40.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement