Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import xml.etree.ElementTree as ET
- from tqdm import tqdm
def extract_sitemap_data(xml_file_path, include_terms, exclude_terms, num_urls=30):
    """Collect (page URL, image URL) pairs from a sitemap, filtered by image title.

    The sitemap at ``xml_file_path`` is parsed and its ``<url>`` entries are
    scanned in document order. An entry is kept when the title of its first
    ``<image:image>`` child contains at least one of ``include_terms`` and none
    of ``exclude_terms`` (case-insensitive substring match).

    Entries that have no image, or that lack a page ``<loc>``, an
    ``<image:title>`` or an ``<image:loc>`` (missing or empty), are skipped
    instead of aborting the whole scan with ``AttributeError``.

    Args:
        xml_file_path: Path (or file object) accepted by ``ET.parse``.
        include_terms: Iterable of strings; at least one must occur in the title.
        exclude_terms: Iterable of strings; none may occur in the title.
        num_urls: Maximum number of pairs to return; scanning stops once reached.

    Returns:
        A list of ``(page_url, image_url)`` string tuples, at most ``num_urls`` long.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    namespaces = {
        'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
        'image': 'http://www.google.com/schemas/sitemap-image/1.1',
    }

    # Normalise the search terms once instead of once per URL entry.
    wanted = [term.lower() for term in include_terms]
    unwanted = [term.lower() for term in exclude_terms]

    root = ET.parse(xml_file_path).getroot()

    filtered_urls = []
    for url in tqdm(root.findall('ns:url', namespaces), desc="Processing URLs"):
        if len(filtered_urls) >= num_urls:
            break  # Stop processing once the desired number of URLs are found

        # Only the first image of each entry is inspected.
        image = url.find('image:image', namespaces)
        if image is None:
            continue

        # findtext yields None for a missing child and '' for an empty one,
        # so a single truthiness check covers both incomplete cases.
        loc = url.findtext('ns:loc', namespaces=namespaces)
        image_title = image.findtext('image:title', namespaces=namespaces)
        image_loc = image.findtext('image:loc', namespaces=namespaces)
        if not (loc and image_title and image_loc):
            continue

        title = image_title.lower()
        if any(term in title for term in wanted) and not any(
            term in title for term in unwanted
        ):
            filtered_urls.append((loc, image_loc))

    return filtered_urls
def main():
    """Print the page and image URLs of sitemap entries matching fixed terms.

    Reads a hard-coded sitemap file, keeps entries whose image title contains
    'sneakers' but none of the excluded terms, and prints each resulting
    (page URL, image URL) pair to standard output.
    """
    sitemap_path = 'sitemap_products_1 (18).xml'  # Replace with your XML file path
    must_contain = ['sneakers']
    must_not_contain = ['faux', 't-shirt', 'short']

    matches = extract_sitemap_data(sitemap_path, must_contain, must_not_contain)

    for page_url, image_url in tqdm(matches, desc="Displaying Filtered URLs"):
        print(f"URL: {page_url}\nImage URL: {image_url}\n")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement