video

import xml.etree.ElementTree as ET
from tqdm import tqdm

def _child_text(parent, path, namespaces):
    """Return the text of the first child of *parent* matching *path*.

    Returns None when the child element is absent or has no text, so callers
    can test a single value instead of dereferencing a possibly-missing node.
    """
    child = parent.find(path, namespaces)
    if child is None or not child.text:
        return None
    return child.text


def extract_sitemap_data(xml_file_path, include_terms, exclude_terms, num_urls=30):
    """Collect (page URL, image URL) pairs whose image title matches the terms.

    Each ``<url>`` entry directly under the sitemap root is examined in
    document order. Only the first ``<image:image>`` child of an entry is
    considered. An entry is kept when its image title contains at least one of
    *include_terms* and none of *exclude_terms*; matching is a
    case-insensitive substring test.

    Entries that lack a page ``<loc>``, an ``<image:image>``, an image title,
    or an image ``<loc>`` are skipped rather than aborting the whole scan.

    Args:
        xml_file_path: Path (or file object) of the sitemap, passed to
            ``ET.parse``.
        include_terms: Iterable of strings; at least one must occur in the
            image title. If it is empty, nothing matches.
        exclude_terms: Iterable of strings; none may occur in the image title.
        num_urls: Maximum number of pairs to return.

    Returns:
        A list of ``(page_url, image_url)`` tuples, at most *num_urls* long.

    Raises:
        Whatever ``ET.parse`` raises for an unreadable or malformed file.
    """
    namespaces = {
        'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
        'image': 'http://www.google.com/schemas/sitemap-image/1.1'
    }

    # Normalise the terms once instead of re-lowercasing them for every URL.
    wanted = [term.lower() for term in include_terms]
    unwanted = [term.lower() for term in exclude_terms]

    root = ET.parse(xml_file_path).getroot()

    filtered_urls = []
    for url in tqdm(root.findall('ns:url', namespaces), desc="Processing URLs"):
        if len(filtered_urls) >= num_urls:
            break  # Stop processing once the desired number of URLs are found

        image = url.find('image:image', namespaces)
        if image is None:
            continue

        # Any of these can be missing or empty in real sitemaps; an incomplete
        # entry cannot be matched or reported, so it is skipped.
        loc = _child_text(url, 'ns:loc', namespaces)
        image_title = _child_text(image, 'image:title', namespaces)
        image_loc = _child_text(image, 'image:loc', namespaces)
        if loc is None or image_title is None or image_loc is None:
            continue

        image_title = image_title.lower()
        if not any(term in image_title for term in wanted):
            continue
        if any(term in image_title for term in unwanted):
            continue

        filtered_urls.append((loc, image_loc))

    return filtered_urls

def main():
    """Filter the sample sitemap with fixed settings and print every match.

    The sitemap path and the include/exclude terms are hard-coded below. Each
    matching entry is printed as its page URL followed by its image URL.
    """
    source_path = 'sitemap_products_1 (18).xml'  # Replace with your XML file path
    must_contain = ['sneakers']  # Image title needs at least one of these
    must_not_contain = ['faux', 't-shirt', 'short']  # ...and none of these

    matches = extract_sitemap_data(source_path, must_contain, must_not_contain)

    for page_url, image_url in tqdm(matches, desc="Displaying Filtered URLs"):
        print(f"URL: {page_url}\nImage URL: {image_url}\n")

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()