import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import random

# Browser-like headers so the requests are less likely to be blocked.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9'
}

def get_sitemap_urls(sitemap_url):
    """Recursively collect page URLs from a sitemap or sitemap index."""
    try:
        response = requests.get(sitemap_url, headers=HEADERS)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        # The 'xml' parser requires the lxml package to be installed.
        soup = BeautifulSoup(response.text, 'xml')
        urls = []
        # A sitemap index nests child sitemaps in <sitemap> tags; recurse into each.
        sitemap_tags = soup.find_all('sitemap')
        if sitemap_tags:
            for sitemap in sitemap_tags:
                loc_tag = sitemap.find('loc')
                if loc_tag:
                    urls.extend(get_sitemap_urls(loc_tag.text.strip()))
        else:
            # A plain sitemap lists page URLs in <url><loc> entries.
            url_tags = soup.find_all('url')
            for url_tag in url_tags:
                loc_tag = url_tag.find('loc')
                if loc_tag:
                    url = loc_tag.text.strip()
                    if 'facebook' not in url:
                        urls.append(url)
        return urls
    except requests.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return []

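# Example (hypothetical URL): whether the argument is a plain sitemap or a
# sitemap index, the recursion above always returns one flat list of page URLs:
#
#     pages = get_sitemap_urls('https://example.com/sitemap_index.xml')
#     print(len(pages), pages[:3])
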
def find_images(url):
    """Return up to three randomly sampled image URLs found on the page."""
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')
        image_urls = []
        for img in img_tags:
            src = img.get('src')
            if src and not is_excluded(src) and not is_facebook_link(src):
                image_urls.append(src)
        # Sample at most three images per page.
        return random.sample(image_urls, min(3, len(image_urls)))
    except requests.RequestException as e:
        print(f"Error fetching images from {url}: {e}")
        return []
    except Exception as e:
        print(f"Unexpected error while processing {url}: {e}")
        return []

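# Note: lazy-loading pages often put the real image URL in data-src or srcset
# rather than src, so the src-only lookup above can miss them. A minimal
# fallback sketch (which attribute to probe varies by site):
#
#     src = img.get('src') or img.get('data-src')
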
def is_excluded(url):
    """Skip decorative images such as logos, icons, and avatars."""
    excluded_keywords = ['logo', 'icon', 'avatar', 'profile', 'button', 'social']
    url_lower = url.lower()
    return any(keyword in url_lower for keyword in excluded_keywords)

def is_facebook_link(url):
    return 'facebook' in url.lower()

def main(sitemap_url, output_file):
    urls = get_sitemap_urls(sitemap_url)
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Page URL', 'Image URL'])
        for url in urls:
            images = find_images(url)
            for image_url in images:
                # Resolve relative src values against the page URL.
                absolute_url = urljoin(url, image_url)
                writer.writerow([url, absolute_url])
    print(f"Image URLs and their corresponding page URLs have been saved to {output_file}")

if __name__ == '__main__':
    sitemap_url = 'https://braidsbylaures.com/sitemap_index.xml'
    output_file = 'image_urls.csv'
    main(sitemap_url, output_file)
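
# Dependencies: the script needs the third-party packages requests,
# beautifulsoup4, and lxml (BeautifulSoup's 'xml' parser is backed by lxml):
#
#     pip install requests beautifulsoup4 lxml
#
# A minimal throttling sketch, assuming you want to be gentle on the target
# server (the 1-second delay is an arbitrary choice):
#
#     import time
#     for url in urls:
#         images = find_images(url)
#         time.sleep(1)  # pause between page fetches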