Advertisement
incomestreamsurfer

hi

May 29th, 2024
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.11 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import csv
  4. from urllib.parse import urljoin
  5. import random
  6.  
# Browser-like request headers so target sites are less likely to reject
# the scraper as a bot (sent with every requests.get below).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9'
}
  12.  
  13. def get_sitemap_urls(sitemap_url):
  14. try:
  15. response = requests.get(sitemap_url, headers=HEADERS)
  16. response.raise_for_status() # Raise an HTTPError for bad responses
  17. soup = BeautifulSoup(response.text, 'xml')
  18. urls = []
  19.  
  20. # Check for sitemap index
  21. sitemap_tags = soup.find_all('sitemap')
  22. if sitemap_tags:
  23. for sitemap in sitemap_tags:
  24. sitemap_loc = sitemap.find('loc').text.strip()
  25. urls.extend(get_sitemap_urls(sitemap_loc))
  26. else:
  27. # Extract URLs from the sitemap
  28. url_tags = soup.find_all('url')
  29. for url_tag in url_tags:
  30. loc_tag = url_tag.find('loc')
  31. if loc_tag:
  32. url = loc_tag.text.strip()
  33. if 'facebook' not in url:
  34. urls.append(url)
  35.  
  36. return urls
  37. except requests.RequestException as e:
  38. print(f"Error fetching sitemap: {e}")
  39. return []
  40.  
  41. def find_images(url):
  42. try:
  43. response = requests.get(url, headers=HEADERS)
  44. response.raise_for_status() # Raise an HTTPError for bad responses
  45. soup = BeautifulSoup(response.text, 'html.parser')
  46. img_tags = soup.find_all('img')
  47. image_urls = []
  48. for img in img_tags:
  49. src = img.get('src')
  50. if src and not is_excluded(src) and not is_facebook_link(src):
  51. image_urls.append(src)
  52. return random.sample(image_urls, min(3, len(image_urls)))
  53. except requests.RequestException as e:
  54. print(f"Error fetching images from {url}: {e}")
  55. return []
  56. except Exception as e:
  57. print(f"Unexpected error while processing {url}: {e}")
  58. return []
  59.  
  60. def is_excluded(url):
  61. excluded_keywords = ['logo', 'icon', 'avatar', 'profile', 'button', 'social']
  62. url_lower = url.lower()
  63. return any(keyword in url_lower for keyword in excluded_keywords)
  64.  
  65. def is_facebook_link(url):
  66. return 'facebook' in url.lower()
  67.  
  68. def main(sitemap_url, output_file):
  69. urls = get_sitemap_urls(sitemap_url)
  70. with open(output_file, 'w', newline='') as csvfile:
  71. writer = csv.writer(csvfile)
  72. writer.writerow(['Page URL', 'Image URL'])
  73. for url in urls:
  74. images = find_images(url)
  75. for image_url in images:
  76. absolute_url = urljoin(url, image_url)
  77. writer.writerow([url, absolute_url])
  78. print(f"Image URLs and their corresponding page URLs have been saved to {output_file}")
  79.  
  80. if __name__ == '__main__':
  81. sitemap_url = 'https://braidsbylaures.com/sitemap_index.xml'
  82. output_file = 'image_urls.csv'
  83. main(sitemap_url, output_file)
  84.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement