Very basic scraper

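"""Very basic scraper: breadth-first crawl of same-domain links starting
from a base URL, counting HTTP status codes along the way and refreshing
cookies every refresh_interval pages, up to max_pages pages."""
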
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from collections import defaultdict

# Initialize variables
base_url = "https://example.com"
visited_urls = set()
urls_to_visit = [base_url]
status_codes = defaultdict(int)
max_pages = 1000
refresh_interval = 100  # Refresh cookies every 100 pages

# Custom headers and initial cookies
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Alt-Used': 'example.com',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Priority': 'u=0, i',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'TE': 'trailers',
}
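# Note: the headers above mimic a desktop Firefox request (User-Agent,
# Sec-Fetch-*, Alt-Used) so the crawler looks like a normal browser;
# sites often serve different content, or block requests entirely, when
# they see a bare script user agent.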

# Function to refresh cookies by re-fetching the base URL
def refresh_cookies():
    response = requests.get(base_url, headers=headers, timeout=10)
    return response.cookies

# Initialize cookies
cookies = refresh_cookies()
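
# Alternative (an untested sketch, not part of the original script):
# requests.Session persists cookies across requests automatically, which
# would make the manual refresh above unnecessary:
#
#   session = requests.Session()
#   session.headers.update(headers)
#   response = session.get(current_url, timeout=10)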

# Helper function to check if a URL is within the same domain
def is_same_domain(url, base):
    return urlparse(url).netloc == urlparse(base).netloc
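# Note: netloc compares the host (and port), so "blog.example.com" and
# "example.com" count as different domains here.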

# Main loop
while urls_to_visit and len(visited_urls) < max_pages:
    current_url = urls_to_visit.pop(0)

    # Refresh cookies periodically (skip the very first iteration,
    # since cookies were just initialized)
    if visited_urls and len(visited_urls) % refresh_interval == 0:
        cookies = refresh_cookies()
        print(f"Cookies refreshed after visiting {len(visited_urls)} pages.")

    # Strip the fragment from the URL
    current_url, _ = urldefrag(current_url)

    # Skip if already visited
    if current_url in visited_urls:
        continue

    try:
        # Fetch the page with custom headers and cookies
        response = requests.get(current_url, headers=headers, cookies=cookies, timeout=10)
        status_codes[response.status_code] += 1

        # Only proceed if the request was successful
        if response.status_code == 200:
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all links on the page, resolving relative hrefs
            # against the current page and dropping fragments
            for link in soup.find_all('a', href=True):
                absolute_url, _ = urldefrag(urljoin(current_url, link['href']))

                # Skip URLs that contain a query string
                if '?' in absolute_url:
                    continue

                # Queue same-domain URLs that have not been visited yet
                if is_same_domain(absolute_url, base_url) and absolute_url not in visited_urls:
                    urls_to_visit.append(absolute_url)

        # Mark the URL as visited
        visited_urls.add(current_url)

        # Print progress
        print(f"Visited {len(visited_urls)} pages. Currently visiting: {current_url}")

    except requests.RequestException as e:
        # Handle any request errors
        status_codes['error'] += 1
        print(f"Error fetching {current_url}: {e}")

# Output the results
print("Status codes encountered:")
for code, count in status_codes.items():
    print(f"{code}: {count}")

print(f"Total pages visited: {len(visited_urls)}")
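
# Possible extension (a sketch, not part of the original script): pause
# between requests so the crawl does not hammer the target server, e.g.:
#
#   import time
#   time.sleep(0.5)  # at the end of each loop iteration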