import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from collections import defaultdict

# Initialize crawl state
base_url = "https://example.com"
visited_urls = set()
urls_to_visit = [base_url]
status_codes = defaultdict(int)
max_pages = 1000
refresh_interval = 100  # Refresh cookies every 100 pages

# Custom browser-like request headers (cookies are initialized below).
# Note: requests only decodes br/zstd response bodies if the brotli/zstandard
# packages are installed; drop them from Accept-Encoding otherwise.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:128.0) Gecko/20100101 Firefox/128.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Alt-Used': 'example.com',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Priority': 'u=0, i',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'TE': 'trailers'
}
# Fetch the base URL to pick up a fresh set of session cookies
def refresh_cookies():
    # A timeout keeps the crawler from hanging on an unresponsive server
    response = requests.get(base_url, headers=headers, timeout=10)
    return response.cookies

# Initialize cookies
cookies = refresh_cookies()

# Helper to check whether a URL is on the same domain as the base URL
def is_same_domain(url, base):
    return urlparse(url).netloc == urlparse(base).netloc
# Main loop: breadth-first crawl, capped at max_pages
while urls_to_visit and len(visited_urls) < max_pages:
    current_url = urls_to_visit.pop(0)

    # Refresh cookies periodically
    if len(visited_urls) % refresh_interval == 0:
        cookies = refresh_cookies()
        print(f"Cookies refreshed after visiting {len(visited_urls)} pages.")

    # Strip the fragment so /page and /page#section count as one URL
    current_url, _ = urldefrag(current_url)

    # Skip if already visited
    if current_url in visited_urls:
        continue

    try:
        # Fetch the page with custom headers and cookies
        response = requests.get(current_url, headers=headers, cookies=cookies, timeout=10)
        status_codes[response.status_code] += 1

        # Only extract links if the request was successful
        if response.status_code == 200:
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all links on the page, resolving relative hrefs
            # against the current page rather than the site root
            for link in soup.find_all('a', href=True):
                absolute_url, _ = urldefrag(urljoin(current_url, link['href']))

                # Skip URLs that contain a query string
                if '?' in absolute_url:
                    continue

                # Queue same-domain URLs that have not been visited yet
                if is_same_domain(absolute_url, base_url) and absolute_url not in visited_urls:
                    urls_to_visit.append(absolute_url)

        # Mark the URL as visited
        visited_urls.add(current_url)

        # Print progress
        print(f"Visited {len(visited_urls)} pages. Currently visiting: {current_url}")
    except requests.RequestException as e:
        # Count and report request errors
        status_codes['error'] += 1
        print(f"Error fetching {current_url}: {e}")
# Output the results
print("Status codes encountered:")
for code, count in status_codes.items():
    print(f"{code}: {count}")
print(f"Total pages visited: {len(visited_urls)}")