OpticalAlgorithm

Python Web Scraper

Dec 26th, 2024
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import threading
import sys
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import re


class WebsiteCrawler:
    def __init__(self, base_url, output_dir="downloaded_site", max_depth=3):
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        self.output_dir = output_dir
        self.visited_urls = set()
        self.rate_limit = 1  # Delay between requests in seconds
        self.max_depth = max_depth
        self.stop_requested = False  # Flag to control crawling

        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    def stop(self):
        """Request the crawler to stop after the current page"""
        self.stop_requested = True
        print("\nStop requested. Finishing current page...")

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain and is a webpage."""
        try:
            parsed = urlparse(url)

            # Skip any image files
            image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp', '.ico')
            if url.lower().endswith(image_extensions):
                print(f"Skipping image file: {url}")
                return False

            # Skip Wikipedia special pages and non-article pages
            skip_patterns = [
                '/wiki/Wikipedia:',
                '/wiki/File:',
                '/wiki/Help:',
                '/wiki/Special:',
                '/wiki/Talk:',
                '/wiki/User:',
                '/wiki/Template:',
                '/wiki/Category:',
                '/wiki/Portal:',
                'action=',
                'oldid=',
                'diff=',
                'printable=',
                'mobileaction='
            ]

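            # For example, '/wiki/Special:Random' or '/wiki/Talk:Python' (hypothetical
            # URLs) would match a pattern above and be skipped, while an ordinary
            # article URL such as '/wiki/Web_crawler' would not.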
            # Check if this is a Wikipedia URL and if so, apply special rules
            if 'wikipedia.org' in parsed.netloc:
                # Skip special pages but allow all regular article pages
                if any(pattern in url for pattern in skip_patterns):
                    print(f"Skipping Wikipedia special page: {url}")
                    return False

                # Make sure it's a wiki article page
                if '/wiki/' not in url:
                    print(f"Skipping non-article page: {url}")
                    return False

                # Allow all regular Wikipedia articles
                if '/wiki/' in url and parsed.netloc == self.domain:
                    return True

            # For non-Wikipedia sites, use standard validation
            is_valid = (
                    parsed.netloc == self.domain and
                    parsed.scheme in ['http', 'https'] and
                    not url.endswith(('.pdf', '.zip', '.doc', '.docx'))
            )

            if not is_valid:
                print(f"URL rejected: {url}")

            return is_valid

        except Exception as e:
            print(f"Error parsing URL {url}: {str(e)}")
            return False

    def clean_filename(self, url):
        """Convert URL to a valid filename."""
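        # Illustrative mapping (hypothetical URLs):
        #   https://en.wikipedia.org/wiki/Web_crawler -> wiki_Web_crawler.html
        #   https://en.wikipedia.org/                 -> index.html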
        # Remove the domain and scheme
        filename = urlparse(url).path
        if not filename or filename.endswith('/'):
            filename += 'index.html'
        elif not filename.endswith('.html'):
            filename += '.html'

        # Drop the leading slash, then replace characters that are invalid in filenames
        filename = filename.lstrip('/')
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        return filename

    def download_page(self, url):
        """Download a webpage and return its content."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            print(f"\nAttempting to download: {url}")
            response = requests.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            print(f"Download successful! Status code: {response.status_code}")
            print(f"Content length: {len(response.text)} characters")
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url}: {str(e)}")
            return None

    def save_page(self, content, url):
        """Save webpage content to a file and rewrite links to point to local files."""
        if content:
            filename = self.clean_filename(url)
            filepath = os.path.join(self.output_dir, filename)

            print(f"\nSaving page to: {filepath}")

            try:
                # Parse the HTML
                soup = BeautifulSoup(content, 'html.parser')

                # Remove all image elements
                for img in soup.find_all('img'):
                    img.decompose()

                # Remove all picture elements
                for picture in soup.find_all('picture'):
                    picture.decompose()

                # Remove figure elements that are left empty of text after image removal
                for figure in soup.find_all('figure'):
                    if not figure.find(string=True, recursive=False):
                        figure.decompose()

                # Rewrite links to point to local files
                for anchor in soup.find_all('a', href=True):
                    href = anchor['href']
                    absolute_url = urljoin(url, href)

                    if absolute_url in self.visited_urls:
                        # Convert the absolute URL to a local path
                        local_path = self.clean_filename(absolute_url)
                        # Make the path relative to the current file
                        relative_path = os.path.relpath(local_path, os.path.dirname(filename))
                        anchor['href'] = relative_path
                        print(f"Rewriting link: {href} -> {relative_path}")

                # Create subdirectories if needed
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                print("Directory structure created/verified")

                # Save the modified HTML
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(str(soup))
                print(f"Successfully saved modified HTML to {filename}")

            except Exception as e:
                print(f"Error saving {filename}: {str(e)}")
                print(f"Full path attempted: {os.path.abspath(filepath)}")

    def extract_links(self, content, url):
        """Extract all valid links from a webpage."""
        soup = BeautifulSoup(content, 'html.parser')
        links = set()

        print(f"\nExtracting links from {url}")
        link_count = 0

        for anchor in soup.find_all('a', href=True):
            link = urljoin(url, anchor['href'])
            if self.is_valid_url(link):
                links.add(link)
                link_count += 1

        print(f"Found {link_count} valid links on this page")
        return links

    def crawl(self):
        """Start the crawling process."""
        # Reset stop flag
        self.stop_requested = False

        # The queue holds (url, depth) tuples
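        # Breadth-first traversal: the start page is depth 0, and links found on a
        # page at depth d are queued at depth d + 1; pages at depth >= max_depth
        # are never processed.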
        queue = [(self.base_url, 0)]
        pages_processed = 0
        start_time = time.time()

        print(f"\nStarting crawl of {self.base_url}")
        print(f"Maximum depth: {self.max_depth}")

        while queue and not self.stop_requested:
            url, depth = queue.pop(0)

            if url in self.visited_urls or depth >= self.max_depth:
                continue

            pages_processed += 1
            print(f"\n--- Processing page {pages_processed} ---")
            print(f"URL: {url}")
            print(f"Depth: {depth}/{self.max_depth}")
            print(f"Queue size: {len(queue)}")

            self.visited_urls.add(url)

            # Download the page
            content = self.download_page(url)
            if content:
                # Save the page
                self.save_page(content, url)

                # Only add new links if we haven't reached max_depth
                if depth < self.max_depth - 1:
                    # Extract and add new links to the queue with incremented depth
                    new_links = self.extract_links(content, url)
                    queue.extend([(link, depth + 1) for link in new_links if link not in self.visited_urls])

                # Rate limiting
                if queue and not self.stop_requested:
                    print(f"Waiting {self.rate_limit} seconds before next page...")
                    time.sleep(self.rate_limit)

        elapsed_time = time.time() - start_time
        print("\nCrawling completed!")
        if self.stop_requested:
            print("Crawling was stopped by user")
        print(f"Total pages processed: {pages_processed}")
        print(f"Total unique URLs visited: {len(self.visited_urls)}")
        print(f"Total time: {elapsed_time:.2f} seconds")


class WebCrawlerGUI:
    def __init__(self, root):
        self.root = root
        self.root.title("Web Crawler")
        self.root.geometry("600x500")
        self.crawler = None  # Initialize crawler reference

        # Create main frame
        main_frame = ttk.Frame(root, padding="10")
        main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

        # URL input
        ttk.Label(main_frame, text="URL to crawl:").grid(row=0, column=0, sticky=tk.W, pady=5)
        self.url_var = tk.StringVar()
        self.url_entry = ttk.Entry(main_frame, textvariable=self.url_var, width=50)
        self.url_entry.grid(row=0, column=1, columnspan=2, sticky=(tk.W, tk.E), pady=5)

        # Output directory
        ttk.Label(main_frame, text="Output directory:").grid(row=1, column=0, sticky=tk.W, pady=5)
        self.output_var = tk.StringVar(value=os.path.join(os.getcwd(), "downloaded_site"))
        self.output_entry = ttk.Entry(main_frame, textvariable=self.output_var, width=50)
        self.output_entry.grid(row=1, column=1, sticky=(tk.W, tk.E), pady=5)
        ttk.Button(main_frame, text="Browse", command=self.browse_output).grid(row=1, column=2, sticky=tk.W, pady=5, padx=5)

        # Depth input
        ttk.Label(main_frame, text="Maximum depth:").grid(row=2, column=0, sticky=tk.W, pady=5)
        self.depth_var = tk.StringVar(value="3")
        depth_entry = ttk.Entry(main_frame, textvariable=self.depth_var, width=10)
        depth_entry.grid(row=2, column=1, sticky=tk.W, pady=5)

        # Progress frame
        progress_frame = ttk.LabelFrame(main_frame, text="Progress", padding="5")
        progress_frame.grid(row=3, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=10)

        # Progress status label
        self.progress_var = tk.StringVar(value="Ready")
        ttk.Label(progress_frame, textvariable=self.progress_var).grid(row=0, column=0, sticky=tk.W)

        # Log text area
        self.log_text = tk.Text(main_frame, height=15, width=60, wrap=tk.WORD)
        self.log_text.grid(row=4, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=5)

        # Scrollbar for log
        scrollbar = ttk.Scrollbar(main_frame, orient=tk.VERTICAL, command=self.log_text.yview)
        scrollbar.grid(row=4, column=3, sticky=(tk.N, tk.S))
        self.log_text.configure(yscrollcommand=scrollbar.set)

        # Buttons frame
        button_frame = ttk.Frame(main_frame)
        button_frame.grid(row=5, column=0, columnspan=3, pady=10)

        # Start button
        self.start_button = ttk.Button(button_frame, text="Start Crawling", command=self.start_crawling)
        self.start_button.pack(side=tk.LEFT, padx=5)

        # Stop button (initially disabled)
        self.stop_button = ttk.Button(button_frame, text="Stop", command=self.stop_crawling, state='disabled')
        self.stop_button.pack(side=tk.LEFT, padx=5)

        # Configure grid weights
        main_frame.columnconfigure(1, weight=1)

        # Redirect stdout to our log
        sys.stdout = self
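        # Note: the crawler's print() calls arrive in write() below from a worker
        # thread (see start_crawling). Tkinter is not guaranteed to be thread-safe,
        # so a queue polled from the main loop would be a more robust design.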

    def write(self, text):
        """Handle stdout redirection"""
        self.log_text.insert(tk.END, text)
        self.log_text.see(tk.END)
        self.root.update_idletasks()

    def flush(self):
        """Required for stdout redirection"""
        pass

    def browse_output(self):
        """Open directory browser"""
        directory = filedialog.askdirectory(initialdir=self.output_var.get())
        if directory:
            self.output_var.set(directory)

    def stop_crawling(self):
        """Stop the crawling process"""
        self.stop_button.configure(state='disabled')
        self.progress_var.set("Stopping...")
        self.crawler.stop()  # Request the crawler to stop

    def start_crawling(self):
        """Start the crawling process"""
        # Validate inputs
        url = self.url_var.get().strip()
        output_dir = self.output_var.get().strip()

        try:
            depth = int(self.depth_var.get())
            if depth < 1:
                raise ValueError("Depth must be at least 1")
        except ValueError:
            messagebox.showerror("Error", "Invalid depth value. Please enter a positive number.")
            return

        if not url:
            messagebox.showerror("Error", "Please enter a URL")
            return

        if not url.startswith(('http://', 'https://')):
            messagebox.showerror("Error", "URL must start with http:// or https://")
            return

        # Disable inputs while crawling
        self.start_button.configure(state='disabled')
        self.url_entry.configure(state='disabled')
        self.output_entry.configure(state='disabled')
        self.progress_var.set("Crawling...")

        # Clear log
        self.log_text.delete(1.0, tk.END)

        # Start crawling in a separate thread
        def crawl_thread():
            try:
                self.crawler = WebsiteCrawler(url, output_dir, depth)
                self.stop_button.configure(state='normal')  # Enable stop button
                self.crawler.crawl()
                if self.crawler.stop_requested:
                    self.root.after(0, self.crawling_finished, True, "Crawling stopped by user")
                else:
                    self.root.after(0, self.crawling_finished, True)
            except Exception as e:
                self.root.after(0, self.crawling_finished, False, str(e))

        threading.Thread(target=crawl_thread, daemon=True).start()

    def crawling_finished(self, success, error_message=None):
        """Called when crawling is complete"""
        # Re-enable inputs
        self.start_button.configure(state='normal')
        self.url_entry.configure(state='normal')
        self.output_entry.configure(state='normal')

        # Disable stop button
        self.stop_button.configure(state='disabled')

        if success:
            if error_message and "stopped by user" in error_message:
                self.progress_var.set("Crawling stopped")
                messagebox.showinfo("Stopped", "Website crawling was stopped by user")
            else:
                self.progress_var.set("Crawling completed!")
                messagebox.showinfo("Success", "Website crawling completed successfully!")
        else:
            self.progress_var.set("Error occurred!")
            messagebox.showerror("Error", f"An error occurred while crawling:\n{error_message}")


def main():
    root = tk.Tk()
    app = WebCrawlerGUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()
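
The WebsiteCrawler class does not depend on the Tkinter GUI above, so it can also be run headless from another script or an interactive session. A minimal sketch, assuming the code above is saved as web_scraper.py (the module name, start URL, and output directory are placeholders):

    # headless_example.py -- illustrative sketch; module name and URL are placeholders
    from web_scraper import WebsiteCrawler

    crawler = WebsiteCrawler(
        "https://en.wikipedia.org/wiki/Web_crawler",  # start page
        output_dir="offline_copy",                    # where the rewritten .html files are saved
        max_depth=2,                                  # crawl the start page plus one level of links
    )
    crawler.crawl()  # blocks until the queue is empty or crawler.stop() is called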