WEB-SCRAPING-ALL-ELEMENTS2.py
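A Tkinter GUI scraper: it fetches a URL with requests, parses the page with BeautifulSoup, and extracts whichever element types are ticked (full HTML, headings, paragraphs, CSS links, tables, links, file links, and M3U/M3U8 playlists), with a Save Result button that writes the output to a text file. Requires the third-party packages requests and beautifulsoup4 (pip install requests beautifulsoup4).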
import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
from tkinter import filedialog
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

class WebScraperGUI:
    def __init__(self, root):
        self.root = root
        self.root.title("Najeeb All Web Scraper")
        self.root.configure(bg="#336699")
        self.root.geometry("900x660")

        # Apply a theme
        self.style = ttk.Style()
        self.style.theme_use('clam')  # You can change 'clam' to other available themes

        # URL Entry
        self.url_label = ttk.Label(root, text="Enter URL:")
        self.url_label.grid(column=0, row=0, sticky=tk.W)
        self.url_entry = ttk.Entry(root, width=120)
        self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W)

        # Options
        self.options_label = ttk.Label(root, text="Select Options:")
        self.options_label.grid(column=0, row=1, sticky=tk.W)

        # Checkboxes
        self.check_var_html = tk.BooleanVar()
        self.check_var_heading = tk.BooleanVar()
        self.check_var_paragraph = tk.BooleanVar()
        self.check_var_css = tk.BooleanVar()
        self.check_var_table = tk.BooleanVar()
        self.check_var_links = tk.BooleanVar()
        self.check_var_files = tk.BooleanVar()
        self.check_var_m3u = tk.BooleanVar()

        self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
        self.html_check.grid(column=1, row=1, sticky=tk.W)

        self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
        self.heading_check.grid(column=2, row=1, sticky=tk.W)

        self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
        self.paragraph_check.grid(column=3, row=1, sticky=tk.W)

        self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
        self.css_check.grid(column=4, row=1, sticky=tk.W)

        self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
        self.table_check.grid(column=1, row=2, sticky=tk.W)

        self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
        self.links_check.grid(column=2, row=2, sticky=tk.W)

        self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
        self.files_check.grid(column=3, row=2, sticky=tk.W)

        self.m3u_check = ttk.Checkbutton(root, text="M3U/M3U8 Files", variable=self.check_var_m3u)
        self.m3u_check.grid(column=4, row=2, sticky=tk.W)

        # Scrape Button
        self.scrape_button = ttk.Button(root, text="SCRAPE", command=self.scrape)
        self.scrape_button.grid(column=4, row=6, columnspan=8, pady=5)

        # Save Result Button
        self.save_result_button = ttk.Button(root, text="Save Result", command=self.save_result, style='Red.TButton')
        self.save_result_button.grid(column=0, row=6, columnspan=8, pady=5)

        # Result Text Field
        self.result_label = ttk.Label(root, text="Scraped Content of Websites:")
        self.result_label.grid(column=0, row=4, sticky=tk.W)

        self.result_text = scrolledtext.ScrolledText(root, width=110, height=33, wrap=tk.WORD)
        self.result_text.grid(column=0, row=5, columnspan=5)

        # Define style for the "Save Result" button
        self.style.configure('Red.TButton', foreground='red')

    def scrape(self):
        url = self.url_entry.get().strip()
        if not url:
            return

        options = {
            'html': self.check_var_html.get(),
            'heading': self.check_var_heading.get(),
            'paragraph': self.check_var_paragraph.get(),
            'css': self.check_var_css.get(),
            'table': self.check_var_table.get(),
            'links': self.check_var_links.get(),
            'files': self.check_var_files.get(),
            'm3u': self.check_var_m3u.get()
        }

        # Fetch the page; report network errors in the text box instead of crashing the GUI
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            self.result_text.delete("1.0", tk.END)
            self.result_text.insert(tk.END, f"Failed to fetch {url}: {e}\n")
            return
        soup = BeautifulSoup(response.content, 'html.parser')

        result = ""
        if options['html']:
            result += str(soup) + '\n\n'

        if options['heading']:
            # A compiled regex as the tag-name filter matches h1 through h6
            headings = soup.find_all(re.compile('^h[1-6]$'))
            for heading in headings:
                result += heading.text + '\n'
            result += '\n'

        if options['paragraph']:
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                result += paragraph.text + '\n'
            result += '\n'

        if options['css']:
            # .get() skips any stylesheet <link> without an href instead of raising KeyError
            css_links = [link.get('href') for link in soup.find_all('link', rel='stylesheet') if link.get('href')]
            result += "CSS Links:\n"
            for css_link in css_links:
                full_url = urljoin(url, css_link)
                result += full_url + '\n'
            result += '\n'

        if options['table']:
            tables = soup.find_all('table')
            result += "Tables:\n"
            for table in tables:
                result += str(table) + '\n'
            result += '\n'

        if options['links']:
            links = soup.find_all('a', href=True)
            result += "Links:\n"
            for link in links:
                # urljoin leaves absolute URLs untouched, so one call covers both cases
                full_url = urljoin(url, link['href'])
                result += f"Text: {link.text}, URL: {full_url}\n"
            result += '\n'

        if options['files']:
            # Treat any href ending in a dotted extension as a file link
            # (href=True guarantees the attribute, so the old try/except AttributeError was dead code)
            file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.[^.]+$', link['href'])]
            result += "File Links:\n"
            for file_link in file_links:
                full_url = urljoin(url, file_link)
                result += full_url + '\n'
            result += '\n'

        if options['m3u']:
            # Anchors whose visible text ends in .m3u or .m3u8
            # (string= replaces the deprecated text= keyword in bs4)
            m3u_links = soup.find_all('a', href=True, string=re.compile(r'.*\.m3u8?$'))
            result += "M3U/M3U8 Files:\n"
            for m3u_link in m3u_links:
                # Resolve the href unconditionally; the original only set full_url for
                # relative links, leaving it undefined for absolute ones
                full_url = urljoin(url, m3u_link['href'])
                result += f"URL: {full_url}\n"

                try:
                    playlist = requests.get(full_url, timeout=10)
                except requests.RequestException:
                    result += f"Failed to fetch M3U/M3U8 content from {full_url}\n"
                    continue
                if playlist.ok:
                    for line in playlist.text.splitlines():
                        if line.startswith('#EXTINF'):
                            # #EXTINF:<duration>,<title> -- keep the human-readable title
                            result += f"Segment: {line.split(',')[-1]}\n"
                        elif line and not line.startswith('#'):
                            result += f"URL: {line}\n"
                else:
                    result += f"Failed to fetch M3U/M3U8 content from {full_url}\n"
            result += '\n'

        self.result_text.delete("1.0", tk.END)
        self.result_text.insert(tk.END, result)

    def save_result(self):
        result_text = self.result_text.get("1.0", tk.END)
        if not result_text.strip():
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
        if file_path:
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(result_text)


def main():
    root = tk.Tk()
    app = WebScraperGUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()
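
For reference, the playlist handling in scrape() follows the basic M3U layout: each #EXTINF:<duration>,<title> line describes the entry whose URL appears on the next non-comment line. A minimal standalone sketch of that parsing, using a made-up sample playlist (the example.com URLs are placeholders, not from the original script):

# Standalone sketch of the #EXTINF parsing used in scrape() above.
SAMPLE = """#EXTM3U
#EXTINF:-1,Channel One
http://example.com/stream1.m3u8
#EXTINF:-1,Channel Two
http://example.com/stream2.m3u8
"""

for line in SAMPLE.splitlines():
    if line.startswith('#EXTINF'):
        print("Segment:", line.split(',')[-1])  # title text after the comma
    elif line and not line.startswith('#'):
        print("URL:", line)  # the stream URL itself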