WEB-SCRAPING-ALL-ELEMENTS2.py
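A Tkinter GUI scraper: it fetches a URL with requests, parses the page with BeautifulSoup, and extracts whichever element types are ticked (full HTML, headings, paragraphs, CSS links, tables, links, file links, and M3U/M3U8 playlists), with a Save Result button that writes the output to a text file. Requires the third-party packages requests and beautifulsoup4 (pip install requests beautifulsoup4).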
import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
from tkinter import filedialog
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

class WebScraperGUI:
    def __init__(self, root):
        self.root = root
        self.root.title("Najeeb All Web Scraper")
        self.root.configure(bg="#336699")
        self.root.geometry("900x660")

        # Apply a theme
        self.style = ttk.Style()
        self.style.theme_use('clam')  # You can change 'clam' to other available themes

        # URL Entry
        self.url_label = ttk.Label(root, text="Enter URL:")
        self.url_label.grid(column=0, row=0, sticky=tk.W)
        self.url_entry = ttk.Entry(root, width=120)
        self.url_entry.grid(column=1, row=0, columnspan=4, sticky=tk.W)

        # Options
        self.options_label = ttk.Label(root, text="Select Options:")
        self.options_label.grid(column=0, row=1, sticky=tk.W)

        # Checkboxes
        self.check_var_html = tk.BooleanVar()
        self.check_var_heading = tk.BooleanVar()
        self.check_var_paragraph = tk.BooleanVar()
        self.check_var_css = tk.BooleanVar()
        self.check_var_table = tk.BooleanVar()
        self.check_var_links = tk.BooleanVar()
        self.check_var_files = tk.BooleanVar()
        self.check_var_m3u = tk.BooleanVar()

        self.html_check = ttk.Checkbutton(root, text="Full HTML", variable=self.check_var_html)
        self.html_check.grid(column=1, row=1, sticky=tk.W)

        self.heading_check = ttk.Checkbutton(root, text="Headings", variable=self.check_var_heading)
        self.heading_check.grid(column=2, row=1, sticky=tk.W)

        self.paragraph_check = ttk.Checkbutton(root, text="Paragraphs", variable=self.check_var_paragraph)
        self.paragraph_check.grid(column=3, row=1, sticky=tk.W)

        self.css_check = ttk.Checkbutton(root, text="CSS", variable=self.check_var_css)
        self.css_check.grid(column=4, row=1, sticky=tk.W)

        self.table_check = ttk.Checkbutton(root, text="Tables", variable=self.check_var_table)
        self.table_check.grid(column=1, row=2, sticky=tk.W)

        self.links_check = ttk.Checkbutton(root, text="Links", variable=self.check_var_links)
        self.links_check.grid(column=2, row=2, sticky=tk.W)

        self.files_check = ttk.Checkbutton(root, text="Files", variable=self.check_var_files)
        self.files_check.grid(column=3, row=2, sticky=tk.W)

        self.m3u_check = ttk.Checkbutton(root, text="M3U/M3U8 Files", variable=self.check_var_m3u)
        self.m3u_check.grid(column=4, row=2, sticky=tk.W)

        # Scrape Button
        self.scrape_button = ttk.Button(root, text="SCRAPE", command=self.scrape)
        self.scrape_button.grid(column=4, row=6, columnspan=8, pady=5)

        # Save Result Button
        self.save_result_button = ttk.Button(root, text="Save Result", command=self.save_result, style='Red.TButton')
        self.save_result_button.grid(column=0, row=6, columnspan=8, pady=5)

        # Result Text Field
        self.result_label = ttk.Label(root, text="Scraped Content of Websites:")
        self.result_label.grid(column=0, row=4, sticky=tk.W)

        self.result_text = scrolledtext.ScrolledText(root, width=110, height=33, wrap=tk.WORD)
        self.result_text.grid(column=0, row=5, columnspan=5)

        # Define style for the "Save Result" button
        self.style.configure('Red.TButton', foreground='red')

    def scrape(self):
        url = self.url_entry.get().strip()
        if not url:
            return

        options = {
            'html': self.check_var_html.get(),
            'heading': self.check_var_heading.get(),
            'paragraph': self.check_var_paragraph.get(),
            'css': self.check_var_css.get(),
            'table': self.check_var_table.get(),
            'links': self.check_var_links.get(),
            'files': self.check_var_files.get(),
            'm3u': self.check_var_m3u.get()
        }

        # Fetch the page; report network errors in the text box instead of crashing the GUI
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            self.result_text.delete("1.0", tk.END)
            self.result_text.insert(tk.END, f"Failed to fetch {url}: {e}\n")
            return
        soup = BeautifulSoup(response.content, 'html.parser')

        result = ""
        if options['html']:
            result += str(soup) + '\n\n'

        if options['heading']:
            # A compiled regex as the tag-name filter matches h1 through h6
            headings = soup.find_all(re.compile('^h[1-6]$'))
            for heading in headings:
                result += heading.text + '\n'
            result += '\n'

        if options['paragraph']:
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                result += paragraph.text + '\n'
            result += '\n'

        if options['css']:
            # .get() skips any stylesheet <link> without an href instead of raising KeyError
            css_links = [link.get('href') for link in soup.find_all('link', rel='stylesheet') if link.get('href')]
            result += "CSS Links:\n"
            for css_link in css_links:
                full_url = urljoin(url, css_link)
                result += full_url + '\n'
            result += '\n'

        if options['table']:
            tables = soup.find_all('table')
            result += "Tables:\n"
            for table in tables:
                result += str(table) + '\n'
            result += '\n'

        if options['links']:
            links = soup.find_all('a', href=True)
            result += "Links:\n"
            for link in links:
                # urljoin leaves absolute URLs untouched, so one call covers both cases
                full_url = urljoin(url, link['href'])
                result += f"Text: {link.text}, URL: {full_url}\n"
            result += '\n'

        if options['files']:
            # Treat any href ending in a dotted extension as a file link
            # (href=True guarantees the attribute, so the old try/except AttributeError was dead code)
            file_links = [link['href'] for link in soup.find_all('a', href=True) if re.search(r'\.[^.]+$', link['href'])]
            result += "File Links:\n"
            for file_link in file_links:
                full_url = urljoin(url, file_link)
                result += full_url + '\n'
            result += '\n'

        if options['m3u']:
            # Anchors whose visible text ends in .m3u or .m3u8
            # (string= replaces the deprecated text= keyword in bs4)
            m3u_links = soup.find_all('a', href=True, string=re.compile(r'.*\.m3u8?$'))
            result += "M3U/M3U8 Files:\n"
            for m3u_link in m3u_links:
                # Resolve the href unconditionally; the original only set full_url for
                # relative links, leaving it undefined for absolute ones
                full_url = urljoin(url, m3u_link['href'])
                result += f"URL: {full_url}\n"

                try:
                    playlist = requests.get(full_url, timeout=10)
                except requests.RequestException:
                    result += f"Failed to fetch M3U/M3U8 content from {full_url}\n"
                    continue
                if playlist.ok:
                    for line in playlist.text.splitlines():
                        if line.startswith('#EXTINF'):
                            # #EXTINF:<duration>,<title> -- keep the human-readable title
                            result += f"Segment: {line.split(',')[-1]}\n"
                        elif line and not line.startswith('#'):
                            result += f"URL: {line}\n"
                else:
                    result += f"Failed to fetch M3U/M3U8 content from {full_url}\n"
            result += '\n'

        self.result_text.delete("1.0", tk.END)
        self.result_text.insert(tk.END, result)

    def save_result(self):
        result_text = self.result_text.get("1.0", tk.END)
        if not result_text.strip():
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
        if file_path:
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(result_text)


def main():
    root = tk.Tk()
    app = WebScraperGUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()
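
For reference, the playlist handling in scrape() follows the basic M3U layout: each #EXTINF:<duration>,<title> line describes the entry whose URL appears on the next non-comment line. A minimal standalone sketch of that parsing, using a made-up sample playlist (the example.com URLs are placeholders, not from the original script):

# Standalone sketch of the #EXTINF parsing used in scrape() above.
SAMPLE = """#EXTM3U
#EXTINF:-1,Channel One
http://example.com/stream1.m3u8
#EXTINF:-1,Channel Two
http://example.com/stream2.m3u8
"""

for line in SAMPLE.splitlines():
    if line.startswith('#EXTINF'):
        print("Segment:", line.split(',')[-1])  # title text after the comma
    elif line and not line.startswith('#'):
        print("URL:", line)  # the stream URL itself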