media_downloader.py

# media_downloader.py

import requests
from bs4 import BeautifulSoup
import os
import urllib.parse
import re
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
import threading
import hashlib
import tkinter as tk
from tkinter import filedialog, ttk, scrolledtext
from queue import Queue
import time

def sanitize_filename(filename):
    """Sanitize filename by removing invalid characters."""
    return re.sub(r'[^\w\-_\. ]', '', filename.replace('/', '_'))

def calculate_file_hash(filepath):
    """Calculate SHA-256 hash of a file."""
    sha256 = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b""):
            sha256.update(chunk)
    return sha256.hexdigest()

def calculate_data_hash(data):
    """Calculate SHA-256 hash of data in memory."""
    sha256 = hashlib.sha256()
    for chunk in data:
        sha256.update(chunk)
    return sha256.hexdigest()

def download_file(url, folder, filename, log_queue, progress_queue, total_files):
    """Download a file and save it to the specified folder, overwriting only if hashes match."""
    try:
        response = requests.get(url, stream=True, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        if response.status_code == 200:
            filename = sanitize_filename(filename)
            filepath = os.path.join(folder, filename)

            # Store downloaded chunks in memory for hash calculation
            chunks = []
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    chunks.append(chunk)

            # Calculate hash of downloaded data
            downloaded_hash = calculate_data_hash(chunks)

            # Check if file exists and compare hashes
            if os.path.exists(filepath):
                existing_hash = calculate_file_hash(filepath)
                if existing_hash == downloaded_hash:
                    with open(filepath, 'wb') as f:
                        for chunk in chunks:
                            f.write(chunk)
                    log_queue.put(f"[{threading.current_thread().name}] Overwritten (identical hash): {filepath}")
                    progress_queue.put(1 / total_files)
                    return
                else:
                    base, ext = os.path.splitext(filename)
                    counter = 1
                    while os.path.exists(filepath):
                        new_filename = f"{base}_{counter}{ext}"
                        filepath = os.path.join(folder, new_filename)
                        if os.path.exists(filepath):
                            existing_hash = calculate_file_hash(filepath)
                            if existing_hash == downloaded_hash:
                                with open(filepath, 'wb') as f:
                                    for chunk in chunks:
                                        f.write(chunk)
                                log_queue.put(f"[{threading.current_thread().name}] Overwritten (identical hash): {filepath}")
                                progress_queue.put(1 / total_files)
                                return
                        counter += 1
                    filename = new_filename

            # Write new file
            with open(filepath, 'wb') as f:
                for chunk in chunks:
                    f.write(chunk)
            log_queue.put(f"[{threading.current_thread().name}] Downloaded: {filepath}")
            progress_queue.put(1 / total_files)
        else:
            log_queue.put(f"[{threading.current_thread().name}] Failed to download: {url} (Status: {response.status_code})")
            progress_queue.put(1 / total_files)
    except Exception as e:
        log_queue.put(f"[{threading.current_thread().name}] Error downloading {url}: {str(e)}")
        progress_queue.put(1 / total_files)

def get_media_files(url, folder, log_queue, progress_queue):
    """Main function to scrape and download media files."""
    try:
        # Send request to the webpage
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        # Parse the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all media links
        media_extensions = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv'}
        media_urls = set()

        # Check all <a> tags
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urljoin(url, href)
            if any(ext in href.lower() for ext in media_extensions):
                media_urls.add((absolute_url, os.path.basename(urllib.parse.urlparse(absolute_url).path)))

        # Check all <source> tags (for video elements)
        for source in soup.find_all('source'):
            if 'src' in source.attrs:
                src = urljoin(url, source['src'])
                if any(ext in src.lower() for ext in media_extensions):
                    media_urls.add((src, os.path.basename(urllib.parse.urlparse(src).path)))

        if not media_urls:
            log_queue.put("No media files found on the page.")
            return

        # Create output directory with sanitized name
        title = soup.find('title')
        if title and title.text.strip():
            folder_name = sanitize_filename(title.text.strip()[:50])  # Limit length
        else:
            parsed_url = urllib.parse.urlparse(url)
            folder_name = sanitize_filename(parsed_url.path.split('/')[-1] or 'media_download')

        output_path = os.path.join(folder, folder_name)
        os.makedirs(output_path, exist_ok=True)

        # Download files using ThreadPoolExecutor
        cpu_count = os.cpu_count() or 4  # Fallback to 4 if cpu_count is None
        max_workers = min(len(media_urls), cpu_count * 2)  # Twice the CPU cores or number of files
        log_queue.put(f"Starting download of {len(media_urls)} files with {max_workers} threads...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download_file, url, output_path, filename, log_queue, progress_queue, len(media_urls))
                for url, filename in media_urls
            ]
            # Wait for all downloads to complete
            for future in futures:
                future.result()

        log_queue.put(f"Download complete! Files saved in: {output_path}")

    except requests.RequestException as e:
        log_queue.put(f"Error accessing the webpage: {str(e)}")
    except Exception as e:
        log_queue.put(f"An error occurred: {str(e)}")

class MediaDownloaderApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Media Downloader")
        self.root.geometry("600x400")

        self.log_queue = Queue()
        self.progress_queue = Queue()
        self.download_thread = None
        self.running = False

        # URL input
        tk.Label(root, text="URL:").grid(row=0, column=0, padx=5, pady=5, sticky="e")
        self.url_entry = tk.Entry(root, width=50)
        self.url_entry.grid(row=0, column=1, padx=5, pady=5, columnspan=2)

        # Folder selection
        tk.Label(root, text="Save to:").grid(row=1, column=0, padx=5, pady=5, sticky="e")
        self.folder_entry = tk.Entry(root, width=40)
        self.folder_entry.grid(row=1, column=1, padx=5, pady=5)
        self.folder_entry.insert(0, os.getcwd())
        tk.Button(root, text="Browse", command=self.browse_folder).grid(row=1, column=2, padx=5, pady=5)

        # Progress bar
        self.progress = ttk.Progressbar(root, length=400, mode='determinate')
        self.progress.grid(row=2, column=0, columnspan=3, padx=5, pady=5)

        # Log area
        self.log_text = scrolledtext.ScrolledText(root, height=15, width=60, state='disabled')
        self.log_text.grid(row=3, column=0, columnspan=3, padx=5, pady=5)

        # Buttons
        tk.Button(root, text="Start Download", command=self.start_download).grid(row=4, column=0, padx=5, pady=5)
        tk.Button(root, text="Clear Log", command=self.clear_log).grid(row=4, column=1, padx=5, pady=5)
        tk.Button(root, text="Quit", command=self.quit).grid(row=4, column=2, padx=5, pady=5)

        # Start checking for log and progress updates
        self.update_gui()

    def browse_folder(self):
        folder = filedialog.askdirectory()
        if folder:
            self.folder_entry.delete(0, tk.END)
            self.folder_entry.insert(0, folder)

    def start_download(self):
        if self.running:
            self.log("Download already in progress.")
            return

        url = self.url_entry.get().strip()
        if not url:
            self.log("Please enter a URL.")
            return
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        folder = self.folder_entry.get().strip()
        if not folder:
            self.log("Please select a save directory.")
            return

        self.running = True
        self.progress['value'] = 0
        self.download_thread = threading.Thread(target=get_media_files, args=(url, folder, self.log_queue, self.progress_queue))
        self.download_thread.start()
        self.log(f"Processing {url}...")

    def log(self, message):
        self.log_queue.put(message)

    def update_gui(self):
        """Update the GUI with log messages and progress."""
        while not self.log_queue.empty():
            message = self.log_queue.get()
            self.log_text.config(state='normal')
            self.log_text.insert(tk.END, message + "\n")
            self.log_text.see(tk.END)
            self.log_text.config(state='disabled')

        while not self.progress_queue.empty():
            increment = self.progress_queue.get()
            self.progress['value'] += increment * 100  # Convert fraction to percentage
            if self.progress['value'] >= 99.9:
                self.running = False

        self.root.after(100, self.update_gui)

    def clear_log(self):
        self.log_text.config(state='normal')
        self.log_text.delete(1.0, tk.END)
        self.log_text.config(state='disabled')

    def quit(self):
        self.running = False
        if self.download_thread and self.download_thread.is_alive():
            self.log("Waiting for downloads to complete...")
            self.download_thread.join()
        self.root.destroy()

def main():
    """Launch the Tkinter GUI."""
    root = tk.Tk()
    app = MediaDownloaderApp(root)
    root.mainloop()

if __name__ == "__main__":
    main()