Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # media_downloader.py
- import requests
- from bs4 import BeautifulSoup
- import os
- import urllib.parse
- import re
- from urllib.parse import urljoin
- from concurrent.futures import ThreadPoolExecutor
- import threading
- import hashlib
- import tkinter as tk
- from tkinter import filedialog, ttk, scrolledtext
- from queue import Queue
- import time
def sanitize_filename(filename):
    """Return *filename* made safe for the local filesystem.

    Path separators become underscores, then every character outside
    [A-Za-z0-9_- .] is stripped.
    """
    flattened = filename.replace('/', '_')
    return re.sub(r'[^\w\-_\. ]', '', flattened)
def calculate_file_hash(filepath):
    """Compute the SHA-256 hex digest of the file at *filepath*.

    Reads in 8 KiB chunks so arbitrarily large files never have to fit
    in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def calculate_data_hash(data):
    """Compute the SHA-256 hex digest of *data*, an iterable of byte chunks."""
    return hashlib.sha256(b"".join(data)).hexdigest()
def download_file(url, folder, filename, log_queue, progress_queue, total_files):
    """Download *url* into *folder* as *filename*, deduplicating by hash.

    If a file with the target name already exists, its SHA-256 hash is
    compared against the downloaded payload: identical content is skipped
    (writing the same bytes again is a no-op), differing content gets a
    numbered suffix (name_1.ext, name_2.ext, ...).

    Args:
        url: Direct link to the media file.
        folder: Destination directory (must already exist).
        filename: Suggested file name; sanitized before use.
        log_queue: Queue receiving human-readable status strings.
        progress_queue: Queue receiving one 1/total_files fraction per file.
        total_files: Total number of files in this batch.
    """
    thread_name = threading.current_thread().name
    try:
        # timeout prevents a dead server from hanging this worker forever.
        response = requests.get(url, stream=True, timeout=30, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        if response.status_code != 200:
            log_queue.put(f"[{thread_name}] Failed to download: {url} (Status: {response.status_code})")
            return
        filename = sanitize_filename(filename)
        # Buffer the payload so it can be hashed before deciding whether
        # (and where) to write it.
        chunks = [chunk for chunk in response.iter_content(chunk_size=8192) if chunk]
        downloaded_hash = calculate_data_hash(chunks)
        filepath = os.path.join(folder, filename)
        base, ext = os.path.splitext(filename)
        counter = 0
        # Walk name, name_1, name_2, ... until we hit a duplicate of this
        # exact content (skip) or a free slot (write).
        while os.path.exists(filepath):
            if calculate_file_hash(filepath) == downloaded_hash:
                log_queue.put(f"[{thread_name}] Skipped (identical file exists): {filepath}")
                return
            counter += 1
            filepath = os.path.join(folder, f"{base}_{counter}{ext}")
        with open(filepath, 'wb') as f:
            f.writelines(chunks)
        log_queue.put(f"[{thread_name}] Downloaded: {filepath}")
    except Exception as e:
        log_queue.put(f"[{thread_name}] Error downloading {url}: {str(e)}")
    finally:
        # Exactly one progress tick per file, on every code path.
        progress_queue.put(1 / total_files)
def _collect_media_url(media_urls, absolute_url, media_extensions):
    """Add (url, basename) to *media_urls* if the URL looks like a media file."""
    if any(ext in absolute_url.lower() for ext in media_extensions):
        media_urls.add((absolute_url, os.path.basename(urllib.parse.urlparse(absolute_url).path)))


def get_media_files(url, folder, log_queue, progress_queue):
    """Scrape *url* for video links and download them into a subfolder of *folder*.

    Scans <a href> and <source src> tags for common video extensions,
    creates an output directory named after the page title (or URL path),
    and fans downloads out over a thread pool. Status strings go to
    *log_queue*; per-file progress fractions go to *progress_queue*.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # timeout prevents an unresponsive site from hanging the worker thread.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        media_extensions = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv'}
        media_urls = set()

        # Both tag kinds are checked against the resolved absolute URL so the
        # extension test behaves identically for relative and absolute links.
        for link in soup.find_all('a', href=True):
            _collect_media_url(media_urls, urljoin(url, link['href']), media_extensions)
        for source in soup.find_all('source'):
            src = source.attrs.get('src')
            if src:
                _collect_media_url(media_urls, urljoin(url, src), media_extensions)

        if not media_urls:
            log_queue.put("No media files found on the page.")
            return

        # Name the output directory after the page title, falling back to the
        # last URL path segment, capped at 50 chars to avoid filesystem limits.
        title = soup.find('title')
        if title and title.text.strip():
            folder_name = sanitize_filename(title.text.strip()[:50])
        else:
            parsed_url = urllib.parse.urlparse(url)
            folder_name = sanitize_filename(parsed_url.path.split('/')[-1] or 'media_download')
        output_path = os.path.join(folder, folder_name)
        os.makedirs(output_path, exist_ok=True)

        # Twice the CPU cores is a reasonable ceiling for I/O-bound downloads.
        cpu_count = os.cpu_count() or 4  # cpu_count() may return None
        max_workers = min(len(media_urls), cpu_count * 2)
        log_queue.put(f"Starting download of {len(media_urls)} files with {max_workers} threads...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download_file, media_url, output_path, filename,
                                log_queue, progress_queue, len(media_urls))
                for media_url, filename in media_urls
            ]
            # Propagate any worker exception and block until all finish.
            for future in futures:
                future.result()
        log_queue.put(f"Download complete! Files saved in: {output_path}")
    except requests.RequestException as e:
        log_queue.put(f"Error accessing the webpage: {str(e)}")
    except Exception as e:
        log_queue.put(f"An error occurred: {str(e)}")
class MediaDownloaderApp:
    """Tkinter front-end wrapping get_media_files with a log pane and progress bar.

    Worker threads never touch widgets directly: they push strings onto
    log_queue and progress fractions onto progress_queue, which the GUI
    thread drains every 100 ms in update_gui().
    """

    def __init__(self, root):
        self.root = root
        self.root.title("Media Downloader")
        self.root.geometry("600x400")

        # Cross-thread communication channels (GUI thread is the only consumer).
        self.log_queue = Queue()
        self.progress_queue = Queue()
        self.download_thread = None
        self.running = False

        # URL input
        tk.Label(root, text="URL:").grid(row=0, column=0, padx=5, pady=5, sticky="e")
        self.url_entry = tk.Entry(root, width=50)
        self.url_entry.grid(row=0, column=1, padx=5, pady=5, columnspan=2)

        # Folder selection
        tk.Label(root, text="Save to:").grid(row=1, column=0, padx=5, pady=5, sticky="e")
        self.folder_entry = tk.Entry(root, width=40)
        self.folder_entry.grid(row=1, column=1, padx=5, pady=5)
        self.folder_entry.insert(0, os.getcwd())
        tk.Button(root, text="Browse", command=self.browse_folder).grid(row=1, column=2, padx=5, pady=5)

        # Progress bar
        self.progress = ttk.Progressbar(root, length=400, mode='determinate')
        self.progress.grid(row=2, column=0, columnspan=3, padx=5, pady=5)

        # Log area (read-only; toggled to 'normal' only while inserting)
        self.log_text = scrolledtext.ScrolledText(root, height=15, width=60, state='disabled')
        self.log_text.grid(row=3, column=0, columnspan=3, padx=5, pady=5)

        # Buttons
        tk.Button(root, text="Start Download", command=self.start_download).grid(row=4, column=0, padx=5, pady=5)
        tk.Button(root, text="Clear Log", command=self.clear_log).grid(row=4, column=1, padx=5, pady=5)
        tk.Button(root, text="Quit", command=self.quit).grid(row=4, column=2, padx=5, pady=5)

        # Kick off the periodic queue-draining loop.
        self.update_gui()

    def browse_folder(self):
        """Let the user pick the destination directory via a dialog."""
        folder = filedialog.askdirectory()
        if folder:
            self.folder_entry.delete(0, tk.END)
            self.folder_entry.insert(0, folder)

    def start_download(self):
        """Validate inputs and launch the scraper on a background thread."""
        if self.running:
            self.log("Download already in progress.")
            return
        url = self.url_entry.get().strip()
        if not url:
            self.log("Please enter a URL.")
            return
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        folder = self.folder_entry.get().strip()
        if not folder:
            self.log("Please select a save directory.")
            return
        self.running = True
        self.progress['value'] = 0
        self.download_thread = threading.Thread(
            target=get_media_files,
            args=(url, folder, self.log_queue, self.progress_queue),
        )
        self.download_thread.start()
        self.log(f"Processing {url}...")

    def log(self, message):
        """Queue *message* for display in the log pane (safe from any thread)."""
        self.log_queue.put(message)

    def update_gui(self):
        """Drain the log and progress queues; reschedules itself every 100 ms."""
        while not self.log_queue.empty():
            message = self.log_queue.get()
            self.log_text.config(state='normal')
            self.log_text.insert(tk.END, message + "\n")
            self.log_text.see(tk.END)
            self.log_text.config(state='disabled')
        while not self.progress_queue.empty():
            increment = self.progress_queue.get()
            # Workers report fractions of the batch; the bar runs 0-100.
            self.progress['value'] = min(100, self.progress['value'] + increment * 100)
            if self.progress['value'] >= 99.9:  # tolerance for float rounding
                self.running = False
        self.root.after(100, self.update_gui)

    def clear_log(self):
        """Erase the contents of the log pane."""
        self.log_text.config(state='normal')
        self.log_text.delete(1.0, tk.END)
        self.log_text.config(state='disabled')

    def quit(self):
        """Close the window, waiting for any active download without freezing the UI.

        A blocking join() here would stall the Tk event loop, so the
        'Waiting...' message would never render; instead we poll the worker
        thread with after() and destroy the window once it has exited.
        """
        self.running = False
        if self.download_thread and self.download_thread.is_alive():
            self.log("Waiting for downloads to complete...")
            self._destroy_when_done()
        else:
            self.root.destroy()

    def _destroy_when_done(self):
        """Destroy the window as soon as the download thread has finished."""
        if self.download_thread.is_alive():
            self.root.after(100, self._destroy_when_done)
        else:
            self.root.destroy()
def main():
    """Create the root window, attach the app, and enter the Tk event loop."""
    window = tk.Tk()
    MediaDownloaderApp(window)
    window.mainloop()
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment