Guest User

media_downloader.py

a guest
Oct 2nd, 2025
50
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.03 KB | None | 0 0
  1. # media_downloader.py
  2.  
  3. import requests
  4. from bs4 import BeautifulSoup
  5. import os
  6. import urllib.parse
  7. import re
  8. from urllib.parse import urljoin
  9. from concurrent.futures import ThreadPoolExecutor
  10. import threading
  11. import hashlib
  12. import tkinter as tk
  13. from tkinter import filedialog, ttk, scrolledtext
  14. from queue import Queue
  15. import time
  16.  
  17. def sanitize_filename(filename):
  18. """Sanitize filename by removing invalid characters."""
  19. return re.sub(r'[^\w\-_\. ]', '', filename.replace('/', '_'))
  20.  
  21. def calculate_file_hash(filepath):
  22. """Calculate SHA-256 hash of a file."""
  23. sha256 = hashlib.sha256()
  24. with open(filepath, 'rb') as f:
  25. for chunk in iter(lambda: f.read(8192), b""):
  26. sha256.update(chunk)
  27. return sha256.hexdigest()
  28.  
  29. def calculate_data_hash(data):
  30. """Calculate SHA-256 hash of data in memory."""
  31. sha256 = hashlib.sha256()
  32. for chunk in data:
  33. sha256.update(chunk)
  34. return sha256.hexdigest()
  35.  
  36. def download_file(url, folder, filename, log_queue, progress_queue, total_files):
  37. """Download a file and save it to the specified folder, overwriting only if hashes match."""
  38. try:
  39. response = requests.get(url, stream=True, headers={
  40. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  41. })
  42. if response.status_code == 200:
  43. filename = sanitize_filename(filename)
  44. filepath = os.path.join(folder, filename)
  45.  
  46. # Store downloaded chunks in memory for hash calculation
  47. chunks = []
  48. for chunk in response.iter_content(chunk_size=8192):
  49. if chunk:
  50. chunks.append(chunk)
  51.  
  52. # Calculate hash of downloaded data
  53. downloaded_hash = calculate_data_hash(chunks)
  54.  
  55. # Check if file exists and compare hashes
  56. if os.path.exists(filepath):
  57. existing_hash = calculate_file_hash(filepath)
  58. if existing_hash == downloaded_hash:
  59. with open(filepath, 'wb') as f:
  60. for chunk in chunks:
  61. f.write(chunk)
  62. log_queue.put(f"[{threading.current_thread().name}] Overwritten (identical hash): {filepath}")
  63. progress_queue.put(1 / total_files)
  64. return
  65. else:
  66. base, ext = os.path.splitext(filename)
  67. counter = 1
  68. while os.path.exists(filepath):
  69. new_filename = f"{base}_{counter}{ext}"
  70. filepath = os.path.join(folder, new_filename)
  71. if os.path.exists(filepath):
  72. existing_hash = calculate_file_hash(filepath)
  73. if existing_hash == downloaded_hash:
  74. with open(filepath, 'wb') as f:
  75. for chunk in chunks:
  76. f.write(chunk)
  77. log_queue.put(f"[{threading.current_thread().name}] Overwritten (identical hash): {filepath}")
  78. progress_queue.put(1 / total_files)
  79. return
  80. counter += 1
  81. filename = new_filename
  82.  
  83. # Write new file
  84. with open(filepath, 'wb') as f:
  85. for chunk in chunks:
  86. f.write(chunk)
  87. log_queue.put(f"[{threading.current_thread().name}] Downloaded: {filepath}")
  88. progress_queue.put(1 / total_files)
  89. else:
  90. log_queue.put(f"[{threading.current_thread().name}] Failed to download: {url} (Status: {response.status_code})")
  91. progress_queue.put(1 / total_files)
  92. except Exception as e:
  93. log_queue.put(f"[{threading.current_thread().name}] Error downloading {url}: {str(e)}")
  94. progress_queue.put(1 / total_files)
  95.  
  96. def get_media_files(url, folder, log_queue, progress_queue):
  97. """Main function to scrape and download media files."""
  98. try:
  99. # Send request to the webpage
  100. headers = {
  101. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  102. }
  103. response = requests.get(url, headers=headers)
  104. response.raise_for_status()
  105.  
  106. # Parse the page
  107. soup = BeautifulSoup(response.text, 'html.parser')
  108.  
  109. # Find all media links
  110. media_extensions = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.flv', '.wmv'}
  111. media_urls = set()
  112.  
  113. # Check all <a> tags
  114. for link in soup.find_all('a', href=True):
  115. href = link['href']
  116. absolute_url = urljoin(url, href)
  117. if any(ext in href.lower() for ext in media_extensions):
  118. media_urls.add((absolute_url, os.path.basename(urllib.parse.urlparse(absolute_url).path)))
  119.  
  120. # Check all <source> tags (for video elements)
  121. for source in soup.find_all('source'):
  122. if 'src' in source.attrs:
  123. src = urljoin(url, source['src'])
  124. if any(ext in src.lower() for ext in media_extensions):
  125. media_urls.add((src, os.path.basename(urllib.parse.urlparse(src).path)))
  126.  
  127. if not media_urls:
  128. log_queue.put("No media files found on the page.")
  129. return
  130.  
  131. # Create output directory with sanitized name
  132. title = soup.find('title')
  133. if title and title.text.strip():
  134. folder_name = sanitize_filename(title.text.strip()[:50]) # Limit length
  135. else:
  136. parsed_url = urllib.parse.urlparse(url)
  137. folder_name = sanitize_filename(parsed_url.path.split('/')[-1] or 'media_download')
  138.  
  139. output_path = os.path.join(folder, folder_name)
  140. os.makedirs(output_path, exist_ok=True)
  141.  
  142. # Download files using ThreadPoolExecutor
  143. cpu_count = os.cpu_count() or 4 # Fallback to 4 if cpu_count is None
  144. max_workers = min(len(media_urls), cpu_count * 2) # Twice the CPU cores or number of files
  145. log_queue.put(f"Starting download of {len(media_urls)} files with {max_workers} threads...")
  146. with ThreadPoolExecutor(max_workers=max_workers) as executor:
  147. futures = [
  148. executor.submit(download_file, url, output_path, filename, log_queue, progress_queue, len(media_urls))
  149. for url, filename in media_urls
  150. ]
  151. # Wait for all downloads to complete
  152. for future in futures:
  153. future.result()
  154.  
  155. log_queue.put(f"Download complete! Files saved in: {output_path}")
  156.  
  157. except requests.RequestException as e:
  158. log_queue.put(f"Error accessing the webpage: {str(e)}")
  159. except Exception as e:
  160. log_queue.put(f"An error occurred: {str(e)}")
  161.  
  162. class MediaDownloaderApp:
  163. def __init__(self, root):
  164. self.root = root
  165. self.root.title("Media Downloader")
  166. self.root.geometry("600x400")
  167.  
  168. self.log_queue = Queue()
  169. self.progress_queue = Queue()
  170. self.download_thread = None
  171. self.running = False
  172.  
  173. # URL input
  174. tk.Label(root, text="URL:").grid(row=0, column=0, padx=5, pady=5, sticky="e")
  175. self.url_entry = tk.Entry(root, width=50)
  176. self.url_entry.grid(row=0, column=1, padx=5, pady=5, columnspan=2)
  177.  
  178. # Folder selection
  179. tk.Label(root, text="Save to:").grid(row=1, column=0, padx=5, pady=5, sticky="e")
  180. self.folder_entry = tk.Entry(root, width=40)
  181. self.folder_entry.grid(row=1, column=1, padx=5, pady=5)
  182. self.folder_entry.insert(0, os.getcwd())
  183. tk.Button(root, text="Browse", command=self.browse_folder).grid(row=1, column=2, padx=5, pady=5)
  184.  
  185. # Progress bar
  186. self.progress = ttk.Progressbar(root, length=400, mode='determinate')
  187. self.progress.grid(row=2, column=0, columnspan=3, padx=5, pady=5)
  188.  
  189. # Log area
  190. self.log_text = scrolledtext.ScrolledText(root, height=15, width=60, state='disabled')
  191. self.log_text.grid(row=3, column=0, columnspan=3, padx=5, pady=5)
  192.  
  193. # Buttons
  194. tk.Button(root, text="Start Download", command=self.start_download).grid(row=4, column=0, padx=5, pady=5)
  195. tk.Button(root, text="Clear Log", command=self.clear_log).grid(row=4, column=1, padx=5, pady=5)
  196. tk.Button(root, text="Quit", command=self.quit).grid(row=4, column=2, padx=5, pady=5)
  197.  
  198. # Start checking for log and progress updates
  199. self.update_gui()
  200.  
  201. def browse_folder(self):
  202. folder = filedialog.askdirectory()
  203. if folder:
  204. self.folder_entry.delete(0, tk.END)
  205. self.folder_entry.insert(0, folder)
  206.  
  207. def start_download(self):
  208. if self.running:
  209. self.log("Download already in progress.")
  210. return
  211.  
  212. url = self.url_entry.get().strip()
  213. if not url:
  214. self.log("Please enter a URL.")
  215. return
  216. if not url.startswith(('http://', 'https://')):
  217. url = 'https://' + url
  218. folder = self.folder_entry.get().strip()
  219. if not folder:
  220. self.log("Please select a save directory.")
  221. return
  222.  
  223. self.running = True
  224. self.progress['value'] = 0
  225. self.download_thread = threading.Thread(target=get_media_files, args=(url, folder, self.log_queue, self.progress_queue))
  226. self.download_thread.start()
  227. self.log(f"Processing {url}...")
  228.  
  229. def log(self, message):
  230. self.log_queue.put(message)
  231.  
  232. def update_gui(self):
  233. """Update the GUI with log messages and progress."""
  234. while not self.log_queue.empty():
  235. message = self.log_queue.get()
  236. self.log_text.config(state='normal')
  237. self.log_text.insert(tk.END, message + "\n")
  238. self.log_text.see(tk.END)
  239. self.log_text.config(state='disabled')
  240.  
  241. while not self.progress_queue.empty():
  242. increment = self.progress_queue.get()
  243. self.progress['value'] += increment * 100 # Convert fraction to percentage
  244. if self.progress['value'] >= 99.9:
  245. self.running = False
  246.  
  247. self.root.after(100, self.update_gui)
  248.  
  249. def clear_log(self):
  250. self.log_text.config(state='normal')
  251. self.log_text.delete(1.0, tk.END)
  252. self.log_text.config(state='disabled')
  253.  
  254. def quit(self):
  255. self.running = False
  256. if self.download_thread and self.download_thread.is_alive():
  257. self.log("Waiting for downloads to complete...")
  258. self.download_thread.join()
  259. self.root.destroy()
  260.  
  261. def main():
  262. """Launch the Tkinter GUI."""
  263. root = tk.Tk()
  264. app = MediaDownloaderApp(root)
  265. root.mainloop()
  266.  
# Launch the GUI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment