import argparse
import csv
import json
import multiprocessing as mp
import os
import random
import re
import subprocess
import time
from urllib.parse import urlparse, parse_qs
import uuid

import requests
from tqdm import tqdm
import yt_dlp


"""
Example usage: python download.py --csv_file "data.csv" --output_dir "downloads" --workers 4
"""

def setup_argparse():
    parser = argparse.ArgumentParser(description="Download audio from YouTube, Bilibili, or direct MP3 links.")
    parser.add_argument("--csv_file", required=True, help="Path to the CSV file containing download information.")
    parser.add_argument("--output_dir", required=True, help="Directory to save downloaded files.")
    parser.add_argument("--workers", type=int, default=4, help="Number of worker processes (default: 4)")
    return parser.parse_args()

def read_csv(csv_file):
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        return list(reader)

def sanitize_filename(filename):
    # Strip characters that are illegal in Windows filenames
    return re.sub(r'[\\/*?:"<>|]', "", filename)

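# BV -> AV id conversion using the widely documented base-58 alphabet and
# fixed xor/add constants of Bilibili's original id scheme; `s` lists the
# positions of the six payload characters inside the 12-character BV id.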
def bv2av(x):
    table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    tr = {table[i]: i for i in range(58)}
    s = [11, 10, 3, 8, 4, 6]
    xor = 177451812
    add = 8728348608
    r = 0
    for i in range(6):
        r += tr[x[s[i]]] * (58 ** i)
    return (r - add) ^ xor

def bv_url_to_av_url(url):
    bv = re.search(r'/BV([0-9A-Za-z]+)', url).group(0)
    av = bv2av(bv[1:])
    new_url = url.replace(bv, f'/av{av}')
    return new_url

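# Note: these BV helpers are not called by the pipeline below; they are kept
# for working with BV-style Bilibili URLs. For example, under this scheme
# bv2av('BV17x411w7KC') == 170001, so bv_url_to_av_url() maps
# .../video/BV17x411w7KC to .../video/av170001.
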
# Regular expressions to extract play info
REGEX_PLAY_INFO = r'<script>window\.__playinfo__=(.*?)</script>'
REGEX_INITIAL_STATE = r'__INITIAL_STATE__=(.*?);\(function\(\)'

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/90.0.4430.212 Safari/537.36"
)
BILIBILI_URL = "https://www.bilibili.com"

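# __playinfo__ carries the DASH stream lists that parse_video()/parse_audio()
# select from; __INITIAL_STATE__ carries video metadata such as bvid and title.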
def get_play_url_web_page(url):
    headers = {
        'User-Agent': USER_AGENT,
        'Referer': BILIBILI_URL,
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        # 'br' is deliberately omitted: requests cannot decode Brotli bodies
        # unless the optional brotli package is installed
        'accept-encoding': 'gzip, deflate',
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    html = response.text

    match_initial_state = re.search(REGEX_INITIAL_STATE, html)
    match_play_info = re.search(REGEX_PLAY_INFO, html)

    if not match_initial_state or not match_play_info:
        return None, None

    initial_state = match_initial_state.group(1)
    play_info = match_play_info.group(1)

    # Parse the embedded JSON payloads
    initial_state_json = json.loads(initial_state)
    play_info_json = json.loads(play_info)

    return initial_state_json, play_info_json

def parse_video(play_info):
    video_list = play_info['data']['dash']['video']
    # Find the best quality video
    best_video = max(video_list, key=lambda x: x['height'] * x['width'])
    return best_video

def parse_audio(play_info):
    audio_list = play_info['data']['dash']['audio']
    # Choose the highest bitrate audio stream
    best_audio = max(audio_list, key=lambda x: x['bandwidth'])
    return best_audio

def download_file(url, headers, filename):
    # Resume an interrupted download: request only the remaining bytes so the
    # append below never duplicates data already on disk.
    pos = os.path.getsize(filename) if os.path.exists(filename) else 0
    if pos:
        headers = {**headers, 'Range': f'bytes={pos}-'}
    response = requests.get(url, headers=headers, stream=True)
    response.raise_for_status()
    total = int(response.headers.get('content-length', 0)) + pos
    with open(filename, 'ab') as file, tqdm(
        desc=os.path.basename(filename),
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
        initial=pos,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)

def download_bilibili(bilibili_url, output_dir):
    initial_state, play_info = get_play_url_web_page(bilibili_url)
    if not initial_state or not play_info:
        raise ValueError(f"Failed to retrieve video information for {bilibili_url}")

    video_data = initial_state['videoData']
    bvid = video_data['bvid']
    title = video_data['title']
    unique_name = sanitize_filename(f"[{bvid}]{title}")

    best_audio = parse_audio(play_info)
    audio_url = best_audio['baseUrl']

    # Bilibili's CDN requires a matching Referer header
    headers = {
        'User-Agent': USER_AGENT,
        'Referer': BILIBILI_URL,
    }

    os.makedirs(output_dir, exist_ok=True)

    # Download audio
    audio_filename = os.path.join(output_dir, f"{unique_name}_audio.m4a")
    print(f"Downloading audio to {audio_filename}")
    download_file(audio_url, headers, audio_filename)
    return audio_filename

def get_unique_filename(directory, extension):
    # Collision-proof temp name for intermediate downloads
    return os.path.join(directory, f"temp_{uuid.uuid4().hex}.{extension}")

def download_youtube(url, output_path):
    # Extract the 11-character video ID from youtube.com or youtu.be URLs
    parsed_url = urlparse(url)
    if parsed_url.hostname and 'youtu.be' in parsed_url.hostname:
        video_id = parsed_url.path.lstrip('/')[:11]
    else:
        video_id = parse_qs(parsed_url.query).get('v', [''])[0][:11]
    url_constructed = f"https://www.youtube.com/watch?v={video_id}"

    # Let yt-dlp pick the extension for the raw download; the postprocessor
    # then writes <base>.flac, which is exactly output_path.
    base, _ = os.path.splitext(output_path)
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'flac',
        }],
        'outtmpl': base + '.%(ext)s',
        'socket_timeout': 300,  # 5 minutes timeout
        'noplaylist': True,     # Ensure only the video is downloaded, not a playlist
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url_constructed])

    return output_path

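# Note: the FFmpegExtractAudio postprocessor needs an ffmpeg binary on PATH,
# the same requirement convert_to_flac() has below.
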
def download_mp3(url, output_dir):
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Save the payload to a unique temp file inside output_dir
        mp3_path = get_unique_filename(output_dir, "mp3")
        with open(mp3_path, 'wb') as f:
            f.write(response.content)
        return mp3_path
    except Exception as e:
        raise Exception(f"MP3 download failed: {e}")

def convert_to_flac(input_path, output_path):
    try:
        subprocess.run(['ffmpeg', '-y', '-i', input_path, '-c:a', 'flac', output_path], check=True)
    except subprocess.CalledProcessError as e:
        raise Exception(f"FLAC conversion failed: {e}")

def process_row(row, output_dir):
    url = row['Url']
    title = sanitize_filename(row['Title'])
    bonafide_or_deepfake = row['Bonafide Or Deepfake'].lower()
    output_filename = f"{bonafide_or_deepfake}_{title}.flac"
    output_path = os.path.join(output_dir, output_filename)

    # Skip rows whose output file has already been downloaded
    if os.path.exists(output_path):
        return True, None

    temp_file = None
    try:
        if 'youtube.com' in url or 'youtu.be' in url:
            download_youtube(url, output_path)
        elif 'bilibili.com' in url:
            temp_file = download_bilibili(url, output_dir)
            convert_to_flac(temp_file, output_path)
        elif urlparse(url).path.endswith('.mp3'):
            temp_file = download_mp3(url, output_dir)
            convert_to_flac(temp_file, output_path)
        else:
            raise ValueError(f"Unsupported URL: {url}")

        return True, None
    except Exception as e:
        return False, str(e)
    finally:
        # Always clean up intermediate audio files
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)

def worker(queue, output_dir, log_queue, progress_queue):
    # Pull (index, row) tasks until the None sentinel arrives
    while True:
        item = queue.get()
        if item is None:
            break
        index, row = item
        success, error = process_row(row, output_dir)
        log_queue.put((index, row['Url'], success, error))
        progress_queue.put(1)
        # Random jitter between downloads to avoid hammering the servers
        time.sleep(random.uniform(1, 2.5))

def logger(log_queue, log_file):
    # Single writer process, so log lines from workers never interleave
    with open(log_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['index', 'url', 'status', 'error'])
        while True:
            item = log_queue.get()
            if item is None:
                break
            index, url, success, error = item
            # csv.writer quotes URLs and error messages that contain commas
            writer.writerow([index, url, 'Success' if success else 'Failure', error or ''])
            f.flush()

def progress_tracker(progress_queue, total_rows):
    # Owns the single progress bar; workers report increments via the queue
    pbar = tqdm(total=total_rows, desc="Downloading")
    completed = 0
    while completed < total_rows:
        increment = progress_queue.get()
        if increment is None:
            break
        completed += increment
        pbar.update(increment)
    pbar.close()

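# Process layout: N download workers feed a logger process and a progress
# tracker through dedicated queues; a None sentinel shuts each stage down.
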
def main():
    args = setup_argparse()
    os.makedirs(args.output_dir, exist_ok=True)

    rows = read_csv(args.csv_file)
    total_rows = len(rows)

    task_queue = mp.Queue()
    log_queue = mp.Queue()
    progress_queue = mp.Queue()

    for i, row in enumerate(rows):
        task_queue.put((i, row))

    log_file = os.path.join(args.output_dir, 'download_log.csv')
    log_process = mp.Process(target=logger, args=(log_queue, log_file))
    log_process.start()

    progress_process = mp.Process(target=progress_tracker, args=(progress_queue, total_rows))
    progress_process.start()

    workers = []
    for _ in range(args.workers):
        p = mp.Process(target=worker, args=(task_queue, args.output_dir, log_queue, progress_queue))
        workers.append(p)
        p.start()

    # One sentinel per worker so every worker loop terminates
    for _ in range(args.workers):
        task_queue.put(None)

    for w in workers:
        w.join()

    log_queue.put(None)
    log_process.join()

    progress_queue.put(None)
    progress_process.join()

if __name__ == "__main__":
    mp.freeze_support()
    main()