import argparse
import csv
import hashlib
import multiprocessing as mp
import os
import random
import re
import subprocess
import time
import urllib.request
from urllib.parse import urlparse
import uuid

import requests
from tqdm import tqdm
import yt_dlp


"""
Download audio from YouTube, Bilibili, or direct MP3 links listed in a CSV file
and convert every item to FLAC.

Example usage: python download.py --csv_file "data.csv" --output_dir "downloads" --workers 4
"""

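# A minimal sketch of the expected CSV layout. The column names are the ones
# consumed in process_row() below; the rows and URLs are illustrative only:
#
#   Url,Title,Bonafide Or Deepfake
#   https://www.youtube.com/watch?v=XXXXXXXXXXX,Sample Clip,Bonafide
#   https://example.com/audio/sample.mp3,Another Clip,Deepfake
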
def setup_argparse():
    parser = argparse.ArgumentParser(description="Download audio from YouTube, Bilibili, or direct MP3 links.")
    parser.add_argument("--csv_file", required=True, help="Path to the CSV file containing download information.")
    parser.add_argument("--output_dir", required=True, help="Directory to save downloaded files.")
    parser.add_argument("--workers", type=int, default=4, help="Number of worker processes (default: 4)")
    return parser.parse_args()

def read_csv(csv_file):
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        return list(reader)

def sanitize_filename(filename):
    # Strip characters that are not allowed in filenames on Windows/most filesystems.
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def bv2av(x):
    # Legacy base-58 BV -> AV id conversion (the widely shared pre-2021 algorithm).
    table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    tr = {table[i]: i for i in range(58)}
    s = [11, 10, 3, 8, 4, 6]
    xor = 177451812
    add = 8728348608
    r = 0
    for i in range(6):
        r += tr[x[s[i]]] * (58 ** i)
    return (r - add) ^ xor

def bv_url_to_av_url(url):
    # Rewrite .../video/BVxxxx URLs to the equivalent .../video/av<id> form.
    def replacer(match):
        bv = match.group(1)
        av_code = bv2av(bv)
        return match.group(0).replace(bv, 'av' + str(av_code))
    return re.sub(r'/video/(BV[a-zA-Z0-9]+)', replacer, url)

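# Sanity check for the conversion above, using a commonly cited BV/AV pair
# (note that this legacy mapping may not hold for BV ids of newer uploads):
#   bv2av('BV17x411w7KC')  -> 170001
#   bv_url_to_av_url('https://www.bilibili.com/video/BV17x411w7KC')
#       -> 'https://www.bilibili.com/video/av170001'
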
def get_play_list(start_url, cid, quality):
    # Recover the app key/secret pair from the obfuscated string, then sign the
    # playurl request the way the old signed API expects (md5 of params + secret).
    entropy = 'rbMCKn@KuamXWlPMoJGsKcbiJKUfkPF_8dABscJntvqhRSETg'
    appkey, sec = ''.join([chr(ord(i) + 2) for i in entropy[::-1]]).split(':')
    params = f'appkey={appkey}&cid={cid}&otype=json&qn={quality}&quality={quality}&type='
    chksum = hashlib.md5(bytes(params + sec, 'utf8')).hexdigest()
    url_api = f'https://interface.bilibili.com/v2/playurl?{params}&sign={chksum}'
    headers = {
        'Referer': start_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    response = requests.get(url_api, headers=headers, timeout=30)
    video_list = []
    if response.status_code == 200:
        html = response.json()
        video_list = [i['url'] for i in html.get('durl', [])]
    return video_list

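# Caveat: interface.bilibili.com/v2/playurl is an older app-key-signed endpoint
# and may be refused or rate-limited for some videos. In that case get_play_list()
# returns an empty list and download_bilibili() raises "No video URL found".
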
def download_bilibili(url, output_path):
    try:
        # Convert the BV URL to its av form, then resolve the page's cid via the
        # web-interface API before requesting a playable stream URL.
        start_url = bv_url_to_av_url(url)
        p_id = re.search(r'\?p=(\d+)', start_url).group(1) if '?p=' in start_url else None
        aid = re.search(r'/av(\d+)/*', start_url).group(1)
        api_url = f'https://api.bilibili.com/x/web-interface/view?aid={aid}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
        }
        response = requests.get(api_url, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()['data']
            page_info = data['pages'][int(p_id) - 1] if p_id else data['pages'][0]
            cid = str(page_info['cid'])
            title = page_info['part'] or data["title"].replace(" ", "_")
            title = sanitize_filename(title)  # currently unused: the temp file gets a uuid name
            video_list = get_play_list(start_url, cid, quality=80)
            if video_list:
                opener = urllib.request.build_opener()
                opener.addheaders = [
                    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0'),
                    ('Accept', '*/*'),
                    ('Accept-Language', 'en-US,en;q=0.5'),
                    ('Accept-Encoding', 'gzip, deflate, br'),
                    ('Range', 'bytes=0-'),
                    ('Referer', start_url),
                    ('Origin', 'https://www.bilibili.com'),
                    ('Connection', 'keep-alive'),
                ]
                urllib.request.install_opener(opener)
                temp_file = get_unique_filename(output_path, "flv")
                urllib.request.urlretrieve(url=video_list[0], filename=temp_file)
                return temp_file
            else:
                raise Exception("No video URL found")
        else:
            raise Exception(f"Failed to fetch video info: HTTP {response.status_code}")
    except Exception as e:
        raise Exception(f"Bilibili download failed: {str(e)}")

def get_unique_filename(directory, extension):
    return os.path.join(directory, f"temp_{uuid.uuid4().hex}.{extension}")

def download_youtube(url, output_path):
    # Let yt-dlp pick the real extension for the intermediate download; the
    # FFmpegExtractAudio postprocessor then writes <base>.mp3 and removes the
    # intermediate file, so <base>.mp3 is the path to return.
    temp_base = os.path.join(output_path, f"temp_{uuid.uuid4().hex}")
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': temp_base + '.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return temp_base + '.mp3'

def download_mp3(url, output_path):
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        mp3_path = get_unique_filename(output_path, "mp3")
        with open(mp3_path, 'wb') as f:
            f.write(response.content)
        return mp3_path
    except Exception as e:
        raise Exception(f"MP3 download failed: {str(e)}")

def convert_to_flac(input_path, output_path):
    # ffmpeg probes the input container (mp3/flv/...) itself; -y overwrites any
    # stale output instead of blocking on a confirmation prompt.
    try:
        subprocess.run(['ffmpeg', '-y', '-i', input_path, '-c:a', 'flac', output_path], check=True)
    except subprocess.CalledProcessError as e:
        raise Exception(f"FLAC conversion failed: {str(e)}")

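# Optional helper (not part of the original script): fail fast if ffmpeg is
# missing, since both yt-dlp's FFmpegExtractAudio postprocessor and
# convert_to_flac() shell out to it. Call it at the start of main() if desired.
def check_ffmpeg_available():
    import shutil
    if shutil.which('ffmpeg') is None:
        raise RuntimeError("ffmpeg not found on PATH; install it before running this script.")
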
def process_row(row, output_dir):
    url = row['Url']
    title = sanitize_filename(row['Title'])
    bonafide_or_deepfake = row['Bonafide Or Deepfake'].lower()
    output_filename = f"{bonafide_or_deepfake}_{title}.flac"
    output_path = os.path.join(output_dir, output_filename)

    temp_file = None
    try:
        if 'youtube.com' in url or 'youtu.be' in url:
            temp_file = download_youtube(url, output_dir)
        elif 'bilibili.com' in url:
            temp_file = download_bilibili(url, output_dir)
        elif urlparse(url).path.endswith('.mp3'):
            temp_file = download_mp3(url, output_dir)
        else:
            raise ValueError(f"Unsupported URL: {url}")

        convert_to_flac(temp_file, output_path)
        return True, None
    except Exception as e:
        return False, str(e)
    finally:
        # Each downloader returns the actual path of its temporary file, so a
        # single cleanup is enough here.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)

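# Example: a row with Title "Sample Clip" and Bonafide Or Deepfake "Bonafide"
# ends up as <output_dir>/bonafide_Sample Clip.flac (the label is lowercased;
# the title only has forbidden filename characters stripped).
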
def worker(queue, output_dir, log_queue, progress_queue):
    while True:
        item = queue.get()
        if item is None:  # sentinel: no more work
            break
        index, row = item
        success, error = process_row(row, output_dir)
        log_queue.put((index, row['Url'], success, error))
        progress_queue.put(1)
        # Small random delay to avoid hammering the remote hosts.
        time.sleep(random.uniform(0.5, 1.5))

def logger(log_queue, log_file):
    # Use csv.writer so URLs or error messages containing commas stay in one column.
    with open(log_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        while True:
            item = log_queue.get()
            if item is None:
                break
            index, url, success, error = item
            writer.writerow([index, url, 'Success' if success else 'Failure', error or ''])
            f.flush()

def progress_tracker(progress_queue, total_rows):
    pbar = tqdm(total=total_rows, desc="Downloading")
    completed = 0
    while completed < total_rows:
        increment = progress_queue.get()
        if increment is None:  # sentinel from main() after all workers have exited
            break
        completed += increment
        pbar.update(increment)
    pbar.close()

def main():
    args = setup_argparse()
    os.makedirs(args.output_dir, exist_ok=True)

    rows = read_csv(args.csv_file)
    total_rows = len(rows)

    task_queue = mp.Queue()
    log_queue = mp.Queue()
    progress_queue = mp.Queue()

    for i, row in enumerate(rows):
        task_queue.put((i, row))

    log_file = os.path.join(args.output_dir, 'download_log.csv')
    log_process = mp.Process(target=logger, args=(log_queue, log_file))
    log_process.start()

    progress_process = mp.Process(target=progress_tracker, args=(progress_queue, total_rows))
    progress_process.start()

    workers = []
    for _ in range(args.workers):
        p = mp.Process(target=worker, args=(task_queue, args.output_dir, log_queue, progress_queue))
        workers.append(p)
        p.start()

    # One sentinel per worker so each loop can exit cleanly.
    for _ in range(args.workers):
        task_queue.put(None)

    for w in workers:
        w.join()

    # Shut down the logger and progress tracker once all work is done.
    log_queue.put(None)
    log_process.join()

    progress_queue.put(None)
    progress_process.join()

if __name__ == "__main__":
    mp.freeze_support()
    main()