Guest User

Python adblocker skipper

a guest
Oct 26th, 2024
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.50 KB | None | 0 0
  1. from openai import OpenAI
  2. import os
  3. import subprocess
  4. import argparse
  5. import chardet
  6. import re
  7.  
  8. def detect_encoding(file_path):
  9.     with open(file_path, 'rb') as file:
  10.         raw_data = file.read()
  11.     result = chardet.detect(raw_data)
  12.     return result['encoding']
  13.  
  14. def read_srt(file_path):
  15.     encoding = detect_encoding(file_path)
  16.     with open(file_path, 'r', encoding=encoding) as file:
  17.         return file.readlines()
  18.  
  19. def clean_srt_lines(srt_lines):
  20.     cleaned_lines = []
  21.     for line in srt_lines:
  22.         if line.strip() and not line.strip().isdigit():
  23.             cleaned_lines.append(line)
  24.     return cleaned_lines
  25.  
  26. def extract_relevant_parts(srt_lines, keywords=None, lines_before=4, lines_after=100):
  27.     if keywords is None:
  28.         keywords = ["this episode","sponsor", "promotion", "brought to you",  "presented by",
  29.                     "message from", "partnered with", "powered by", "supported by"]
  30.    
  31.     keywords = [keyword.lower() for keyword in keywords]
  32.     relevant_lines = []
  33.    
  34.     for i, line in enumerate(srt_lines):
  35.         if any(keyword in line.lower() for keyword in keywords):
  36.             start = max(i - lines_before, 0)
  37.             end = min(i + lines_after + 1, len(srt_lines))
  38.             relevant_lines.extend(srt_lines[start:end])
  39.     print("".join(relevant_lines))
  40.     return "".join(relevant_lines)
  41.  
  42. def get_sponsored_segments(relevant_srt_content):
  43.     client = OpenAI(
  44.         api_key = 'sk-proj-XYZ'
  45.     )
  46.     response = client.chat.completions.create(
  47.         model="gpt-4o-mini",
  48.         temperature=0.3,
  49.         seed = 1000,
  50.         messages=[
  51.             {"role": "system", "content": "Identify the sponsored segments of the podcast transcription and only output the to-and-from time stamps of the segments. Provide the entirety of segments in one line, like '00:05:18,520 --> 00:07:24,760'. If there are less than 00:00:30 between the start and stop of two segments, combine them into one. One segment per line. ONLY output the timestamps, NOTHING else. If the transcription seems to end before the end of the segment, just provide the last timestamp, as the end of the segment."},
  52.             {"role": "user", "content": relevant_srt_content}
  53.         ]
  54.     )
  55.     # Filter to only include lines with timestamps
  56.     return "\n".join(line for line in response.choices[0].message.content.split("\n") if " --> " in line)
  57.  
  58. def convert_timestamp(timestamp):
  59.     return timestamp.replace(',', '.')
  60.  
  61. def validate_timestamp_format(timestamp):
  62.     return bool(re.match(r'^\d{2}:\d{2}:\d{2},\d{3}$', timestamp))
  63.  
  64. def validate_and_convert_timestamp(timestamp):
  65.     if validate_timestamp_format(timestamp):
  66.         return convert_timestamp(timestamp)
  67.     else:
  68.         raise ValueError(f"Invalid timestamp format: {timestamp}")
  69.  
  70. def run_ffmpeg_command(command):
  71.     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8')
  72.     if result.returncode != 0:
  73.         print(f"Error running command: {' '.join(command)}")
  74.         print(result.stderr)
  75.     return result
  76.  
  77. def validate_and_split_timestamps(line):
  78.     try:
  79.         start, end = line.split(" --> ")
  80.         start = validate_and_convert_timestamp(start)
  81.         end = validate_and_convert_timestamp(end)
  82.         if start >= end:
  83.             raise ValueError(f"Invalid timestamp logic: start {start} is not before end {end}")
  84.         return start, end
  85.     except ValueError as e:
  86.         print(f"Skipping invalid timestamp line: {line}")
  87.         return None, None
  88.  
  89. def remove_segments(audio_file, segments):
  90.     base_name = os.path.splitext(audio_file)[0]
  91.     output_file = f"{base_name}_edited.mp3"
  92.    
  93.     times = segments.strip().split("\n")
  94.     temp_files = []
  95.     current_start = "00:00:00.000"
  96.     total_saved_time = 0
  97.  
  98.     for i, time in enumerate(times):
  99.         start, end = validate_and_split_timestamps(time)
  100.         if start is None or end is None:
  101.             continue
  102.        
  103.         part = f"{base_name}_part_{i}.mp3"
  104.        
  105.         # Extract the non-sponsored segment before the current sponsored segment
  106.         if current_start < start:
  107.             run_ffmpeg_command(["ffmpeg", "-i", audio_file, "-ss", current_start, "-to", start, "-c", "copy", part])
  108.             temp_files.append(part)
  109.        
  110.         # Calculate the saved time
  111.         start_sec = sum(float(x) * 60 ** i for i, x in enumerate(reversed(start.split(":"))))
  112.         end_sec = sum(float(x) * 60 ** i for i, x in enumerate(reversed(end.split(":"))))
  113.         total_saved_time += end_sec - start_sec
  114.  
  115.         # Update the start time for the next iteration to the end of the current sponsored segment
  116.         current_start = end
  117.  
  118.     # Extract the last non-sponsored segment after the last sponsored segment
  119.     part = f"{base_name}_part_end.mp3"
  120.     run_ffmpeg_command(["ffmpeg", "-i", audio_file, "-ss", current_start, "-c", "copy", part])
  121.     temp_files.append(part)
  122.    
  123.     # Concatenate all parts together
  124.     with open("concat_list.txt", "w", encoding='utf-8') as f:
  125.         for temp_file in temp_files:
  126.             f.write(f"file '{temp_file}'\n")
  127.    
  128.     run_ffmpeg_command(["ffmpeg", "-f", "concat", "-safe", "0", "-i", "concat_list.txt", "-c", "copy", output_file])
  129.    
  130.     # Clean up temporary files
  131.  #   for temp_file in temp_files:
  132.  #       os.remove(temp_file)
  133.  #   os.remove("concat_list.txt")
  134.    
  135.     return output_file, total_saved_time
  136.  
  137. def main(audio_file, srt_file=None):
  138.     # Remove leading/trailing whitespaces and get absolute path
  139.     audio_file = os.path.abspath(audio_file.strip())
  140.    
  141.     # Check if the first argument is an .mp3 file
  142.     if not audio_file.lower().endswith('.mp3'):
  143.         print("Error: The first argument must be an .mp3 file.")
  144.         return
  145.    
  146.     # If the second argument is not provided, look for the corresponding .srt file
  147.     if srt_file is None:
  148.         base_name = os.path.splitext(audio_file)[0]
  149.         srt_file = f"{base_name}.srt"
  150.    
  151.     srt_file = os.path.abspath(srt_file.strip())
  152.  
  153.     # Debugging: Check if the files exist
  154.     if not os.path.exists(audio_file):
  155.         print(f"File not found: {audio_file}")
  156.         return
  157.     if not os.path.exists(srt_file):
  158.         print(f"File not found: {srt_file}")
  159.         return
  160.    
  161.     srt_content = read_srt(srt_file)
  162.     cleaned_srt_content = clean_srt_lines(srt_content)
  163.     relevant_srt_content = extract_relevant_parts(cleaned_srt_content)
  164.     sponsored_time_stamps = get_sponsored_segments(relevant_srt_content)
  165.    
  166.     # Print the segment timestamps
  167.     print("Sponsored Segments Timestamps:")
  168.     print(sponsored_time_stamps.strip())
  169.    
  170.     # Print the number of segments
  171.     num_segments = sponsored_time_stamps.strip().count("\n") + 1 if sponsored_time_stamps.strip() else 0
  172.     print(f"Number of Segments: {num_segments}")
  173.  
  174.     edited_file, total_saved_time = remove_segments(audio_file, sponsored_time_stamps)
  175.    
  176.     # Print the amount of time saved
  177.     minutes, seconds = divmod(int(total_saved_time), 60)
  178.     print(f"Total Time Saved: {minutes} minutes and {seconds} seconds")
  179.    
  180.     print(f"Edited file created: {edited_file}")
  181.  
  182. if __name__ == "__main__":
  183.     parser = argparse.ArgumentParser(description="Remove sponsored segments from a podcast")
  184.     parser.add_argument("audio_file", type=str, help="Path to the podcast audio file")
  185.     parser.add_argument("srt_file", type=str, nargs='?', help="Path to the SRT transcription file")
  186.    
  187.     args = parser.parse_args()
  188.    
  189.     main(args.audio_file, args.srt_file)
  190.  
Advertisement
Add Comment
Please, Sign In to add comment