Advertisement
user_137

Untitled

Sep 1st, 2018
170
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.03 KB | None | 0 0
  1. """Creates readable text file from SRT file.
  2.  
  3. NOTES
  4. * Run from command line as
  5. ** python srt_to_txt.py file.srt cp1252
  6. * Creates file.txt with extracted text from file.srt
  7. * Script assumes that lines beginning with lowercase letters or commas
  8. * are part of the previous line and lines beginning with any other character
  9. * are new lines. This won't always be correct.
  10.  
  11. Verbosity: 0 is silent. 1 prints a message when writing starts and when all files are done. 2 also prints a message after every file.
  12. """
  13.  
  14. # TODO:
  15. # test that it still works via argv
  16. # consider letting users pass a flag in argv to convert the whole directory
  17.  
  18. import sys
  19. import os
  20. import re
  21.  
  22. _FILE = None
  23. _ENCODING = None
  24. _ONE_FILE = False
  25. ##_FILE = 'L1_S1-en.srt'
  26. _OUT_FORMAT = 3
  27. _VERBOSITY = 2
  28.  
  29.  
  30. def is_time_stamp(l):
  31.     if l[:2].isnumeric() and l[2] == ':':
  32.         return True
  33.     return False
  34.  
  35.  
  36. def has_letters(line):
  37.     if re.search('[a-zA-Z]', line):
  38.         return True
  39.     return False
  40.  
  41.  
  42. def has_no_text(line):
  43.     l = line.strip()
  44.     if not len(l):
  45.         return True
  46.     if l.isnumeric():
  47.         return True
  48.     if is_time_stamp(l):
  49.         return True
  50.     if l[0] == '(' and l[-1] == ')':
  51.         return True
  52.     if not has_letters(line):
  53.         return True
  54.     return False
  55.  
  56.  
  57. def is_lowercase_letter_or_comma(letter):
  58.     if letter.isalpha() and letter.lower() == letter:
  59.         return True
  60.     if letter == ',':
  61.         return True
  62.     return False
  63.  
  64.  
  65. def clean_up(lines):
  66.     """Get rid of all non-text lines and try to combine text broken into multiple lines.
  67.    """
  68.     new_lines = []
  69.     for line in lines[1:]:
  70.         if has_no_text(line):
  71.             continue
  72.         elif len(new_lines) and is_lowercase_letter_or_comma(line[0]):
  73.             # combine with previous line
  74.             new_lines[-1] = new_lines[-1].strip()   ' '   line
  75.         else:
  76.             new_lines.append(line)
  77.     return new_lines
  78.  
  79.  
  80. def main_argv(args):
  81.     """
  82.      args[1]: file name
  83.      args[2]: encoding. Default: utf-8.
  84.        - If you get a lot of [?]s replacing characters,
  85.        - you probably need to change file_encoding to 'cp1252'
  86.    """
  87.     file = args[1]
  88.     file_encoding = 'utf-8' if len(args) < 3 else args[2]
  89.  
  90.     with open(file, encoding=file_encoding, errors='replace') as f:
  91.         lines = f.readlines()
  92.         new_lines = clean_up(lines)
  93.     new_file = file[:-4]   '.txt'
  94.     with open(new_file, 'w') as f:
  95.         for line in new_lines:
  96.             f.write(line)
  97.  
  98.  
  99. def output_all_files(encoding=None, out_format=None):
  100.     """Writes a file *.txt from *.srt for each .srt in the cwd.
  101.    """
  102.     files = scan_cwd_for_srt_files()
  103.     if _VERBOSITY >= 1:
  104.         print('Writing Files:')
  105.     for f in files:
  106.         output_one_file(f, encoding, out_format)
  107.     if _VERBOSITY >= 1:
  108.         print('Done writing all files.')
  109.  
  110.  
  111. def scan_cwd_for_srt_files():
  112.     """Returns a list of the .srt files in the cwd.
  113.    """
  114.     files = [f for f in os.listdir('.') if os.path.isfile(f) and eval("'.srt' in f")]
  115.     return files
  116.  
  117.  
  118. def trunc_after_80(line):
  119.     outlines = []
  120.     while len(line) > 80:
  121.         # is 80 NOT whitespace?
  122.         if line[80].split():
  123.             wordlen = len(line[80:].split()[0])
  124.             outlines.append(line[:80   wordlen]   '\n')
  125.             line = line[80   wordlen   1:]
  126.         else:
  127.             outlines.append(line[:80]   '\n')
  128.             line = line[81:]
  129.     else:
  130.         outlines.append(line)
  131.         return outlines
  132.  
  133.  
  134. def output_one_file(file, encoding=None, out_format=None):
  135.     """Writes a file *.txt from *.srt, using utf-8 encoding by default.
  136.    out_format:
  137.        1 = Simple file with one line per entry in the .srt
  138.        2 = Blank line between every entry
  139.        3 = Like 2, but with strings split after whitespace on col 80
  140.    """
  141.  
  142.     file_encoding = encoding
  143.     new_file = file[:-4]   '.txt'
  144.  
  145.     if out_format is None:
  146.         out_format = _OUT_FORMAT
  147.     if file_encoding is None:
  148.         file_encoding = 'utf-8'
  149.  
  150.     with open(file, encoding=file_encoding, errors='replace') as f:
  151.         lines = f.readlines()
  152.         new_lines = clean_up(lines)
  153.  
  154.     if out_format >= 2:
  155.         new_lines = [line   '\n' for line in new_lines]
  156.  
  157.     if out_format == 3:
  158.         formatted_lines = []
  159.         for line in new_lines:
  160.             for new in trunc_after_80(line):
  161.                 formatted_lines.append(new)
  162.         new_lines = formatted_lines
  163.  
  164.     with open(new_file, 'w') as f:
  165.         for line in new_lines:
  166.             f.write(line)
  167.  
  168.     if _VERBOSITY == 2:
  169.         print('Wrote file:', new_file)
  170.  
  171.  
  172. if __name__ == '__main__':
  173.     if len(sys.argv) > 1:
  174.         main_argv(sys.argv)
  175.     else:
  176.         print('\n\n')
  177.         if _ONE_FILE:
  178.             output_one_file(_FILE, encoding=_ENCODING, out_format=_OUT_FORMAT)
  179.         else:
  180.             output_all_files(encoding=_ENCODING, out_format=_OUT_FORMAT)
  181.         print('\n\n')
  182.  
  183. ##From stackexchange:
  184. ##But be careful while applying this to other directory, like
  185. ##files = [f for f in os.listdir(somedir) if os.path.isfile(f)].
  186. ##which would not work because f is not a full path but relative to the current dir.
  187. ##Therefore, for filtering on another directory, do os.path.isfile(os.path.join(somedir, f))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement