Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """Creates readable text file from SRT file.
- NOTES
- * Run from command line as
- ** python srt_to_txt.py file.srt cp1252
- * Creates file.txt with extracted text from file.srt
- * Script assumes that lines beginning with lowercase letters or commas
- * are part of the previous line and lines beginning with any other character
- * are new lines. This won't always be correct.
- Verbosity: 0 is silent. 2 is after every file. 1 is upon completion only.
- """
- # TODO:
- # test that it still works via argv
- # consider letting them flag that they want the whole directory done, in argv,
- import sys
- import os
- import re
- _FILE = None
- _ENCODING = None
- _ONE_FILE = False
- ##_FILE = 'L1_S1-en.srt'
- _OUT_FORMAT = 3
- _VERBOSITY = 2
- def is_time_stamp(l):
- if l[:2].isnumeric() and l[2] == ':':
- return True
- return False
- def has_letters(line):
- if re.search('[a-zA-Z]', line):
- return True
- return False
- def has_no_text(line):
- l = line.strip()
- if not len(l):
- return True
- if l.isnumeric():
- return True
- if is_time_stamp(l):
- return True
- if l[0] == '(' and l[-1] == ')':
- return True
- if not has_letters(line):
- return True
- return False
- def is_lowercase_letter_or_comma(letter):
- if letter.isalpha() and letter.lower() == letter:
- return True
- if letter == ',':
- return True
- return False
- def clean_up(lines):
- """Get rid of all non-text lines and try to combine text broken into multiple lines.
- """
- new_lines = []
- for line in lines[1:]:
- if has_no_text(line):
- continue
- elif len(new_lines) and is_lowercase_letter_or_comma(line[0]):
- # combine with previous line
- new_lines[-1] = new_lines[-1].strip() ' ' line
- else:
- new_lines.append(line)
- return new_lines
- def main_argv(args):
- """
- args[1]: file name
- args[2]: encoding. Default: utf-8.
- - If you get a lot of [?]s replacing characters,
- - you probably need to change file_encoding to 'cp1252'
- """
- file = args[1]
- file_encoding = 'utf-8' if len(args) < 3 else args[2]
- with open(file, encoding=file_encoding, errors='replace') as f:
- lines = f.readlines()
- new_lines = clean_up(lines)
- new_file = file[:-4] '.txt'
- with open(new_file, 'w') as f:
- for line in new_lines:
- f.write(line)
- def output_all_files(encoding=None, out_format=None):
- """Writes a file *.txt from *.srt for each .srt in the cwd.
- """
- files = scan_cwd_for_srt_files()
- if _VERBOSITY >= 1:
- print('Writing Files:')
- for f in files:
- output_one_file(f, encoding, out_format)
- if _VERBOSITY >= 1:
- print('Done writing all files.')
- def scan_cwd_for_srt_files():
- """Returns a list of the .srt files in the cwd.
- """
- files = [f for f in os.listdir('.') if os.path.isfile(f) and eval("'.srt' in f")]
- return files
- def trunc_after_80(line):
- outlines = []
- while len(line) > 80:
- # is 80 NOT whitespace?
- if line[80].split():
- wordlen = len(line[80:].split()[0])
- outlines.append(line[:80 wordlen] '\n')
- line = line[80 wordlen 1:]
- else:
- outlines.append(line[:80] '\n')
- line = line[81:]
- else:
- outlines.append(line)
- return outlines
- def output_one_file(file, encoding=None, out_format=None):
- """Writes a file *.txt from *.srt, using utf-8 encoding by default.
- out_format:
- 1 = Simple file with one line per entry in the .srt
- 2 = Blank line between every entry
- 3 = Like 2, but with strings split after whitespace on col 80
- """
- file_encoding = encoding
- new_file = file[:-4] '.txt'
- if out_format is None:
- out_format = _OUT_FORMAT
- if file_encoding is None:
- file_encoding = 'utf-8'
- with open(file, encoding=file_encoding, errors='replace') as f:
- lines = f.readlines()
- new_lines = clean_up(lines)
- if out_format >= 2:
- new_lines = [line '\n' for line in new_lines]
- if out_format == 3:
- formatted_lines = []
- for line in new_lines:
- for new in trunc_after_80(line):
- formatted_lines.append(new)
- new_lines = formatted_lines
- with open(new_file, 'w') as f:
- for line in new_lines:
- f.write(line)
- if _VERBOSITY == 2:
- print('Wrote file:', new_file)
- if __name__ == '__main__':
- if len(sys.argv) > 1:
- main_argv(sys.argv)
- else:
- print('\n\n')
- if _ONE_FILE:
- output_one_file(_FILE, encoding=_ENCODING, out_format=_OUT_FORMAT)
- else:
- output_all_files(encoding=_ENCODING, out_format=_OUT_FORMAT)
- print('\n\n')
- ##From stackexchange:
- ##But be careful while applying this to other directory, like
- ##files = [f for f in os.listdir(somedir) if os.path.isfile(f)].
- ##which would not work because f is not a full path but relative to the current dir.
- ##Therefore, for filtering on another directory, do os.path.isfile(os.path.join(somedir, f))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement