Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# This script converts every .srt subtitle file in `filePath` to a .csv file
# with the same base name. The subtitle number, the complete timestamp, the
# start time, the end time, and the utterance are stored as separate
# variables, one CSV row per variable (the first cell of a row is its name).
# Written for Python 3.
import csv
import os
import string

filePath = "./"

# A cell starting with one of these characters is evaluated by Excel as a
# formula, so such lines get an apostrophe put in front of them.
FORMULA_PREFIXES = ("-", "=", "+", "@")


def parse_srt_lines(lines):
    """Sort the lines of one .srt file into the five CSV rows.

    Args:
        lines: iterable of strings, one per line of the subtitle file
            (an open text file works).

    Returns:
        A list of five lists, in this order: subtitle indices, complete
        timestamps, start times, end times, utterances. Each list starts
        with its header cell. Consecutive text lines of one subtitle are
        joined with a single space into one utterance.
    """
    subtitleIndices = ["index"]
    timestamps = ["timestamp"]  # the complete timestamp of each subtitle
    timeStarts = ["start time"]  # when each subtitle appears on screen
    timeEnds = ["end time"]  # when each subtitle is removed from the screen
    utterances = ["utterance"]

    # Clean every line up front so the loop below can look one line ahead.
    cleaned = []
    for line in lines:
        line = line.strip()
        if line.startswith(FORMULA_PREFIXES):
            line = "'" + line  # prevents Excel from viewing entry as formula
        cleaned.append(line)

    prevLineWasUtt = False  # was the most recent line part of an utterance?
    for position, line in enumerate(cleaned):
        hasNext = position + 1 < len(cleaned)
        nextLine = cleaned[position + 1] if hasNext else ""

        if "-->" in line:
            timestamps.append(line)
            # Keep only HH:MM:SS of each side. Splitting on the arrow rather
            # than slicing at fixed columns tolerates irregular spacing.
            start, _, end = line.partition("-->")
            timeStarts.append(start.strip()[:8])
            timeEnds.append(end.strip()[:8])
            prevLineWasUtt = False
        elif line.isdigit() and "-->" in nextLine:
            # A number is a subtitle index only when a timestamp follows it;
            # otherwise it is subtitle text that happens to be all digits.
            subtitleIndices.append(line)
            prevLineWasUtt = False
        elif line:
            if prevLineWasUtt:
                utterances[-1] = utterances[-1] + " " + line
            else:
                utterances.append(line)
            prevLineWasUtt = True
        else:
            # A blank line ends the current subtitle.
            prevLineWasUtt = False

    return [subtitleIndices, timestamps, timeStarts, timeEnds, utterances]


def convert_srt_file(srtPath, encoding="utf-8-sig"):
    """Write the .csv counterpart of one .srt file next to it.

    Args:
        srtPath: path of the subtitle file to read.
        encoding: text encoding of the subtitle file. The default also
            strips a UTF-8 byte-order mark, which would otherwise keep the
            first subtitle index from being recognised as a number.

    Returns:
        The path of the .csv file that was written.

    Raises:
        OSError: if the subtitle file cannot be read or the .csv written.
        UnicodeDecodeError: if the file is not valid in `encoding`.
    """
    with open(srtPath, encoding=encoding) as srtFile:
        rows = parse_srt_lines(srtFile)

    csvPath = os.path.splitext(srtPath)[0] + ".csv"
    # newline="" lets the csv module control line endings itself.
    with open(csvPath, "w", newline="", encoding="utf-8") as csvFile:
        csv.writer(csvFile, dialect="excel").writerows(rows)
    return csvPath


dirList = os.listdir(filePath)
for filename in dirList:
    srtPath = os.path.join(filePath, filename)
    # skip files other than subtitle files
    if os.path.splitext(filename)[1] != ".srt" or not os.path.isfile(srtPath):
        continue
    convert_srt_file(srtPath)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement