# Untitled

# Fixes an srt file so that the subtitle times match the actual dialog times.
# Inputs (command-line arguments, in this order):
#  - the srt file
#  - a file listing (srt time, spoken dialog time) pairs, one per line in
#    the form "HH:MM:SS,mmm --> HH:MM:SS,mmm", that will be used to infer
#    the parameters a and b of the linear adjustment new = old * a + b
# Outputs:
#  - the corrected srt file to standard output
#  - the correction parameters a, b to standard error

import sys
import datetime
import re
import math

class SrtEntry(object):
    """One subtitle cue of an srt file.

    Attributes:
        number: sequence number of the cue (formatted as an integer).
        startTime: time at which the cue appears, in seconds.
        endTime: time at which the cue disappears, in seconds.
        text: the cue text; multi-line text is newline-separated.
    """

    def __init__(self, number, startTime, endTime, text):
        self.number = number
        self.startTime = startTime
        self.endTime = endTime
        self.text = text

    def __repr__(self):
        return "SrtEntry(%d, %lf, %lf, %s)" % (self.number,
                                               self.startTime,
                                               self.endTime,
                                               repr(self.text))

    def __str__(self):
        """Return the cue in srt layout: number, timing line, text."""
        def breakTime(t):
            # Convert to a whole number of milliseconds first.  Truncating
            # the fractional part instead loses a millisecond to float
            # error (1.001 s would print as ",000").
            # The srt timestamp layout has no sign, so times that became
            # negative after adjustment are clamped to zero.
            totalMs = max(0, int(round(t * 1000)))
            s, ms = divmod(totalMs, 1000)
            m, s = divmod(s, 60)
            h, m = divmod(m, 60)
            return [h, m, s, ms]

        args = ([self.number]
                + breakTime(self.startTime)
                + breakTime(self.endTime)
                + [self.text])
        return "%d\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\n%s\n" % tuple(args)

def parseSubtitle(lines):
    """Build an SrtEntry from the lines of a single srt cue.

    Args:
        lines: the non-blank lines of one cue.  lines[0] holds the cue
            number, lines[1] the timing line
            ("HH:MM:SS,mmm --> HH:MM:SS,mmm") and lines[2:] the text.

    Returns:
        An SrtEntry whose times are in seconds and whose text is the
        stripped text lines joined with newlines.

    Raises:
        ValueError: if the timing line is missing or malformed, or if
            the cue number is not an integer.
    """
    # Raw string so the \d escapes reach the regex engine untouched.
    pattern = r'(\d\d):(\d\d):(\d\d),(\d\d\d) --> (\d\d):(\d\d):(\d\d),(\d\d\d)'
    match = re.search(pattern, lines[1]) if len(lines) > 1 else None
    if match is None:
        # Fail with a message that shows the offending cue instead of an
        # AttributeError on None.
        raise ValueError("no valid timing line in subtitle block: %r" % (lines,))
    h1, m1, s1, ms1, h2, m2, s2, ms2 = [int(g) for g in match.groups()]
    startTime = h1 * 3600 + m1 * 60 + s1 + ms1 / 1000.0
    endTime = h2 * 3600 + m2 * 60 + s2 + ms2 / 1000.0
    text = "\n".join([line.strip() for line in lines[2:]])
    return SrtEntry(int(lines[0]), startTime, endTime, text)

def parseFile(fileName):
    """Read an srt file and return its cues as a list of SrtEntry.

    Cues are the runs of non-blank lines in the file; any number of
    blank (whitespace-only) lines separates two cues.

    Args:
        fileName: path of the srt file to read.

    Returns:
        A list with one parsed entry per cue, in file order.
    """
    # The with-block closes the file even if reading fails.
    with open(fileName) as f:
        content = f.readlines()
    subtitles = []
    currentSubtitle = []
    for line in content:
        if line.strip():
            currentSubtitle.append(line)
        elif currentSubtitle:
            # A blank line ends the cue being collected, if any.
            subtitles.append(currentSubtitle)
            currentSubtitle = []
    # The last cue may not be followed by a blank line.
    if currentSubtitle:
        subtitles.append(currentSubtitle)
    return [parseSubtitle(lines) for lines in subtitles]

def parseOneCorrection(line):
    """Parse one correction line into a (srt time, correct time) pair.

    Args:
        line: text containing "HH:MM:SS,mmm --> HH:MM:SS,mmm", where the
            left timestamp is the time found in the srt file and the
            right one is the time the dialog is actually spoken.

    Returns:
        A tuple (srtTime, correctTime), both in seconds.

    Raises:
        ValueError: if the line does not contain such a pair.
    """
    # Raw string so the \d escapes reach the regex engine untouched.
    pattern = r'(\d\d):(\d\d):(\d\d),(\d\d\d) --> (\d\d):(\d\d):(\d\d),(\d\d\d)'
    match = re.search(pattern, line)
    if match is None:
        # Report the offending line instead of failing on None.
        raise ValueError("malformed correction line: %r" % (line,))
    h1, m1, s1, ms1, h2, m2, s2, ms2 = [int(g) for g in match.groups()]
    srtTime = h1 * 3600 + m1 * 60 + s1 + ms1 / 1000.0
    correctTime = h2 * 3600 + m2 * 60 + s2 + ms2 / 1000.0
    return (srtTime, correctTime)

def parseCorrections(fileName):
    """Read the corrections file into a list of (srt time, correct time).

    Every non-blank line is parsed with parseOneCorrection; blank
    (whitespace-only) lines are skipped.

    Args:
        fileName: path of the corrections file.

    Returns:
        A list of (srtTime, correctTime) tuples in file order.
    """
    # The with-block closes the file even if reading fails.
    with open(fileName) as f:
        content = f.readlines()
    # strip must be called: comparing the bound method itself to "" is
    # always true and would let blank lines through to the parser.
    return [parseOneCorrection(line) for line in content if line.strip() != ""]

def leastSquares(corrections):
    """Fit y = a * x + b to the given points by ordinary least squares.

    Args:
        corrections: a list of (x, y) pairs, where x is the time in the
            srt file and y the time the dialog is actually spoken.

    Returns:
        A tuple (a, b): the slope and the offset of the fitted line.
        When the slope cannot be determined (a single pair, or all pairs
        sharing the same x), a is taken to be 1 and b is the mean of
        y - x, i.e. a constant shift.

    Raises:
        ValueError: if corrections is empty.
    """
    if len(corrections) == 0:
        raise ValueError("at least one (srt time, spoken time) pair is required")
    # Float count so that integer inputs do not trigger integer division.
    n = float(len(corrections))
    mean_x = sum(x for (x, _) in corrections) / n
    mean_y = sum(y for (_, y) in corrections) / n
    if len(set(x for (x, _) in corrections)) < 2:
        # Fewer than two distinct x values: the slope is undetermined and
        # the normal equations would divide by zero.  Assume there is no
        # drift and correct with a constant shift only.
        return (1.0, mean_y - mean_x)
    # Centred form of the normal equations; algebraically the same as
    # (n*sum_xy - sum_x*sum_y) / (n*sum_xx - sum_x**2) but less prone to
    # cancellation.
    s_xx = sum((x - mean_x) * (x - mean_x) for (x, _) in corrections)
    s_xy = sum((x - mean_x) * (y - mean_y) for (x, y) in corrections)
    a = s_xy / s_xx
    b = mean_y - a * mean_x
    return (a, b)

def processSub(sub, a, b):
    """Return a copy of sub with both times mapped through t * a + b.

    The number and text are carried over unchanged; sub itself is not
    modified.
    """
    def adjust(t):
        # The linear correction applied to a single timestamp.
        return t * a + b

    newStart = adjust(sub.startTime)
    newEnd = adjust(sub.endTime)
    return SrtEntry(sub.number, newStart, newEnd, sub.text)

if __name__ == "__main__":
    # Both the srt file and the corrections file are required; say so
    # instead of failing with an IndexError traceback.
    if len(sys.argv) < 3:
        sys.stderr.write("usage: %s SRT_FILE CORRECTIONS_FILE\n" % sys.argv[0])
        sys.exit(2)
    subs = parseFile(sys.argv[1])
    corrections = parseCorrections(sys.argv[2])
    a, b = leastSquares(corrections)
    # The fitted parameters go to standard error so that standard output
    # carries only the corrected srt content.
    sys.stderr.write("%lf, %lf\n" % (a, b))
    fixedSubs = [processSub(sub, a, b) for sub in subs]
    for sub in fixedSubs:
        # Same output as the print statement (str(sub) plus a newline),
        # written in a form that does not depend on print syntax.
        sys.stdout.write("%s\n" % sub)