Philip Jägenstedt

#!/usr/bin/env python

import re
import sys

# One timestamp: hh:mm:ss, then a comma or a period, then three digits of
# milliseconds. Captures hh, mm, ss, the separator character and ms.
timestamp_pattern = r'(\d\d):(\d\d):(\d\d)([,.])(\d\d\d)'
# A full timing line "start --> end" plus whatever trails it. Because each
# timestamp is wrapped in its own group, the group layout is:
#   1 = whole start timestamp, 2-4 = hh/mm/ss, 5 = separator, 6 = ms,
#   7 = whole end timestamp, 8-10 = hh/mm/ss, 11 = separator, 12 = ms,
#   13 = the rest of the line after the end timestamp.
timerange_pattern = '(' + timestamp_pattern + r') --> (' + timestamp_pattern + r')(.*)'

timerange_re = re.compile(timerange_pattern)

# An opening tag without attributes, e.g. <i>; captures the tag name.
tag_re = re.compile(r'<(\w+)>')

# Leading whitespace followed by "X1:n X2:n Y1:n Y2:n" coordinates; matched
# against the text trailing a timing line.
subrip_re = re.compile(r'\s+X1:\d+ X2:\d+ Y1:\d+ Y2:\d+\s*')

# Name of the file currently being processed. main() rebinds it for every
# input file so that log() can prefix its messages with it.
current_file = None


def log(msg):
    """Print *msg* prefixed with the name of the file being processed."""
    # Parenthesised single-argument form: parsed as a print statement on
    # Python 2 and as a function call on Python 3, with identical output.
    print("%s: %s" % (current_file, msg))

def get_time(hh, mm, ss, ms):
    """Convert the fields of a timestamp into seconds.

    Each argument is converted with int(). Returns the time as a float
    number of seconds, or None when the minutes or seconds field exceeds 59.
    The hours field is not range-checked.
    """
    hours, minutes, seconds, millis = [int(part) for part in (hh, mm, ss, ms)]

    if not (minutes <= 59 and seconds <= 59):
        return None

    return hours*3600 + minutes*60 + seconds + millis/1000.0

def overlaps(start1, end1, start2, end2):
    """Return True if the intervals (start1, end1) and (start2, end2) overlap.

    Both intervals must be non-empty (start strictly before end); this is
    checked with assertions. Intervals that merely touch do not overlap.
    """
    assert start1 < end1
    assert start2 < end2
    latest_start = max(start1, start2)
    earliest_end = min(end1, end2)
    return latest_start < earliest_end

def parse(text):
    """Split *text* into a list of cue dicts.

    A cue starts at every line matching timerange_re and ends at the next
    empty line, the next timing line, or the end of the text. Each cue has:

      'lineno'        1-based line number of the timing line
      'unterminated'  True when this timing line appeared while the
                      preceding cue was still open (no blank line between)
      'id'            the stripped line just before the timing line
      'start', 'end'  seconds as returned by get_time() (may be None)
      'separator'     the two millisecond separator characters, start
                      then end
      'settings'      whatever follows the end timestamp on the timing line
      'content'       the cue's text lines, each followed by '\\n'
    """
    cues = []
    current = None   # cue being accumulated, or None between cues
    previous = ''    # the line preceding the one being examined
    for number, line in enumerate(text.splitlines(), 1):
        match = timerange_re.match(line)
        if match:
            # A timing line while a cue is still open means no blank line
            # closed the preceding cue; flush it and flag the new one.
            was_open = current is not None
            if was_open:
                cues.append(current)
            current = {
                'lineno': number,
                'unterminated': was_open,
                'id': previous.strip(),
                'start': get_time(*match.group(2, 3, 4, 6)),
                'end': get_time(*match.group(8, 9, 10, 12)),
                'separator': match.group(5) + match.group(11),
                'settings': match.group(13),
                'content': '',
            }
        elif current is not None:
            if line:
                current['content'] += line + '\n'
            else:
                # properly terminated cue
                cues.append(current)
                current = None
        previous = line
    # A cue still open at the end of the text is kept as well.
    if current is not None:
        cues.append(current)
    return cues

def analyze(cues):
    """Collect statistics about the cues of one file.

    *cues* is a list of cue dicts as produced by parse(). Only the content
    analysis and the count of unterminated cues are currently active: the
    id, timing, settings and separator analyses are defined below, but
    their calls are commented out, so their counters stay at zero.

    Returns a dict with the keys 'cues' (number of cues), 'unterminated',
    'id', 'content', 'settings' and 'timing'; the last four are dicts of
    counters.
    """
    id_ = {'non-digit': 0, 'out-of-order': 0}
    content = {'tags': {},
               'have-markup': 0,
               'font': 0,
               '-->': 0}
    settings = {'trailing': 0, 'subrip': 0}
    timing = {'invalid': 0,
              'empty': 0,
              'negative': 0,
              'overlapping': 0}
    unterminated = 0

    def analyze_id(prev_cue, cue):
        """Count ids that are not numeric or do not follow the previous id."""
        if not cue['id'].isdigit():
            #log("non-digit id: %s (line %d)" % (cue['id'], cue['lineno']))
            id_['non-digit'] += 1
            return
        try:
            prev_id = int(prev_cue['id'])
            this_id = int(cue['id'])
        except (TypeError, ValueError):
            # TypeError: there is no previous cue (prev_cue is None).
            # ValueError: one of the ids cannot be parsed as an integer.
            # Either way there is nothing to compare.
            return
        if prev_id + 1 != this_id:
            #log('out of order ids: %d, %d' % (prev_id, this_id))
            id_['out-of-order'] += 1

    def analyze_timing(prev_cue, cue):
        """Classify a cue's time range and check it against the previous cue."""
        start = cue['start']
        end = cue['end']
        if start is None or end is None:
            timing['invalid'] += 1
            return
        if start > end:
            timing['negative'] += 1
        elif start == end:
            timing['empty'] += 1
        elif prev_cue:
            prev_start = prev_cue['start']
            prev_end = prev_cue['end']
            if (prev_start is not None and prev_end is not None
                    and prev_start < prev_end):
                # this only detects overlap between two
                # adjacent cues, results questionable
                timing['overlapping'] += int(overlaps(prev_start, prev_end, start, end))

    def analyze_settings(text):
        """Count non-blank text trailing a timing line; logs each one."""
        if text and not text.isspace():
            log('trailing: '+text)
            settings['trailing'] += 1
            if subrip_re.match(text):
                settings['subrip'] += 1

    def analyze_content(text):
        """Count markup tags, '<font' and '-->' occurrences in cue text."""
        tags = content['tags']
        for m in tag_re.finditer(text):
            tag = str(m.group(1))
            tags[tag] = tags.get(tag, 0) + 1
            content['have-markup'] += 1
        # tag_re only matches tags without attributes, so <font ...> is
        # detected separately; it is counted once per cue.
        if '<font' in text:
            content['font'] += 1
            content['have-markup'] += 1

        if '-->' in text:
            #log('--> in content: '+text)
            content['-->'] += 1

    def analyze_separator(cues):
        """Record which millisecond separator(s) the timing lines use."""
        separators = [cue['separator'] for cue in cues]
        period = any('.' in sep for sep in separators)
        comma = any(',' in sep for sep in separators)
        if period and comma:
            timing['separator'] = 'mixed'
        elif period:
            timing['separator'] = 'period'
        else:
            timing['separator'] = 'comma'

    prev_cue = None
    for cue in cues:

        # Disabled analyses; enabling them changes the reported statistics.
        #analyze_id(prev_cue, cue)
        #analyze_timing(prev_cue, cue)
        #analyze_settings(cue['settings'])
        analyze_content(cue['content'])

        unterminated += cue['unterminated']

        prev_cue = cue

    #analyze_separator(cues)

    return {'cues': len(cues),
            'unterminated': unterminated,
            'id': id_,
            'content': content,
            'settings': settings,
            'timing': timing}

def merge_dict(dst, src):
    """Fold the per-file results *src* into the running summary *dst*.

    *dst* is modified in place. For every key of *src*:

    * a dict value is merged recursively into dst[key];
    * a str value is tallied: dst[key] maps each distinct string to the
      number of times it has been seen;
    * any other value adds one to dst[key] when it is greater than zero,
      so dst[key] counts the merged results where the value was positive.
    """
    for key, value in src.items():
        if isinstance(value, dict):
            merge_dict(dst.setdefault(key, {}), value)
        elif isinstance(value, str):
            tally = dst.setdefault(key, {})
            tally[value] = tally.get(value, 0) + 1
        else:
            dst.setdefault(key, 0)
            dst[key] += value > 0

def main(args):
    """Analyze every file named in *args* and print the merged summary.

    Each file is read as bytes, decoded as ASCII or, failing that, UTF-8
    (otherwise it is left undecoded and its encoding is recorded as
    'unknown'), stripped of a leading BOM, parsed into cues and analyzed.
    The per-file results are folded into one summary dict with
    merge_dict(), and the summary is printed at the end.
    """
    global current_file
    summary = {}
    for arg in args:
        current_file = arg
        # Read the raw bytes and close the handle as soon as we are done,
        # rather than leaving that to garbage collection.
        with open(arg, 'rb') as f:
            text = f.read()
        # try some encodings
        encoding = 'unknown'
        for e in ['ascii', 'utf-8']:
            try:
                text = text.decode(e)
                encoding = e
                break
            except UnicodeDecodeError:
                pass
        # get rid of the BOM; slicing (rather than indexing) keeps an
        # empty file from raising IndexError
        if text[:1] == u'\ufeff':
            text = text[1:]
        cues = parse(text)
        if not cues:
            log('no cues found, not SRT?')
        results = analyze(cues)
        results['encoding'] = {encoding: 1}
        merge_dict(summary, results)

    # Parenthesised single-argument form works as a print statement on
    # Python 2 and as a function call on Python 3.
    print(summary)

if __name__ == '__main__':
    main(sys.argv[1:])