Advertisement
Guest User

Philip Jägenstedt

a guest
Aug 25th, 2010
173
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.31 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import re
  4. import sys
  5.  
  6. timestamp_pattern = r'(\d\d):(\d\d):(\d\d)([,.])(\d\d\d)'
  7. timerange_pattern = '(' + timestamp_pattern + r') --> (' + timestamp_pattern + r')(.*)'
  8.  
  9. timerange_re = re.compile(timerange_pattern)
  10.  
  11. tag_re = re.compile(r'<(\w+)>')
  12.  
  13. subrip_re = re.compile(r'\s+X1:\d+ X2:\d+ Y1:\d+ Y2:\d+\s*')
  14.  
  15. current_file = None
  16. def log(msg):
  17.     print "%s: %s" % (current_file, msg)
  18.  
  19. def get_time(hh, mm, ss, ms):
  20.     hh = int(hh)
  21.     mm = int(mm)
  22.     ss = int(ss)
  23.     ms = int(ms)
  24.  
  25.     if mm > 59 or ss > 59:
  26.         return None
  27.  
  28.     return hh*3600+mm*60+ss+ms/1000.0
  29.  
  30. def overlaps(start1, end1, start2, end2):
  31.     assert start1 < end1
  32.     assert start2 < end2
  33.     return max(start1, start2) < min(end1, end2)
  34.  
  35. def parse(text):
  36.     lines = text.splitlines()
  37.     cues = []
  38.     cue = None
  39.     prev_line = ''
  40.     for line,lineno in zip(lines,range(1,len(lines)+1)):
  41.         m = timerange_re.match(line)
  42.         if (m):
  43.             unterminated = (cue != None)
  44.             if unterminated:
  45.                 cues.append(cue)
  46.             cue = {}
  47.             cue['lineno'] = lineno
  48.             cue['unterminated'] = unterminated
  49.             cue['id'] = prev_line.strip()
  50.             cue['start'] = get_time(m.group(2), m.group(3), m.group(4), m.group(6))
  51.             cue['end'] = get_time(m.group(8), m.group(9), m.group(10), m.group(12))
  52.             cue['separator'] = m.group(5) + m.group(11)
  53.             cue['settings'] = m.group(13)
  54.             cue['content'] = ''
  55.         elif cue != None:
  56.             if line == '':
  57.                 # properly terminated cue
  58.                 cues.append(cue)
  59.                 cue = None
  60.             else:
  61.                 cue['content'] += line + '\n'
  62.         prev_line = line
  63.     if cue != None:
  64.         cues.append(cue)
  65.     return cues
  66.  
  67. def analyze(cues):
  68.     id_ = {'non-digit': 0, 'out-of-order': 0}
  69.     content = {'tags': {},
  70.                'have-markup': 0,
  71.                'font': 0,
  72.                '-->': 0}
  73.     settings = {'trailing': 0, 'subrip': 0}
  74.     timing = {'invalid':0,
  75.               'empty': 0,
  76.               'negative': 0,
  77.               'overlapping': 0}
  78.     unterminated = 0
  79.  
  80.     def analyze_id(prev_cue, cue):
  81.         if not cue['id'].isdigit():
  82.             #log("non-digit id: %s (line %d)" % (cue['id'], cue['lineno']))
  83.             id_['non-digit'] += 1
  84.         else:
  85.             try:
  86.                 prev_id = int(prev_cue['id'])
  87.                 this_id = int(cue['id'])
  88.                 if prev_id+1 != this_id:
  89.                     #log('out of order ids: %d, %d' % (prev_id, this_id))
  90.                     id_['out-of-order'] += 1
  91.             except:
  92.                 pass
  93.  
  94.     def analyze_timing(prev_cue, cue):
  95.         start = cue['start']
  96.         end = cue['end']
  97.         if start != None and end != None:
  98.             if start > end:
  99.                 timing['negative'] += 1
  100.             elif start == end:
  101.                 timing['empty'] += 1
  102.             elif prev_cue:
  103.                 prev_start = prev_cue['start']
  104.                 prev_end = prev_cue['end']
  105.                 if prev_start != None and prev_end != None and prev_start < prev_end:
  106.                     # this only detects overlap between two
  107.                     # adjacent cues, results questionable
  108.                     timing['overlapping'] += int(overlaps(prev_start, prev_end, start, end))
  109.         else:
  110.             timing['invalid'] += 1
  111.  
  112.     def analyze_settings(text):
  113.         if text and not text.isspace():
  114.             log('trailing: '+text)
  115.             settings['trailing'] += 1
  116.             if subrip_re.match(text):
  117.                 settings['subrip'] += 1
  118.  
  119.     def analyze_content(text):
  120.         for m in tag_re.finditer(text):
  121.             tag = str(m.group(1))
  122.             if tag in content['tags']:
  123.                 content['tags'][tag] += 1
  124.             else:
  125.                 content['tags'][tag] = 1
  126.             content['have-markup'] += 1
  127.         if '<font' in text:
  128.             content['font'] += 1
  129.             content['have-markup'] += 1
  130.  
  131.         if '-->' in text:
  132.             #log('--> in content: '+text)
  133.             content['-->'] += 1
  134.  
  135.     def analyze_separator(cues):
  136.         period = False
  137.         comma = False
  138.         for sep in (cue['separator'] for cue in cues):
  139.             period |= ('.' in sep)
  140.             comma |= (',' in sep)
  141.         if period and comma:
  142.             timing['separator'] = 'mixed'
  143.         elif period:
  144.             timing['separator'] = 'period'
  145.         else:
  146.             timing['separator'] = 'comma'
  147.  
  148.     prev_cue = None
  149.     for cue in cues:
  150.  
  151.         #analyze_id(prev_cue, cue)
  152.         #analyze_timing(prev_cue, cue)
  153.         #analyze_settings(cue['settings'])
  154.         analyze_content(cue['content'])
  155.  
  156.         unterminated += cue['unterminated']
  157.  
  158.         prev_cue = cue
  159.  
  160.     #analyze_separator(cues)
  161.  
  162.     return {'cues': len(cues),
  163.             'unterminated': unterminated,
  164.             'id': id_,
  165.             'content': content,
  166.             'settings': settings,
  167.             'timing': timing}
  168.  
  169. def merge_dict(dst, src):
  170.     for k, v in src.items():
  171.         if isinstance(v, dict):
  172.             if k not in dst:
  173.                 dst[k] = {}
  174.             merge_dict(dst[k], v)
  175.         elif isinstance(v, str):
  176.             if k not in dst:
  177.                 dst[k] = {}
  178.             if v in dst[k]:
  179.                 dst[k][v] += 1
  180.             else:
  181.                 dst[k][v] = 1
  182.             pass
  183.         else:
  184.             if k not in dst:
  185.                 dst[k] = 0
  186.             dst[k] += v > 0
  187.  
  188. def main(args):
  189.     global current_file
  190.     summary = {}
  191.     for arg in args:
  192.         current_file = arg
  193.         text = file(arg).read()
  194.         # try some encodings
  195.         encoding = 'unknown'
  196.         for e in ['ascii', 'utf-8']:
  197.             try:
  198.                 text = text.decode(e)
  199.                 encoding = e
  200.                 break
  201.             except UnicodeDecodeError:
  202.                 pass
  203.         # get rid of the BOM
  204.         if text[0] == u'\ufeff':
  205.             text = text[1:]
  206.         cues = parse(text)
  207.         if len(cues) == 0:
  208.             log('no cues found, not SRT?')
  209.         results = analyze(cues)
  210.         results['encoding'] = {encoding: 1}
  211.         merge_dict(summary, results)
  212.  
  213.     print summary
  214.  
  215. if __name__ == '__main__':
  216.     main(sys.argv[1:])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement