Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import re
- import sys
# One timestamp: HH:MM:SS, then ',' or '.', then three millisecond digits.
# Every field (and the separator character) is captured in its own group.
timestamp_pattern = r'(\d\d):(\d\d):(\d\d)([,.])(\d\d\d)'

# A cue timing line: "<start> --> <end><rest of line>".  Group layout:
#   1     whole start timestamp      7      whole end timestamp
#   2-4   start hh, mm, ss           8-10   end hh, mm, ss
#   5     start separator            11     end separator
#   6     start milliseconds         12     end milliseconds
#   13    whatever follows the end timestamp on the same line
timerange_pattern = ''.join(
    ['(', timestamp_pattern, r') --> (', timestamp_pattern, r')(.*)'])
timerange_re = re.compile(timerange_pattern)

# An opening tag with no attributes, e.g. "<i>"; group 1 is the tag name.
tag_re = re.compile(r'<(\w+)>')

# "X1:n X2:n Y1:n Y2:n" coordinates preceded by whitespace; used to count
# the 'subrip' kind of trailing text on a timing line.
subrip_re = re.compile(r'\s+X1:\d+ X2:\d+ Y1:\d+ Y2:\d+\s*')

# Name of the file currently being processed; set by main(), read by log().
current_file = None
def log(msg):
    """Print *msg* to stdout, prefixed with the file being processed.

    The prefix is the module-level ``current_file`` (``None`` until it has
    been assigned), followed by ``": "``.
    """
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("%s: %s" % (current_file, msg))
def get_time(hh, mm, ss, ms):
    """Convert the four fields of a timestamp into seconds.

    Each argument is converted with ``int()``.  Returns the time as a
    float number of seconds, or ``None`` when the minutes or the seconds
    field is greater than 59.
    """
    hours, minutes, seconds, millis = [
        int(field) for field in (hh, mm, ss, ms)]
    if minutes <= 59 and seconds <= 59:
        return hours * 3600 + minutes * 60 + seconds + millis / 1000.0
    # Out-of-range minutes/seconds: the timestamp is not valid.
    return None
def overlaps(start1, end1, start2, end2):
    """Return True when the two intervals share more than a boundary point.

    Both intervals must be non-empty (start strictly before end); this is
    checked with assertions.  Intervals that merely touch do not overlap.
    """
    assert start1 < end1
    assert start2 < end2
    latest_start = max(start1, start2)
    earliest_end = min(end1, end2)
    return latest_start < earliest_end
def parse(text):
    """Split subtitle text into a list of cue dicts.

    A cue starts at every line matching ``timerange_re`` and ends at the
    next empty line, the next timing line, or the end of the text.

    Each cue dict has the keys:
      'lineno'        1-based line number of the timing line
      'unterminated'  True when the cue before this one was still open
                      (no empty line seen) when this timing line was found
      'id'            the line just before the timing line, stripped
      'start', 'end'  results of get_time() for the two timestamps
      'separator'     the two millisecond separator characters, start then end
      'settings'      text following the end timestamp on the timing line
      'content'       the cue's text lines, each followed by a newline
    """
    cues = []
    cue = None
    prev_line = ''
    # enumerate(..., 1) gives the 1-based line number directly.
    for lineno, line in enumerate(text.splitlines(), 1):
        m = timerange_re.match(line)
        if m:
            # A timing line while a cue is still open: the open cue was
            # never closed by an empty line, so store it as-is.
            # NOTE(review): in that case prev_line (used as this cue's id)
            # has normally already been appended to the open cue's content.
            unterminated = cue is not None
            if unterminated:
                cues.append(cue)
            cue = {
                'lineno': lineno,
                'unterminated': unterminated,
                'id': prev_line.strip(),
                'start': get_time(m.group(2), m.group(3),
                                  m.group(4), m.group(6)),
                'end': get_time(m.group(8), m.group(9),
                                m.group(10), m.group(12)),
                'separator': m.group(5) + m.group(11),
                'settings': m.group(13),
                'content': '',
            }
        elif cue is not None:
            if line == '':
                # properly terminated cue
                cues.append(cue)
                cue = None
            else:
                cue['content'] += line + '\n'
        prev_line = line
    # A cue still open at the end of the text is kept as well.
    if cue is not None:
        cues.append(cue)
    return cues
def analyze(cues):
    """Tally format quirks over the cues of one file.

    *cues* is a list of cue dicts as built by parse().  Returns a dict
    with the keys 'cues' (number of cues), 'unterminated', 'id',
    'content', 'settings' and 'timing'; the last four are dicts of
    counters.

    Only the content pass and the unterminated count are currently
    enabled.  The id, timing, settings and separator passes are defined
    but their calls are commented out, so their counters stay at zero.
    """
    id_ = {'non-digit': 0, 'out-of-order': 0}
    content = {'tags': {},
               'have-markup': 0,
               'font': 0,
               '-->': 0}
    settings = {'trailing': 0, 'subrip': 0}
    timing = {'invalid': 0,
              'empty': 0,
              'negative': 0,
              'overlapping': 0}
    unterminated = 0

    def analyze_id(prev_cue, cue):
        # Count ids that are not all digits, and digit ids that are not
        # exactly one more than the previous cue's id.
        if not cue['id'].isdigit():
            id_['non-digit'] += 1
            return
        try:
            prev_id = int(prev_cue['id'])
            this_id = int(cue['id'])
        except (TypeError, ValueError):
            # TypeError: there is no previous cue (prev_cue is None).
            # ValueError: an id that int() cannot convert.
            # Either way there is nothing to compare, so skip silently.
            return
        if prev_id + 1 != this_id:
            id_['out-of-order'] += 1

    def analyze_timing(prev_cue, cue):
        # Classify a cue's time range; a None start/end means get_time()
        # rejected the timestamp.
        start = cue['start']
        end = cue['end']
        if start is None or end is None:
            timing['invalid'] += 1
            return
        if start > end:
            timing['negative'] += 1
        elif start == end:
            timing['empty'] += 1
        elif prev_cue:
            prev_start = prev_cue['start']
            prev_end = prev_cue['end']
            if (prev_start is not None and prev_end is not None
                    and prev_start < prev_end):
                # this only detects overlap between two
                # adjacent cues, results questionable
                timing['overlapping'] += int(
                    overlaps(prev_start, prev_end, start, end))

    def analyze_settings(text):
        # Count (and log) non-blank text after the end timestamp, and how
        # often that text matches subrip_re.
        if text and not text.isspace():
            log('trailing: ' + text)
            settings['trailing'] += 1
            if subrip_re.match(text):
                settings['subrip'] += 1

    def analyze_content(text):
        # Count attribute-less opening tags per tag name; every match
        # also counts as one unit of markup.
        tags = content['tags']
        for m in tag_re.finditer(text):
            tag = str(m.group(1))
            tags[tag] = tags.get(tag, 0) + 1
            content['have-markup'] += 1
        # "<font" is checked separately: tag_re does not match tags that
        # carry attributes.
        if '<font' in text:
            content['font'] += 1
            content['have-markup'] += 1
        if '-->' in text:
            content['-->'] += 1

    def analyze_separator(cues):
        # Record whether the timestamps use '.', ',' or both; with no
        # period seen (including no cues at all) the result is 'comma'.
        period = any('.' in cue['separator'] for cue in cues)
        comma = any(',' in cue['separator'] for cue in cues)
        if period and comma:
            timing['separator'] = 'mixed'
        elif period:
            timing['separator'] = 'period'
        else:
            timing['separator'] = 'comma'

    prev_cue = None
    for cue in cues:
        # The following passes are intentionally left disabled.
        #analyze_id(prev_cue, cue)
        #analyze_timing(prev_cue, cue)
        #analyze_settings(cue['settings'])
        analyze_content(cue['content'])
        unterminated += cue['unterminated']
        prev_cue = cue
    #analyze_separator(cues)

    return {'cues': len(cues),
            'unterminated': unterminated,
            'id': id_,
            'content': content,
            'settings': settings,
            'timing': timing}
def merge_dict(dst, src):
    """Fold one result dict *src* into the running summary *dst*, in place.

    For each key of *src*, depending on the value's type:
      dict   merged recursively into ``dst[key]``
      str    ``dst[key]`` becomes a histogram; the entry for that string
             is incremented by one
      other  ``dst[key]`` is incremented by one when the value is
             greater than zero (it counts positive occurrences, it does
             not sum the values)
    """
    for key, value in src.items():
        if isinstance(value, dict):
            merge_dict(dst.setdefault(key, {}), value)
        elif isinstance(value, str):
            histogram = dst.setdefault(key, {})
            histogram[value] = histogram.get(value, 0) + 1
        else:
            dst[key] = dst.get(key, 0) + (value > 0)
def main(args):
    """Analyze every file named in *args* and print the merged summary.

    For each path: read the file, decode it as ASCII or else UTF-8, strip
    a leading BOM, parse it into cues, analyze them, and merge the result
    (plus the detected encoding) into one summary dict.  The summary is
    printed to stdout after all files have been processed.

    Sets the module-level ``current_file`` so log() can prefix messages.
    """
    global current_file
    summary = {}
    for arg in args:
        current_file = arg
        # Read raw bytes so they can be decoded below; the context
        # manager closes the handle even if reading fails.
        with open(arg, 'rb') as srt_file:
            text = srt_file.read()
        # try some encodings
        encoding = 'unknown'
        for e in ['ascii', 'utf-8']:
            try:
                text = text.decode(e)
            except UnicodeDecodeError:
                continue
            encoding = e
            break
        else:
            # Neither encoding fits.  Still hand text (not bytes) to the
            # str-based regexes: undecodable bytes become U+FFFD, which
            # leaves the ASCII structure of the file untouched.  The
            # encoding is still reported as 'unknown'.
            text = text.decode('utf-8', 'replace')
        # get rid of the BOM (slice, so an empty file does not raise)
        if text[:1] == u'\ufeff':
            text = text[1:]
        cues = parse(text)
        if not cues:
            log('no cues found, not SRT?')
        results = analyze(cues)
        results['encoding'] = {encoding: 1}
        merge_dict(summary, results)
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print(summary)
# Script entry point: every command-line argument is treated as a file to
# analyze.  Nothing runs when the module is imported.
if __name__ == "__main__":
    main(sys.argv[1:])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement