Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import gzip
- import sys
def stripBadText(string):
    """
    Remove markup and non-dialogue annotations from one line of subtitle text.

    Returns "" when ``string`` is None. Otherwise returns the text with
    HTML-style tags, [bracketed] spans and (parenthesised) spans deleted,
    and with "..." and the "\\xe2" character each replaced by one space.
    """
    if string is None:
        return ""
    # No html tags
    string = re.sub(r"<[^>]+>", "", string)
    # People don't talk in [brackets] or (inside parentheses), so strip them.
    string = re.sub(r"\[([^\]]+)\]", "", string)
    string = re.sub(r"\(([^\)]+)\)", "", string)
    # An ellipsis can separate one set of tokens from another, so it becomes
    # a space rather than being deleted outright.
    string = string.replace("...", " ")
    # Music-theme markers are used irregularly, so they are blanked out.
    # NOTE(review): presumably "\xe2" targets the lead byte of a UTF-8
    # encoded music-note character in a byte string; if so the trailing
    # bytes of that character are left behind -- confirm against real data.
    string = string.replace("\xe2", " ")
    return string
class srtBlock(str):
    """
    The cleaned text of one srt block, with the block's metadata attached.

    ``blockArray`` is the block split into lines: element 0 is the block
    number, element 1 is the timestamp line, and every remaining element is
    a line of subtitle text. The text lines are passed through
    ``stripBadText`` and joined with single spaces; that joined text is both
    the value of the string itself and the ``text`` attribute.

    Attributes:
        blocknum:  first line of the block, kept as the raw string.
        timestamp: second line of the block, kept as the raw string.
        text:      the cleaned, joined subtitle text as a plain ``str``.

    The caller's list is not modified.
    """

    # An HH:MM:SS time; anything after the seconds is ignored.
    _TIME_PATTERN = r"(\d\d):(\d\d):(\d\d)"

    def __new__(cls, blockArray):
        # str is immutable, so its value has to be fixed here rather than in
        # __init__; otherwise the value would be the repr of the input list.
        text = " ".join([stripBadText(line) for line in blockArray[2:]])
        return super(srtBlock, cls).__new__(cls, text)

    def __init__(self, blockArray):
        # object.__init__ takes no arguments (a TypeError on Python 3).
        super(srtBlock, self).__init__()
        # Indexing raises IndexError for a block with fewer than two lines.
        self.blocknum = blockArray[0]
        self.timestamp = blockArray[1]
        self.text = str(self)

    def time(self, which="start"):
        """
        Return the block's start or end time in whole seconds.

        ``which`` is "start" (the first time on the timestamp line) or "end"
        (the second one). Returns -1, after writing a message to stderr,
        when the timestamp line does not begin with an HH:MM:SS time or
        does not contain the requested time. Raises ValueError for any other
        value of ``which``.
        """
        if which not in ("start", "end"):
            raise ValueError("which must be 'start' or 'end', not %r" % (which,))
        hits = []
        # The line must *begin* with a time, but the search for the end time
        # must not be anchored or it could never find a second match.
        if re.match(self._TIME_PATTERN, self.timestamp):
            hits = re.findall(self._TIME_PATTERN, self.timestamp)
        index = 0 if which == "start" else 1
        try:
            hours, minutes, seconds = hits[index]
        except IndexError:
            sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
            return -1
        return int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)

    def textContent(self):
        """Return the cleaned subtitle text (same value as ``self.text``)."""
        return self.text
class srtGroup(object):
    """
    A sequence of srt files treated as one continuous stretch of subtitles.

    It is initialized with a list of srt filenames, each of which is loaded
    as an ``srtFile``. ``strings()`` returns the text grouped by second,
    with each file's seconds offset by the lengths of the files before it.
    ``documents()`` groups that text into equal fractions of the whole and
    records which fraction (3rd, 6th, 12th, ...) each group falls in.
    """

    def __init__(self, files):
        self.targets = files
        self.files = [srtFile(name) for name in files]

    def strings(self):
        """
        Return one dict per second that has text, in chronological order.

        Each dict has the keys 'second' (seconds from the start of the
        group) and 'text' (the text of every block starting in that second,
        joined with form feeds). Also sets ``self.totalLength`` to the
        summed length of all files.

        Iterating a file consumes its blocks (see ``srtFile.next``), so a
        second call returns no text.
        """
        # A file with no parseable blocks reports a length of -1; count it
        # as zero so it neither shrinks the total nor shifts later files.
        lengths = [max(srt.totalLength, 0) for srt in self.files]
        self.totalLength = sum(lengths)
        offset = 0
        bySecond = {}
        for srt, length in zip(self.files, lengths):
            for block in srt:
                try:
                    second = offset + block.time()
                except TypeError:
                    sys.stderr.write(
                        "Couldn't place a block from %r (seconds seen so far: %r)\n"
                        % (self.targets, offset)
                    )
                    raise
                if second in bySecond:
                    bySecond[second]['text'] += '\f' + block.text
                else:
                    bySecond[second] = {'second': second, 'text': block.text}
            offset += length
        # Sorted so callers that concatenate the text see it in time order
        # rather than in arbitrary dict order.
        return [bySecond[second] for second in sorted(bySecond)]

    def documents(self, minChunk=120):
        """
        Return the group's text split into equal fractions of its length.

        The group is divided into 3, 6, 12, ... parts, doubling until a part
        would be shorter than ``2 * minChunk`` seconds. Every second of text
        is labelled with its 1-based position at each level (keys named by
        ``nameChunk``), and seconds sharing the same position at the finest
        level are merged into one document, their text joined with form
        feeds. Each returned dict holds 'text' plus the labels, 'second',
        'minute' and 'maxChunk' (the name of the finest level) of the first
        second merged into it. Documents are returned in position order.

        Raises ValueError if ``minChunk`` is not positive (the subdivision
        would never stop) and ZeroDivisionError if the group has text but a
        total length of zero.
        """
        if minChunk <= 0:
            raise ValueError("minChunk must be positive, not %r" % (minChunk,))
        byPosition = {}
        for entry in self.strings():
            parts = 3
            while True:
                # Keep assigning increasingly small chunks (3rds, 6ths,
                # 12ths, ...) until the chunk length falls below the
                # threshold.
                label = nameChunk(parts)
                try:
                    # The 1.000001 factor keeps the very last second from
                    # landing in a chunk numbered parts + 1.
                    entry[label] = int(
                        (entry['second'] * parts) / (self.totalLength * 1.000001)
                    ) + 1
                except ZeroDivisionError:
                    sys.stderr.write(entry['text'] + "\n")
                    raise
                # Times two, because one shorter level is let through.
                if self.totalLength // parts < minChunk * 2:
                    entry['maxChunk'] = label
                    entry['minute'] = entry['second'] // 60
                    break
                parts *= 2
            position = entry[label]
            if position in byPosition:
                byPosition[position]['text'] += "\f" + entry['text']
            else:
                # The first second in a position supplies all its metadata.
                byPosition[position] = dict(entry)
        return [byPosition[position] for position in sorted(byPosition)]
def nameChunk(i):
    """
    Return the label for dividing something into ``i`` parts.

    3 gives "3rd"; every other value gives its string form followed by
    "th" (6 -> "6th", 12 -> "12th").
    """
    return "3rd" if i == 3 else str(i) + "th"
class srtFile(object):
    """
    The parseable blocks of one gzip-compressed srt file.

    Attributes:
        filename:    the path passed to the constructor.
        source:      the gzip file object; it is closed once parsing ends.
        array:       the parsed ``srtBlock`` objects, last block first.
        totalLength: start time, in seconds, of the file's last parseable
                     block, or -1 when no block could be parsed.

    The object is its own iterator and yields blocks in file order.
    Iteration is destructive: each block is removed from ``array`` as it is
    returned, so a file can be iterated only once.
    """

    def __init__(self, filename):
        self.filename = filename
        self.source = gzip.open(filename, "r")
        try:
            self.readBlocks()
        finally:
            # Everything needed is in memory now; don't leak the handle.
            self.source.close()

    def readBlocks(self):
        """
        Read ``self.source`` and fill ``array`` and ``totalLength``.

        Blocks are separated by blank lines. A block is kept only if it has
        more than one line and its start time can be parsed.
        """
        contents = self.source.read().replace("\r\n", "\n")
        self.blocks = [chunk.split("\n") for chunk in contents.split("\n\n")]
        self.array = []
        # Popping from the end stores the blocks in reverse file order, so
        # that next() can in turn pop them off the end in file order.
        while self.blocks:
            lines = self.blocks.pop()
            if len(lines) > 1:
                block = srtBlock(lines)
                if block.time() > -1:
                    self.array.append(block)
        if self.array:
            # array[0] is the last block of the file.
            self.totalLength = self.array[0].time()
        else:
            self.totalLength = -1

    def __iter__(self):
        return self

    def next(self):
        """Remove and return the next block in file order."""
        try:
            return self.array.pop()
        except IndexError:
            raise StopIteration

    # Python 3 spells the iterator method __next__; support both.
    __next__ = next
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement