# Untitled

import re
import gzip
import sys

def stripBadText(string):
    """
    Remove markup and non-spoken annotations from one line of subtitle text.

    In order, this deletes anything shaped like an HTML tag, deletes
    non-empty [bracketed] and (parenthesised) spans, turns each "..." into a
    single space, and turns each occurrence of the single character with
    code 0xE2 into a space.

    string: the line to clean, or None.
    Returns the cleaned string; None is mapped to the empty string.
    """
    # Identity test: None is a singleton, and "==" could be hijacked by an
    # object with a permissive __eq__.
    if string is None:
        return ""
    # No html tags
    string = re.sub(r"<[^>]+>","",string)
    # People don't talk in [brackets] or (inside parentheses), so strip them.
    string = re.sub(r"\[([^\]]+)\]","",string)
    string = re.sub(r"\(([^\)]+)\)","",string)
    # Ellipses can be used to separate one set of tokens from another.
    string = string.replace("..."," ")
    # Music-cue markers are used irregularly, so blank them out.
    # NOTE(review): only the one character "\xe2" is replaced. On byte
    # strings that is presumably the first byte of a multi-byte UTF-8 symbol,
    # which would leave its trailing bytes in the text - confirm this is
    # acceptable downstream.
    string = string.replace("\xe2"," ")
    return string

class srtBlock(str):
    """
    One subtitle block from an srt file, usable as a plain string.

    Built from the list of lines that make up one block: element 0 is kept
    as ``blocknum``, element 1 as ``timestamp``, and every remaining line is
    cleaned with stripBadText() and joined with single spaces. That cleaned
    text is both the string value of the instance and its ``text``
    attribute. The list passed in is not modified.

    Raises IndexError if the list has fewer than two elements.
    """

    # One HH:MM:SS clock reading; whatever follows the seconds is ignored.
    _clockPattern = re.compile(r"(\d\d):(\d\d):(\d\d)")

    def __new__(cls, blockArray):
        # str is immutable, so the string value has to be chosen here;
        # assigning it from __init__ has no effect on the value.
        text = " ".join([stripBadText(line) for line in blockArray[2:]])
        return super(srtBlock, cls).__new__(cls, text)

    def __init__(self, blockArray):
        # No arguments are forwarded: object.__init__ rejects extra
        # arguments on Python 3.
        super(srtBlock, self).__init__()
        self.blocknum = blockArray[0]
        self.timestamp = blockArray[1]
        # Plain-str copy of the value built in __new__.
        self.text = str(self)

    def time(self,which="start"):
        """
        Return one end of the block's time range in whole seconds.

        which: "start" for the first clock reading on the timestamp line,
            "end" for the second one.

        Returns hours*3600 + minutes*60 + seconds. If the timestamp line
        does not begin with a clock reading, or the requested reading is
        missing, a message is written to stderr and -1 is returned.
        Raises ValueError for any other value of ``which``.
        """
        if which == "start":
            index = 0
        elif which == "end":
            index = 1
        else:
            raise ValueError("which must be 'start' or 'end', got %r" % (which,))

        # The search is unanchored so the reading after the arrow is found
        # too; the separate match() keeps the requirement that the line
        # begins with the start time.
        hits = self._clockPattern.findall(self.timestamp)
        if self._clockPattern.match(self.timestamp) is None or len(hits) <= index:
            sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
            return -1

        hours, minutes, seconds = hits[index]
        return int(hours)*60*60 + int(minutes)*60 + int(seconds)

    def textContent(self):
        """Return the cleaned text of the block (same as ``text``)."""
        return self.text

class srtGroup(object):

    """
    A sequence of srt files treated as one continuous timeline.

    Initialized with a list of filenames; each is loaded as an srtFile.
    strings() merges the blocks into one dict per second of the combined
    timeline, and documents() groups those into larger chunks tagged with
    their position (3rds, 6ths, 12ths, ...) and minute.
    """

    def __init__(self,files):
        self.targets = files
        self.files = [srtFile(name) for name in files]

    def strings(self):
        """
        Return one dict per distinct second at which a block starts.

        Each dict has 'second' (the block's start time plus the summed
        lengths of all earlier files) and 'text' (the text of every block
        starting at that second, joined with form feeds). The list is sorted
        by 'second'. As a side effect, self.totalLength is set to the summed
        length of all files.
        """
        # srtFile reports totalLength == -1 when it found no usable block;
        # count such a file as zero seconds so it neither shrinks the total
        # nor shifts the files that follow it.
        lengths = [max(srt.totalLength, 0) for srt in self.files]
        self.totalLength = sum(lengths)

        seenSoFar = 0
        chunks = dict()
        for srt, length in zip(self.files, lengths):
            for block in srt:
                try:
                    key = seenSoFar + block.time()
                except TypeError:
                    # Diagnostics go to stderr, like srtBlock.time().
                    sys.stderr.write("%s\n%s\n%s\n" % (self.targets, seenSoFar, block.time()))
                    raise
                entry = chunks.get(key)
                if entry is None:
                    chunks[key] = {'second': key, 'text': block.text}
                else:
                    entry['text'] = entry['text'] + '\f' + block.text
            seenSoFar = seenSoFar + length
        return [chunks[key] for key in sorted(chunks)]

    def documents(self,minChunk=120):
        """
        Group the per-second entries from strings() into coarse documents.

        For every entry, a 1-based chunk index is stored for successively
        finer splits of the timeline (i = 3, 6, 12, ...) under the key
        nameChunk(i). Refinement stops at the first i for which
        totalLength // i is below 2 * minChunk; that key is recorded as
        'maxChunk', and 'minute' is set to second // 60. Entries sharing the
        finest chunk index are merged: their text is joined with form feeds
        and the remaining keys are taken from the last entry merged.

        minChunk: approximate lower bound, in seconds, on chunk length.

        Returns a list of dicts sorted by finest chunk index.
        Raises ValueError if minChunk is not positive (the refinement loop
        would never stop), and ZeroDivisionError if there are entries but
        the total length is zero.
        """
        if minChunk <= 0:
            raise ValueError("minChunk must be positive, got %r" % (minChunk,))

        chunks = dict()
        for string in self.strings():
            i = 3
            while True:
                # Assign increasingly small chunks (3rds, 6ths, 12ths, ...)
                # until the chunk length falls below the threshold.
                key = nameChunk(i)
                try:
                    # The 1.000001 factor keeps second == totalLength in
                    # chunk i instead of spilling into chunk i + 1.
                    string[key] = int((string['second']*i)/(self.totalLength*1.000001)) + 1
                except ZeroDivisionError:
                    sys.stderr.write(string['text'] + "\n")
                    raise
                # Times two, because one extra level of splitting is allowed
                # to slip through.
                if (self.totalLength // i) < (minChunk*2):
                    string['maxChunk'] = key
                    string['minute'] = int(string['second'] // 60)
                    break
                i = i*2

            # Group by the index at the finest granularity reached.
            key = string[key]
            entry = chunks.get(key)
            if entry is None:
                chunks[key] = {'text': string['text']}
            else:
                entry['text'] = entry['text'] + "\f" + string['text']
            for key2 in string.keys():
                if key2 != "text":
                    chunks[key][key2] = string[key2]
            # NOTE(review): 'second' has already been copied into the chunk
            # above, so this only trims the per-second dict - confirm
            # whether the chunk was meant to lose it as well.
            del string['second']

        return [chunks[key] for key in sorted(chunks)]

def nameChunk(i):
    """
    Return the label used as a dict key for an i-way split.

    A value equal to 3 gives "3rd"; anything else gives str(i) followed by
    "th" (so 6 -> "6th", 12 -> "12th").
    """
    if i != 3:
        return "".join([str(i), "th"])
    return "3rd"

class srtFile(object):
    """
    The parsed contents of one gzip-compressed srt file.

    The file is read completely on construction and the handle is closed
    again. Attributes set by readBlocks():

    array: the srtBlock objects whose start time could be parsed, stored in
        reverse file order (last block first).
    totalLength: start time in seconds of the last parseable block in the
        file, or -1 if there is none.

    Iterating over the object yields the blocks in file order and can be
    repeated. next() is kept for callers that use it directly; it removes
    and returns the next block in file order.
    """

    def __init__(self,filename):
        self.filename=filename
        self.source = gzip.open(filename,"r")
        try:
            self.readBlocks()
        finally:
            # Everything is read eagerly, so do not keep the handle open.
            self.source.close()

    def readBlocks(self):
        """Read self.source and populate self.array and self.totalLength."""
        raw = self.source.read()
        if not isinstance(raw, str):
            # Python 3 hands back bytes here.
            # NOTE(review): UTF-8 is an assumption; undecodable bytes are
            # replaced rather than raising.
            raw = raw.decode("utf-8", "replace")

        # Normalise Windows line endings, then split on blank lines: each
        # piece is one block, itself split into its lines.
        raw = raw.replace("\r\n","\n")
        self.blocks = [piece.split("\n") for piece in raw.split("\n\n")]

        self.array = []
        # Consume from the end, so self.array ends up in reverse file order.
        while self.blocks:
            thisBlock = self.blocks.pop()
            # A block needs at least a number line and a timestamp line.
            if len(thisBlock) > 1:
                theBlock = srtBlock(thisBlock)
                # time() returns -1 for an unparseable timestamp line.
                if theBlock.time() > -1:
                    self.array.append(theBlock)

        if self.array:
            # array[0] is the last parseable block of the file.
            self.totalLength = self.array[0].time()
        else:
            self.totalLength = -1

    def __iter__(self):
        # A fresh iterator each time, so the file can be walked repeatedly.
        return iter(reversed(self.array))

    def next(self):
        """Remove and return the next block in file order."""
        try:
            return self.array.pop()
        except IndexError:
            raise StopIteration

    # Python 3 spelling of the same method.
    __next__ = next