Advertisement
Guest User

Untitled

a guest
Sep 30th, 2016
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.21 KB | None | 0 0
  1. import re
  2. import gzip
  3. import sys
  4.  
  5. def stripBadText(string):
  6. if string==None:
  7. return ""
  8. # No html tags
  9. string = re.sub("<[^>]+>","",string)
  10. # People don't talk in [brackets] or (inside parentheses), so I strip them.
  11. string = re.sub(r"\[([^\]]+)\]","",string)
  12. string = re.sub(r"\(([^\)]+)\)","",string)
  13. # ellipses can be used to separate one set of tokens from another.
  14. string = string.replace("..."," ")
  15. # And the use of dancy music theme things is irregular.
  16. string = string.replace("\xe2"," ")
  17. return string
  18.  
  19. class srtBlock(str):
  20. """
  21. A string of text, initialized from an srt block,
  22. which parses the metadata out into separate methods.
  23. """
  24. def __init__(self,blockArray):
  25. self.blocknum = blockArray.pop(0)
  26. self.timestamp = blockArray.pop(0)
  27. text = " ".join([stripBadText(string) for string in blockArray])
  28. super(srtBlock,self).__init__(text)
  29. self.text = text
  30.  
  31. def time(self,which="start"):
  32. metadata = dict()
  33. try:
  34. timeHits = re.findall("^(\d\d):(\d\d):(\d\d)",self.timestamp)
  35. if which=="start":
  36. time = timeHits[0]
  37. elif which=="end":
  38. time = timeHits[1]
  39. except IndexError:
  40. sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
  41. return -1
  42. value = int(time[0])*60*60 + int(time[1])*60 + int(time[2])
  43. #value = int(value/chunkSize)*chunkSize/60
  44. return value
  45.  
  46. def textContent(self):
  47. text = self.text
  48. return text
  49.  
  50. class srtGroup(object):
  51.  
  52. """
  53. An object that is initialized with an array of srt filenames:
  54. It builds them as a group into a set of dictionaries that includes
  55. minute, percentage, twelfth of the way through information as well as
  56. the text.
  57. """
  58.  
  59. def __init__(self,files):
  60. self.targets = files
  61. self.files = [srtFile(file) for file in files]
  62.  
  63. def strings(self):
  64. totalLength = sum([file.totalLength for file in self.files])
  65. self.totalLength = totalLength
  66. seenSoFar=0
  67. chunks = dict()
  68. for file in self.files:
  69. for block in file:
  70. try:
  71. key = seenSoFar + block.time()
  72. except TypeError:
  73. print self.targets
  74. print seenSoFar
  75. print block.time()
  76. raise
  77. try:
  78. chunks[key]['text'] = chunks[key]['text'] + '\f' + block.text
  79. except KeyError:
  80. chunks[key] = dict()
  81. chunks[key]['second'] = key
  82. chunks[key]['text'] = block.text
  83.  
  84. seenSoFar = seenSoFar + file.totalLength
  85. return [chunks[key] for key in chunks.keys()]
  86.  
  87. def documents(self,minChunk=120):
  88. chunks = dict()
  89. num = 0
  90. for string in self.strings():
  91. num += 1
  92. i=3
  93. going = True
  94. while going:
  95. #THis loop keeps assigning increasingly small chunks (6ths, 12ths, etc) to something, until the number falls below the threshold.
  96. key = nameChunk(i)
  97. try:
  98. string[key]=int((string['second']*i)/(self.totalLength*1.000001)) + 1
  99. except ZeroDivisionError:
  100. print string['text']
  101. raise
  102. if ((self.totalLength/i) < (minChunk*2)): #times two, because we're letting one slip through the cracks.
  103. string['maxChunk'] = key
  104. string['minute'] = int(string['second']/60)
  105. going = False
  106. i=i*2
  107.  
  108. key = string[key]
  109. try:
  110. chunks[key]['text'] = chunks[key]['text'] + "\f" + string['text']
  111. except KeyError:
  112. chunks[key] = dict()
  113. chunks[key]['text'] = string['text']
  114. for key2 in string.keys():
  115. if key2 != "text":
  116. chunks[key][key2] = string[key2]
  117. del string['second']
  118.  
  119. return [chunks[key] for key in chunks.keys()]
  120.  
  121. def nameChunk(i):
  122. if i==3:
  123. return "3rd"
  124. else:
  125. return str(i) + "th"
  126.  
  127. class srtFile(object):
  128. def __init__(self,filename):
  129. self.filename=filename
  130. self.source = gzip.open(filename,"r")
  131. self.readBlocks()
  132.  
  133. def readBlocks(self):
  134. blocks = "".join(self.source.readlines())
  135.  
  136. blocks = re.sub(r"\r\n","\n",blocks)
  137. blocks = blocks.split("\n\n")
  138.  
  139. self.blocks = [block.split("\n") for block in blocks]
  140.  
  141. self.array = []
  142. while len(self.blocks) > 0:
  143. thisBlock = self.blocks.pop()
  144. if len(thisBlock)>1:
  145. theBlock = (srtBlock(thisBlock))
  146. if theBlock.time() > -1:
  147. self.array.append(theBlock)
  148. if (len(self.array) > 0):
  149. self.totalLength = self.array[0].time()
  150. else:
  151. self.totalLength = -1
  152.  
  153. def __iter__(self):
  154. return self
  155.  
  156. def next(self):
  157. try:
  158. return self.array.pop()
  159. except IndexError:
  160. raise StopIteration
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement