Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! C:\Python\Python37\python.exe
- #! coding: utf-8
- #! python3
- # Black Cat 2019
- # Licence cc-by-nc-sa
- # Adds timestamp in format HH_MM_SS_ from precise audacity timestamps
- # Also checks for typos and abbreviations
- # And does a freaking mess of other things
- import difflib
- VERSION = 0.9
def timestampToTime( timestamp ) :
    """Break a timestamp expressed in seconds into clock components.

    timestamp : number of seconds (int or float).

    Returns the tuple (hours, minutes, seconds, millis, fractional).
    The first four values are truncated with int(); fractional is the
    sub-second remainder of the timestamp.
    """
    whole_seconds = int( timestamp )
    fractional = timestamp - whole_seconds
    hours = int( timestamp / 3600 )
    minutes = int( (timestamp - (3600 * hours)) / 60 )
    seconds = int( timestamp - (60 * minutes) - (3600 * hours) )
    millis = int( fractional * 1000 )
    return hours, minutes, seconds, millis, fractional
def timeToTimestamp( hours=0, minutes=0, seconds=0, millis=0 ) :
    """Combine clock components into a single timestamp in seconds.

    The milliseconds are divided by 1000, so the result is a float.
    """
    whole_part = (3600 * hours) + (60 * minutes) + seconds
    sub_second_part = millis / 1000
    return whole_part + sub_second_part
def likeness( a="", b="" ) :
    """Return the difflib similarity ratio of the two sequences.

    The value is SequenceMatcher.ratio() : 1.0 for identical inputs, 0.0
    when nothing matches.
    """
    matcher = difflib.SequenceMatcher( None, a, b )
    return matcher.ratio()
# Deliberately small hand-rolled speller. Please use a real library like
# "inflect" when one is available.
def num2str( num=0 ) :
    """Spell an integer out in English words.

    num : integer to spell. Its magnitude must not exceed 9,999,999,999.

    Returns the words as a string, e.g. 1234 gives
    "one thousand, two hundred and thirty four" and -5 gives "minus five".
    Zero gives "zero". A number outside the supported range is reported
    through error() and returned as plain digits (str(num)).
    """
    if abs( num ) > 9999999999 :
        error( "num2str : (positive or negative) Number to big sorry my coder was lazy" )
        return str( num )
    if num == 0 :
        return "zero"

    # Words for 0..19, indexed directly by value
    units = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"]
    # Words for the tens digit (index 2..9 used)
    decades = ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    # Scale word for each group of three digits : 10^0, 10^3, 10^6, 10^9
    thousands = ["", "thousand", "million", "billion"]

    groups = []
    remaining = abs( num )
    scale = 0
    while remaining > 0 :
        # Integer arithmetic only : peel off the three lowest digits
        remaining, chunk = divmod( remaining, 1000 )
        # An all-zero group contributes nothing, not even its scale word
        if chunk :
            hundreds_digit, below_hundred = divmod( chunk, 100 )
            if below_hundred < 20 :
                words = units[below_hundred]
            else :
                tens_digit, unit_digit = divmod( below_hundred, 10 )
                words = decades[tens_digit]
                if unit_digit :
                    words += " " + units[unit_digit]
            if hundreds_digit :
                prefix = units[hundreds_digit] + " hundred"
                # Only say "and" when something follows the hundreds
                words = prefix + " and " + words if words else prefix
            if scale :
                words += " " + thousands[scale]
            groups.append( words )
        scale += 1

    # Groups were collected lowest first
    text = ", ".join( reversed( groups ) )
    if num < 0 :
        text = "minus " + text
    return text
- #==============================================================================
- # Eye candy
- #==============================================================================
def info( text="?" ) :
    """Print an informational message; shown only when ERROR_LEVEL is above 2."""
    if not ERROR_LEVEL > 2 :
        return
    message = " info : " + text
    print( message )
def good( text="?" ) :
    """Print a success message; shown only when ERROR_LEVEL is above 1."""
    if not ERROR_LEVEL > 1 :
        return
    message = " good : " + text
    print( message )
def warning( text="?" ) :
    """Print a warning message; shown only when ERROR_LEVEL is above 0."""
    if not ERROR_LEVEL > 0 :
        return
    message = "[warning] : " + text
    print( message )
def error( text="?" ) :
    """Print an error message. Errors are always shown, whatever the error level."""
    message = "[ ERROR ] : " + text
    print( message )
def draw_spacer(char="=") :
    """Print a separator line made of `char` repeated 79 times."""
    separator = char * 79
    print( separator )
#==============================================================================
# Parameters : the only values meant to be edited by the user
#==============================================================================

# Label file to read
INPUT = "labelsRaw_Test.txt"
#INPUT = r"D:\PPPV\Cut\MLP\0102\0102Raw.txt"

# True : write the checked labels to OUTPUT.
# False : only print them and report the goofs.
SAVE_FILE = True
OUTPUT = "labels.txt"

# Automatically replace names, moods and noise values that look like typos?
REPLACE_TYPO = True

# Automatically replace leftover abbreviations of names, moods and noise?
REPLACE_LEFTOVER = True

# Automatically append the missing ending punctuation of a transcript?
REPLACE_ENDING_PUNCTUATION = True

# Automatically replace transcript abbreviations?
REPLACE_ABBREVIATIONS = True

# How much gets displayed :
#   0 : errors only
#   1 : errors and warnings
#   2 : errors, warnings and good
#   3 : errors, warnings, good and infos
ERROR_LEVEL = 3

# Nothing below this line is meant to be modified
#=============================================================================
# Welcome banner : recap of the settings in use
draw_spacer()
print( "Welcome to label checker V " + str( VERSION ) )
print( "Let me bitch about your typo and other mistakes..." )
print()
print( "input\t: " + INPUT )
print( "output\t: " + OUTPUT )

# Display text of a boolean, indexed by int( flag )
bools = ["Nope", "Eyup"]
replace_switches = [
    ( "Replace typos \t: ", REPLACE_TYPO ),
    ( "Replace leftovers\t: ", REPLACE_LEFTOVER ),
    ( "Replace punctuation\t: ", REPLACE_ENDING_PUNCTUATION ),
    ( "Replace abbreviations\t: ", REPLACE_ABBREVIATIONS ),
]
for caption, enabled in replace_switches :
    print( caption + bools[int( enabled )] )

# Every automatic replacement switched on at once
if sum( int( switch[1] ) for switch in replace_switches ) > 3 :
    print( "(You shouldn't put that much trust in me...)" )

# Display text of the error level, indexed by ERROR_LEVEL
whine = ["warning only", "warning and error", "boringly verbose", "worst than your ex"]
print( "Whining level\t: " + whine[ERROR_LEVEL] )
draw_spacer()
print()
# Canonical character name -> lowercase aliases accepted in the name field
name_aliases = {
    "Twilight": ["twilight sparkle", "ts", "twi"],
    "Pinkie": ["pinkie pie", "pp"],
    "Applejack": ["aj", "apl"],
    "Rainbow": ["rainbow dash", "rd"],
    "Rarity": ["ra", "rar"],
    "Fluttershy": ["fs", "flu"],
    "Spike": ["sp", "spi"],
    "Celestia": ["princess celestia", "ce", "cel", "celly"],
    "Luna": ["princess luna", "lu"],
    "Apple Bloom": ["ab"],
    "Sweetie Belle": ["sb"],
    "Scootaloo": ["sc", "chicken"],
    "Diamond Tiarra": ["dt"],
    "Silver Spoon": ["ss"],
    "Cheerlee": ["ch"],
    "Iron Will": ["i"],
    "Cadence": ["princess cadence", "cad"],
    "Shining Armor": ["sa"],
    "Chrysalis": ["queen chrysalis", "cr", "bug butt"],
    "Nightmare Moon": ["nm"],
    "Grany Smith": ["gs"],
    "Mayor Mare": ["mm"],
    "Babs Seed": ["bs"],
    "Spitfire": ["sf"],
    "Lightning Dust": ["ld"],
}

# Canonical mood -> lowercase aliases accepted in the mood field
mood_aliases = {
    "Neutral": ["n"],
    "Happy": ["h"],
    "Amused": ["am"],
    "Sad": ["s"],
    "Annoyed": ["a"],
    "Angry": ["ag"],
    "Disgust": ["d"],
    "Sarcastic": ["sa"],
    "Smug": ["sm"],
    "Fear": ["f"],
    "Anxious": ["ax"],
    "Confused": ["c"],
    "Surprised": ["su"],
    "Tired": ["t"],
    "Whispering": ["w"],
    "Shouting": ["sh"],
    "Whining": ["wh"],
}

# Canonical noise level -> aliases accepted in the noise field
noise_aliases = {
    "Noisy": ["q", "n"],
    "Very Noisy": ["qq", "vn"],
}

# Full word -> abbreviation that gets expanded to it inside a transcript
transcript_expansions = {
    "mister": "mr",
    "miss": "ms",
}

# Lookup table used by all the label checks. The key order is kept, since
# the checks walk these tables in order.
abbreviations = {
    "names": name_aliases,
    "moods": mood_aliases,
    "noise": noise_aliases,
    "transcript": transcript_expansions,
}
characters = []
# Reads the labels file.
# The "with" block closes the file by itself, and only when it was really
# opened. Without the labels there is nothing to check, so any failure stops
# the script here instead of carrying on with labelData undefined.
try :
    with open( INPUT, "r" ) as file :
        labelData = file.readlines()
except FileNotFoundError :
    error( "File " + INPUT + " can't be found." )
    raise SystemExit( 1 )
except IOError as e :
    error( "IO error {0} when trying to access the file : {1}".format( e.errno, e.strerror ) )
    raise SystemExit( 1 )
# Gather informations from label file
episode = {}
# Checked labels that are kept, ready to be written out
lines = []
#-----------------------------------------------------------------------------
# Work on each label.
# A raw line is "start<TAB>end<TAB>label", the label being either
# "character_moods_noise_transcript" or the same four fields preceded by an
# already present "HH_MM_SS_" timestamp.
for index, line in enumerate( labelData ) :
    lineNumber = str( index + 1 )
    # By default, the line shall be defective to be absolutely sure. But I'm lazy
    rottenLine = False
    if line.rstrip() == "" :
        warning( "Empty line " + lineNumber )
        rottenLine = True
    elif line.count("\t") != 2 :
        error( "Malformed line " + lineNumber + " : Check tabulations on label" )
        rottenLine = True
    else :
        start, end, label = line.split( "\t" )
        label = label.rstrip()
        # A non numeric timestamp drops the line instead of crashing the run
        try :
            hour, minute, second = timestampToTime( float(start) )[:3]
            float( end )
        except ValueError :
            error( "Malformed line " + lineNumber + " : Check timestamps" )
            rottenLine = True
        #-----------------------------------------------------------------
        # Checks the label content
        if not rottenLine :
            fields = label.split( "_" )
            if len( fields ) == 7 :
                # The old HH, MM, SS fields are dropped : the timestamp is
                # always rebuilt from the start time
                info( "Line " + lineNumber + " : Timestamp already here" )
                character, moods, noise, transcript = fields[3:]
            elif len( fields ) == 4 :
                character, moods, noise, transcript = fields
            else :
                error( "Malformed line " + lineNumber + " : Check underscores" )
                rottenLine = True
        if not rottenLine :
            #-------------------------------------------------------------
            # Character checks
            if character != "" :
                # Typo checks
                for char in abbreviations["names"] :
                    if character != char and likeness( character, char ) > 0.85 :
                        if REPLACE_TYPO :
                            info( "Line " + lineNumber + " name : " + character + " replaced by " + char )
                            character = char
                        else :
                            warning( "Line " + lineNumber + " name : " + character + " used. Do you mean " + char + "?" )
                # Abbreviations checks
                for char, shortcuts in abbreviations["names"].items() :
                    if character.lower() in shortcuts :
                        if REPLACE_LEFTOVER :
                            info( "Line " + lineNumber + " name : " + character + " replaced by " + char )
                            character = char
                        else :
                            warning( "Line " + lineNumber + " name : " + character + " used. Do you mean " + char + "?" )
            else :
                error( "Line " + lineNumber + " : No character" )
                rottenLine = True
            #-------------------------------------------------------------
            # Mood checks
            if moods != "" :
                checkedMoods = []
                # For each mood in the label
                for moodWord in moods.split() :
                    # Typo checks
                    for mood in abbreviations["moods"] :
                        if moodWord != mood and likeness( moodWord, mood ) > 0.85 :
                            if REPLACE_TYPO :
                                info( "Line " + lineNumber + " mood : " + moodWord + " replaced by " + mood )
                                moodWord = mood
                            else :
                                warning( "Line " + lineNumber + " mood : " + moodWord + " used. Do you mean " + mood + "?" )
                    # Abbreviations checks
                    # Checks against each known mood leftover abbreviation
                    for mood, shortcuts in abbreviations["moods"].items() :
                        if moodWord.lower() in shortcuts :
                            if REPLACE_LEFTOVER :
                                info( "Line " + lineNumber + " mood : " + moodWord + " replaced by " + mood )
                                moodWord = mood
                            else :
                                warning( "Line " + lineNumber + " mood : " + moodWord + " used. Do you mean " + mood + "?" )
                    # Checks if the mood is known
                    moodWord = moodWord.capitalize()
                    if moodWord not in abbreviations["moods"] :
                        warning( "Line "+lineNumber+" mood : '" + moodWord + "' unknown. Can't check it" )
                    checkedMoods.append( moodWord )
                # Construct back the string
                moods = " ".join( checkedMoods )
            else :
                error( "Line " + lineNumber + " : No mood" )
                rottenLine = True
            #-------------------------------------------------------------
            # Noise checks
            if noise != "" :
                for n, shortcuts in abbreviations["noise"].items() :
                    # Abbreviation check, case insensitive like the name and
                    # mood abbreviation checks
                    if noise.lower() in shortcuts :
                        if REPLACE_LEFTOVER :
                            info( "Line " + lineNumber + " noise : " + noise + " replaced by " + n )
                            noise = n
                        else :
                            warning( "Line " + lineNumber + " noise : " + noise + " used. Do you mean " + n + "?" )
                    # Typo check
                    if noise != n and likeness( noise.lower(), n.lower() ) > 0.85 :
                        if REPLACE_TYPO :
                            info( "Line " + lineNumber + " noise : " + noise + " replaced by " + n )
                            noise = n
                        else :
                            warning( "Line " + lineNumber + " noise : " + noise + " used. Do you mean " + n + "?" )
            if noise.lower() not in ["", "noisy", "very noisy"] :
                error( "Line " + lineNumber + " noise is " + noise + " ; shall be nothing, 'noisy' or 'very noisy'" )
                rottenLine = True
            #-------------------------------------------------------------
            # Transcript checks
            if transcript != "" :
                # Check for ending punctuation
                if transcript[-1] not in ",.!?" :
                    # Guess the punctuation from the moods : at most one "!"
                    # and one "?", in order of appearance, "." otherwise
                    ending = ""
                    for mood in moods.split() :
                        if mood in ["Happy", "Angry", "Fear", "Anxious", "Surprised", "Shouting"] :
                            if "!" not in ending :
                                ending += "!"
                        elif mood in ["Annoyed", "Sarcastic", "Confused"] :
                            if "?" not in ending :
                                ending += "?"
                    if ending == "" :
                        ending = "."
                    if REPLACE_ENDING_PUNCTUATION :
                        info( "Line " + lineNumber + " trans : No punctuation. Guestimate based on mood : " + ending + " (please, do check, I'm not good with emotions...)" )
                        transcript += ending
                    else :
                        warning( "Line " + lineNumber + " trans : No punctuation. (Guestimate based on mood : "+ending+" )" )
                # Check for numbers and abbreviations
                expandedWords = []
                for word in transcript.split( " " ) :
                    # Numbers
                    if word.isdigit() :
                        if REPLACE_ABBREVIATIONS :
                            number = num2str( int(word) )
                            info( "Line "+lineNumber+" : "+ word+" expanded to " + number )
                            word = number
                    # Abbreviations : the table maps full word -> abbreviation
                    for abb in abbreviations["transcript"] :
                        if abbreviations["transcript"][abb] == word :
                            info( "Line "+lineNumber+" : " + word+" expanded to " + abb)
                            word = abb
                    expandedWords.append( word )
                # Rebuild the transcript. Leading blanks are dropped, blanks
                # anywhere else are kept as they were.
                transcript = " ".join( expandedWords ).lstrip( " " )
            else :
                error( "Line " + lineNumber + " : No transcript" )
                rottenLine = True
    #-------------------------------------------------------------------------
    # If no errors, keep the line
    if not rottenLine :
        lines.append( f"{start}\t{end}\t{hour:0>2}_{minute:0>2}_{second:0>2}_{character}_{moods}_{noise}_{transcript}" )
    else :
        info( "Line " + lineNumber + " dropped." )
# Save to file if asked, print the labels otherwise
if SAVE_FILE :
    # The "with" block closes the file by itself, and only when it was
    # really opened
    try :
        with open( OUTPUT, "w" ) as file :
            file.writelines( line + "\n" for line in lines )
        info( "File saved" )
    except FileNotFoundError :
        error( "File " + OUTPUT + " can't be found." )
    except IOError as e :
        error( "IO error {0} when trying to access the file : {1}".format( e.errno, e.strerror ) )
else :
    for line in lines :
        print( line )
    info( "Nothing saved to the disk." )
# Wait for the user before the script ends
input("Press enter to quit.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement