Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## Tests [plain_text]
- [XX] - 'Brian Is In The Kitchen'
- [->] - 'Brian Is in the Kitchen'
- [OK] - 'Bring It On'
- [OK] - 'You Can Leave Your Hat On'
- [OK] - 'One Is For'
- [XX] - 'And You And I'
- [->] - 'And You and I'
- [XX] - 'Love Is In The Air'
- [->] - 'Love Is in the Air'
- [XX] - 'I Am The Walrus'
- [->] - 'I Am the Walrus'
- [XX] - 'That Was Then , This Is Now'
- [->] - 'That Was Then, This Is Now'
- [OK] - 'You Are So Beautiful'
- [OK] - 'This Is As Good As It Gets'
- [XX] - 'The Man Who Sold The World'
- [->] - 'The Man Who Sold the World'
- [OK] - 'In a Safe Place'
- [XX] - 'The Best Of The Temptation'
- [->] - 'The Best of The Temptation'
- [XX] - 'Rattle And Hum'
- [->] - 'Rattle and Hum'
- [XX] - 'It 's Now Or Never'
- [->] - 'It's Now or Never'
- [OK] - 'Nothin' But a Good Time'
- [OK] - 'Life Is But a Dream'
- [XX] - 'Ain't But a Few Of Us Left'
- [->] - 'Ain't But a Few of Us Left'
- [XX] - 'You Are But a Draft , a Long Rehearsal For a Show That Will Never Play'
- [->] - 'You Are But a Draft, a Long Rehearsal for a Show That Will Never Play'
- [XX] - 'I Know You Are But What Am I'
- [->] - 'I Know You Are but What Am I'
- [XX] - 'I don't Know What It Is But I Like It'
- [->] - 'I Don't Know What It Is but I Like It'
- [XX] - 'Live At Woodstock'
- [->] - 'Live at Woodstock'
- [XX] - 'Face To Face'
- [->] - 'Face to Face'
- [XX] - 'Death Cab For Cutie'
- [->] - 'Death Cab for Cutie'
- [XX] - 'Pretty In Pink'
- [->] - 'Pretty in Pink'
- [OK] - 'Spy vs. Spy'
- [OK] - 'Birds v. Worms'
- [OK] - 'Time After Time etc.'
- [XX] - 'Keep On rockin' In The Free World'
- [->] - 'Keep On Rockin' in the Free World'
- [XX] - 'Come In From The Cold'
- [->] - 'Come In From the Cold'
- [XX] - 'Nowhere To Run'
- [->] - 'Nowhere to Run'
- [XX] - 'How To Dismantle An Atomic Bomb'
- [->] - 'How to Dismantle an Atomic Bomb'
- [XX] - 'Song I Love To Sing'
- [->] - 'Song I Love to Sing'
- [XX] - 'Reality Used To Be a Friend Of Mine'
- [->] - 'Reality Used to Be a Friend of Mine'
- [XX] - 'An Ode To Death'
- [->] - 'An Ode to Death'
- [OK] - 'Otis! The Definitive Otis Redding'
- [XX] - 'In Time: The Best Of R . E . M .'
- [->] - 'In Time: The Best of R.E.M.'
- [XX] - 'I'm Just a Singer (In a Rock 'n' Roll Band) '
- [->] - 'I'm Just a Singer (In a Rock 'n' Roll Band)'
- [XX] - 'I'm Just; A Singer (Otis! Rox) '
- [->] - 'I'm Just; A Singer (Otis! Rox)'
- [XX] - 'The Go - Gos'
- [->] - 'The Go-Gos'
- [XX] - 'At The Drive - In'
- [->] - 'At the Drive-In'
- [XX] - 'The Boy With The x - Ray Eyes'
- [->] - 'The Boy With the X-Ray Eyes'
- [XX] - 'R . E . M'
- [->] - 'R.E.M'
- [XX] - 'N . W . A'
- [->] - 'N.W.A'
- [XX] - 'R . O . C . K In The U . S . A'
- [->] - 'R.O.C.K in the U.S.A'
- [OK] - 'Rock 'n' Roll'
- [XX] - 'Will o' The Wisp'
- [->] - 'Will o' the Wisp'
- [OK] - 'Sweet Child o' Mine'
- [XX] - 'Nick Cave And The Bad Seeds'
- [->] - 'Nick Cave and The Bad Seeds'
- [XX] - 'Elvis Costello And The Attractions'
- [->] - 'Elvis Costello and The Attractions'
- [XX] - 'Huey Lewis And The News'
- [->] - 'Huey Lewis and The News'
- ## Script avec tests [python]
- #!/usr/bin/env python
- #encoding: utf-8
- from MontyTagger import MontyTagger
- import re
- tagger = MontyTagger()
- ns_list = ['—']
- nsb_list = ['?', '!', ':', ';']
- sbna_list = ['(', '“']
- sanb_list = [')', '”']
- no_space_before_chars = re.compile("([?!:;—])")
- space_before_not_after_chars = re.compile("([\\(\\)\"“”])")
- def music_capitalization(title):
- #Split with () and “”
- splitted_title = space_before_not_after_chars.split(title)
- if len(splitted_title) > 1:
- #Recall this function with each groups to further split
- for i in xrange(0, len(splitted_title)):
- if (splitted_title[i] not in sbna_list and
- splitted_title[i] not in sanb_list):
- splitted_title[i] = music_capitalization(splitted_title[i])
- else:
- #Split with ?!:; etc...
- splitted_title = no_space_before_chars.split(splitted_title[0])
- for i in xrange(0, len(splitted_title)):
- if splitted_title[i] not in nsb_list and splitted_title[i] != '':
- splitted_title[i] = music_capitalize_sentence(splitted_title[i])
- #Join the full sentence again taking car of the correct spacings
- capitalized_title = ""
- for element in splitted_title:
- if element in nsb_list:
- capitalized_title += "%s " % element
- elif element in sbna_list:
- capitalized_title += " %s" % element
- elif element in sanb_list:
- capitalized_title += "%s " % element
- else:
- capitalized_title += element
- return capitalized_title
- def music_capitalize_sentence(title):
- splitted_title = title.lower().split()
- #versus substitution
- substitutions = []
- for i in xrange(0, len(splitted_title)):
- word = splitted_title[i]
- if word == 'versus':
- substitutions.append(('versus', 'versus',))
- elif word == 'vs.':
- splitted_title[i] = 'versus'
- substitutions.append(('versus', 'vs.',))
- elif word == 'v.':
- splitted_title[i] = 'versus'
- substitutions.append(('versus', 'v.',))
- elif word == 'etcetera':
- substitutions.append(('etcetera', 'etcetera',))
- elif word == 'etc.':
- splitted_title[i] = 'etcetera'
- substitutions.append(('etcetera', 'etc.',))
- elif word == '.':
- substitutions.append(('.', '.',))
- elif '-' in word:
- splitted_word = word.split('-')
- for el in splitted_word:
- substitutions.append(('.', '-',))
- splitted_title.pop(i)
- splitted_title.insert(i, '.'.join(splitted_word))
- tagged_title = []
- for element in tagger.tag(' '.join(splitted_title), expand_contractions_p=1).split():
- tagged_title.append(element.split('/'))
- #Capitalize all nouns, verbs, adverbs, subordinating conjunctions,
- #adjectives and pronouns
- prev_prev_element = None
- prev_element = None
- for element in tagged_title:
- word = element[0].lower()
- tag = element[1]
- if '\'' in word:
- continue
- elif tag.startswith('.'):
- if prev_element and len(prev_element[0]) == 1:
- prev_element[0] = prev_element[0].capitalize()
- elif len(word) == 1:
- if (prev_prev_element and len(prev_prev_element[0]) == 1 and
- prev_element and prev_element[0] == '.'):
- prev_prev_element[0] = prev_prev_element[0].capitalize()
- element[0] = element[0].capitalize()
- #Determinent except 'a', 'an' and 'the'
- elif tag.startswith('DT') or tag.startswith('WDT'):
- if word == 'a' or word == 'an' or word == 'the':
- element[0] = word
- else:
- element[0] = element[0].capitalize()
- #Coordinating Conjunction
- elif tag.startswith('CC'):
- #but after a verb is probably an adverb (not always but the tagger is wrong on this)
- if word == 'but' and prev_element != None and prev_element[1].startswith('VB'):
- element[0] = element[0].capitalize()
- elif (word == 'and' or
- word == 'but' or
- word == 'or' or
- word == 'nor' or
- word == 'for' or
- word == 'yet' or
- word == 'so'):
- element[0] = word
- else:
- element[0] = element[0].capitalize()
- #Preposition except len(prep) <= 3
- elif tag.startswith('IN'):
- if len(word) <= 3 or word == 'versus':
- element[0] = word
- else:
- element[0] = element[0].capitalize()
- #Adverbs except etcetera
- elif tag.startswith('RB') or tag.startswith('WRB'):
- if word == 'etcetera':
- element[0] = word
- else:
- element[0] = element[0].capitalize()
- #Verbs
- elif tag.startswith('VB'):
- #'to' used to form an infinitive
- if tag == 'VB' and prev_element and prev_element[1] == 'TO':
- prev_element[0] = prev_element[0].lower()
- element[0] = element[0].capitalize()
- else:
- element[0] = word.capitalize()
- prev_prev_element = prev_element
- prev_element = element
- #Capitalize the first and last word
- tagged_title[0][0] = tagged_title[0][0].capitalize()
- tagged_title[-1][0] = tagged_title[-1][0].capitalize()
- #Reverse the substitutions
- idx = 0
- for sub, orig in substitutions:
- for element in tagged_title[idx:]:
- idx += 1
- word = element[0]
- if word.lower() == sub:
- element[0] = orig
- break
- #print tagged_title
- #Rejoin the title without the POS tags
- capitalized_title = None
- for i in xrange(0, len(tagged_title)):
- element = tagged_title[i]
- if capitalized_title == None:
- capitalized_title = element[0]
- elif element[1] in [',', '.']:
- capitalized_title += '%s' % element[0]
- elif i > 0 and tagged_title[i-1][1] == '.':
- capitalized_title += '%s' % element[0]
- else:
- capitalized_title += ' %s' % element[0]
- return capitalized_title.strip()
- def test(shouldbe):
- result = music_capitalization(shouldbe)
- if result == shouldbe:
- print "[OK] - '" + result + "'"
- else:
- print "[XX] - '" + result + "'\n[->] - '" + shouldbe + "'"
- print
- print
- test("Brian Is in the Kitchen")
- test("Bring It On")
- test("You Can Leave Your Hat On")
- test("One Is For")
- test("And You and I")
- test("Love Is in the Air")
- test("I Am the Walrus")
- test("That Was Then, This Is Now")
- test("You Are So Beautiful")
- test("This Is As Good As It Gets")
- test("The Man Who Sold the World")
- test("In a Safe Place")
- test("The Best of The Temptation") #TODO (the -> The)
- test("Rattle and Hum")
- test("It's Now or Never")
- test("Nothin' But a Good Time")
- test("Life Is But a Dream")
- test("Ain't But a Few of Us Left") #TODO but -> But
- test("You Are But a Draft, a Long Rehearsal for a Show That Will Never Play")
- test("I Know You Are but What Am I") #TODO But -> but
- test("I Don't Know What It Is but I Like It") #TODO But -> but
- test("Live at Woodstock")
- test("Face to Face")
- test("Death Cab for Cutie")
- test("Pretty in Pink")
- test("Spy vs. Spy")
- test("Birds v. Worms")
- test("Time After Time etc.") #TODO "et cetera" will pass unoticed
- test("Keep On Rockin' in the Free World") #TODO on -> On without in -> In
- test("Come In From the Cold") #TODO in -> In
- test("Nowhere to Run")
- test("How to Dismantle an Atomic Bomb")
- test("Song I Love to Sing")
- test("Reality Used to Be a Friend of Mine")
- test("An Ode to Death")
- test("Otis! The Definitive Otis Redding")
- test("In Time: The Best of R.E.M.")
- test("I'm Just a Singer (In a Rock 'n' Roll Band)")
- test("I'm Just; A Singer (Otis! Rox)")
- test("The Go-Gos")
- test("At the Drive-In")
- test("The Boy With the X-Ray Eyes")
- test("R.E.M")
- test("N.W.A")
- test("R.O.C.K in the U.S.A")
- test("Rock 'n' Roll")
- test("Will o' the Wisp")
- test("Sweet Child o' Mine")
- test("Nick Cave and The Bad Seeds") #TODO the -> The
- test("Elvis Costello and The Attractions") #TODO the -> The
- test("Huey Lewis and The News") #TODO the -> The
Add Comment
Please, Sign In to add comment