Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- import itertools as IT
# Letters classified as vowels by the signature helpers; note that "Y" is
# counted as a vowel here.
VOWELS = "AEIOUY"
# The remaining twenty uppercase ASCII letters are classified as consonants.
CONSONANTS = "BCDFGHJKLMNPQRSTVWXZ"
def join_S_V(res):
    """Concatenate class symbols and absorb every 'V' that touches an 'S'.

    ``res`` is an iterable of strings (the callers pass single-character
    class symbols).  After joining, "SV" is rewritten to "S" until no "SV"
    is left, and then "VS" is rewritten to "S" the same way.

    Returns the resulting string.
    """
    merged = "".join(res)
    for pair in ("SV", "VS"):
        # A single replace() pass can leave a fresh pair behind
        # (e.g. "SVV" -> "SV"), so keep going until the pair is gone.
        while pair in merged:
            merged = merged.replace(pair, "S")
    return merged
def sign(word, special):
    """Return the collapsed letter-class signature of ``word``.

    The word is scanned left to right.  A two-letter slice equal to
    ``special`` contributes one 'S' (and both letters are consumed), a letter
    found in CONSONANTS contributes 'C', and a letter found in VOWELS
    contributes 'V'.  The symbols are then passed through ``join_S_V`` and
    runs of identical symbols are collapsed to a single symbol.

    Characters that are neither consonants nor vowels (apostrophes, digits,
    parentheses, ...) are skipped.  Previously such a character re-appended
    the preceding symbol, or raised UnboundLocalError when it was the first
    character; skipping yields the same result for every input that used to
    work, because duplicated neighbours are collapsed anyway.

    Returns a list of one-character strings.
    """
    classes = []
    pos = 0
    while pos < len(word):
        if word[pos:pos + 2] == special:
            classes.append('S')
            pos += 2  # consume both letters of the digraph
            continue
        letter = word[pos]
        pos += 1
        if letter in CONSONANTS:
            classes.append('C')
        elif letter in VOWELS:
            classes.append('V')
        # any other character carries no class and is ignored
    merged = join_S_V(classes)
    return [symbol for symbol, _ in IT.groupby(merged)]
def ph_sign(phon, special="EY"):
    """Return the collapsed class signature of a phoneme sequence.

    Only the first two characters of each phoneme are inspected (this drops
    a trailing stress digit such as the "1" in "EY1").  A phoneme whose
    two-character head equals ``special`` contributes 'S'; one that starts
    with a consonant letter, or whose head is "ER", contributes 'C'; one that
    starts with a vowel letter contributes 'V'.  The symbols are then passed
    through ``join_S_V`` and runs of identical symbols are collapsed.

    Empty phonemes and phonemes starting with any other character are
    skipped.  Previously an empty phoneme raised IndexError and an
    unrecognised one re-appended the preceding symbol (or raised
    UnboundLocalError when it came first); skipping yields the same result
    for every input that used to work.

    Returns a list of one-character strings.
    """
    classes = []
    for phoneme in phon:
        head = phoneme[:2]
        if head == special:
            classes.append('S')
        elif not head:
            # Guard explicitly: "" in CONSONANTS would be True.
            continue
        elif head[0] in CONSONANTS or head == "ER":
            # "ER" starts with a vowel letter but is classified as 'C'.
            classes.append('C')
        elif head[0] in VOWELS:
            classes.append('V')
        # anything else carries no class and is ignored
    merged = join_S_V(classes)
    return [symbol for symbol, _ in IT.groupby(merged)]
def match(word, phon, special):
    """Tell whether the spelling and phoneme signatures line up on an 'S'.

    The spelling signature comes from ``sign(word, special)`` and the sound
    signature from ``ph_sign(phon)`` (called with its default special
    phoneme).  The two are compared position by position, up to the length
    of the shorter one; the result is True when some position holds 'S' in
    both.

    A leading "MC" is spelled out as "MAC" before the spelling signature is
    computed -- presumably so the letter classes follow the pronunciation;
    NOTE(review): confirm the intent.
    """
    spelling = "MAC" + word[2:] if word.startswith("MC") else word
    letter_sig = sign(spelling, special)
    sound_sig = ph_sign(phon)
    for pair in zip(letter_sig, sound_sig):
        if pair == ('S', 'S'):
            return True
    return False
def check_pos(word, phon, special):
    """Check that an "EY" phoneme sits near ``special`` inside ``word``.

    The first occurrence of ``special`` in ``word`` is located (ValueError is
    raised when it is absent) and its position is taken as ``index + 0.5``.
    The index of every phoneme starting with "EY" is rescaled from the
    phoneme list's length to the word's length.

    Returns True when at least one rescaled index lies within 2 of the
    position of ``special``; False otherwise, including when no phoneme
    starts with "EY".
    """
    target = word.index(special) + 0.5
    ey_slots = [k for k, ph in enumerate(phon) if ph.startswith("EY")]
    if not ey_slots:
        return False
    n_phon = len(phon)
    n_letters = len(word)
    for k in ey_slots:
        # Relies on true division (the file imports it from __future__).
        if abs(k / n_phon * n_letters - target) <= 2:
            return True
    return False
def sounds_like(line, special):
    """Decide whether ``special`` in a dictionary entry is pronounced "EY".

    ``line`` is split on whitespace: the first token is the word and the
    remaining tokens are its phonemes.  The result is True only when all of
    the following hold, evaluated in this order:
      * ``check_pos`` accepts the position of ``special`` in the word,
      * some phoneme begins with "EY",
      * ``match`` finds ``special`` and "EY" aligned in the signatures.
    """
    tokens = line.split()
    word, phon = tokens[0], tokens[1:]
    return (
        check_pos(word, phon, special)
        and any(p[:2] == "EY" for p in phon)
        and match(word, phon, special)
    )
def against_rule2(line):
    """Return True when the entry's word has an "IE" that sounds like "EY".

    ``line`` is a dictionary entry whose first whitespace-separated token is
    the word.  Words without "IE" never violate this rule; for the others
    the decision is delegated to ``sounds_like(line, "IE")``.
    """
    headword = line.split()[0]
    if "IE" not in headword:
        return False
    return sounds_like(line, "IE")
def against_rule3(line):
    """Return True when the word has an "EI" that does not sound like "EY".

    ``line`` is a dictionary entry whose first whitespace-separated token is
    the word.  A word is exempt (False) when it contains no "EI" at all, or
    when it contains "CEI" anywhere.  Otherwise the word violates the rule
    exactly when ``sounds_like(line, "EI")`` is false.
    """
    headword = line.split()[0]
    if "EI" not in headword or "CEI" in headword:
        return False
    return not sounds_like(line, "EI")
def ibeforee_exceptions(path="cmudict_ie.txt"):
    """Print every entry of a pronunciation list that breaks "I before E".

    Each non-blank line of ``path`` is read as a word followed by its
    phonemes, separated by whitespace.  The following combinations are
    against the rule:
        1. CIE
        2. IE that sound like EY[012]
        3. EI that do not sound like EY[012] and have no C in front
    An entry is reported and counted under the first of these it matches.
    After the listing, the number of entries read and the per-rule and total
    violation counts are printed.  Returns None.

    What about:
        EIE like in German names, which are on the list
        AIE like MAIER (M EY1 ER0) where the EY1 is more for AI than IE
        DIEGO (D IY0 EY1 G OW0) I and E are separate sounds
        ATHEIST where E and I are separate sounds
        DOSSIER (D AO2 S Y EY10)
        FRASIER'S F R EY1 ZH ER0 Z  EY1 is for A
    """
    words = 0
    violate1 = violate2 = violate3 = 0
    # Open the file exactly once; the context manager closes it.
    with open(path) as f:
        for raw in f:
            # split() discards the newline and surrounding blanks without
            # chopping a real character off an unterminated last line.
            elements = raw.split()
            if not elements:
                continue  # blank line: nothing to classify
            words += 1
            line = " ".join(elements)
            word = elements[0]
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            if "CIE" in word:
                print("rule 1: %s" % line)
                violate1 += 1
            elif against_rule2(line):
                print("rule 2: %s" % line)
                violate2 += 1
            elif against_rule3(line):
                print("rule 3: %s" % line)
                violate3 += 1
    print("number of words %d" % words)
    # The label "CIE " ends in a blank, hence two spaces before the count.
    print("CIE  %d" % violate1)
    print("IE that sound like EY[012] %d" % violate2)
    print("EI that do not sound like EY[012] and have no C in front %d"
          % violate3)
    print("total number violations %d" % (violate1 + violate2 + violate3))
# Run the report only when this file is executed as a script, so that
# importing the module does not read the dictionary file or print anything.
if __name__ == "__main__":
    ibeforee_exceptions()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement