# Find words in a CMU-style pronouncing dictionary that go against the
# spelling rule "I before E, except after C, or when sounded like EY".
#
# Each input line is expected to look like "WORD PH1 PH2 ..." (whitespace
# separated): the spelling first, then its phonemes.  Words and phonemes are
# reduced to coarse "signatures" made of three symbols:
#     'C' consonant, 'V' vowel, 'S' the special group under test
# (the letter pair IE/EI in the spelling, the EY phoneme in the
# pronunciation).  A spelling "sounds like EY" when an 'S' lines up at the
# same signature position in both the word and the phoneme signature.

from __future__ import division, print_function

import itertools as IT

VOWELS = "AEIOUY"
CONSONANTS = "BCDFGHJKLMNPQRSTVWXZ"


def join_S_V(res):
    """Join class symbols into one string and absorb vowels that touch 'S'.

    res -- iterable of one-character strings ('C', 'V' or 'S').

    Returns the joined string in which every "SV" and every "VS" has been
    replaced by "S", repeated until none is left, so a run of vowels next to
    the special group becomes part of it.
    """
    res = "".join(res)
    while "SV" in res:
        res = res.replace("SV", "S")
    while "VS" in res:
        res = res.replace("VS", "S")
    return res


def _collapse(symbols):
    """Return the signature for `symbols`: vowels merged into 'S', runs deduplicated."""
    return [key for key, _ in IT.groupby(join_S_V(symbols))]


def sign(word, special):
    """Return the C/V/S signature of a spelling.

    word    -- the spelling (upper-case letters are the ones recognised).
    special -- the letter pair to mark as 'S' (e.g. "IE" or "EI").

    Characters that are neither in CONSONANTS nor in VOWELS (apostrophes,
    digits, ...) are skipped.  Because runs of equal symbols are collapsed
    afterwards, skipping gives the same signature as repeating the previous
    symbol would, without failing when such a character comes first.
    """
    symbols = []
    i = 0
    while i < len(word):
        if word[i:i + 2] == special:
            symbols.append('S')
            i += 2  # the special pair consumes two letters
            continue
        letter = word[i]
        if letter in CONSONANTS:
            symbols.append('C')
        elif letter in VOWELS:
            symbols.append('V')
        i += 1
    return _collapse(symbols)


def ph_sign(phon, special="EY"):
    """Return the C/V/S signature of a phoneme sequence.

    phon    -- iterable of phoneme strings such as "EY1", "ZH", "ER0".
    special -- two-character phoneme prefix to mark as 'S'.

    Only the first two characters of each phoneme are looked at, which drops
    a trailing stress digit.  "ER" is counted as a consonant.  Empty or
    unrecognised phonemes are skipped.
    """
    symbols = []
    for p in phon:
        c = p[:2]
        if not c:
            continue
        if c == special:
            symbols.append('S')
        elif c[0] in CONSONANTS or c == "ER":
            symbols.append('C')
        elif c[0] in VOWELS:
            symbols.append('V')
    return _collapse(symbols)


def match(word, phon, special):
    """Return True if 'S' sits at the same position in both signatures.

    A leading "MC" in the spelling is expanded to "MAC" before the word
    signature is computed.
    """
    if word.startswith("MC"):
        word = "MAC" + word[2:]
    word_sig = sign(word, special)
    phon_sig = ph_sign(phon)
    return any(w == 'S' and p == 'S' for w, p in zip(word_sig, phon_sig))


def check_pos(word, phon, special, tol=2):
    """Return True if an EY phoneme lies close to `special` in the word.

    Phoneme indices are rescaled to the length of the spelling
    (index / number of phonemes * number of letters) and compared with the
    position of the first occurrence of `special`, taken at the middle of
    the pair.  The check passes when any EY phoneme is within `tol` letters.

    Returns False when `special` does not occur in `word` or when there is
    no phoneme starting with "EY".
    """
    ind = word.find(special)
    if ind == -1:
        return False
    ey_positions = [i for i, p in enumerate(phon) if p.startswith("EY")]
    if not ey_positions:
        # Also covers an empty phoneme list, so the division below is safe.
        return False
    plen = len(phon)
    wlen = len(word)
    wordpos = ind + 0.5
    return any(abs(i / plen * wlen - wordpos) <= tol for i in ey_positions)


def sounds_like(line, special):
    """Return True if `special` in the word of `line` is pronounced EY.

    line    -- "WORD PH1 PH2 ..." dictionary entry.
    special -- the letter pair under test ("IE" or "EI").

    An empty line yields False.
    """
    fields = line.split()
    if not fields:
        return False
    word = fields[0]
    phon = fields[1:]
    # check_pos already returns False when no phoneme starts with "EY".
    if not check_pos(word, phon, special):
        return False
    return match(word, phon, special)


def _headword(line):
    """Return the first whitespace-separated token of `line`, or "" if none."""
    fields = line.split(None, 1)
    return fields[0] if fields else ""


def against_rule2(line):
    """Return True if the word contains IE and that IE sounds like EY."""
    if "IE" not in _headword(line):
        return False
    return sounds_like(line, "IE")


def against_rule3(line):
    """Return True if the word has EI, has no CEI, and the EI does not sound like EY."""
    word = _headword(line)
    if "EI" not in word:
        return False
    if "CEI" in word:
        return False
    return not sounds_like(line, "EI")


def ibeforee_exceptions(path="cmudict_ie.txt"):
    """Print every dictionary entry that goes against the I-before-E rule.

    path -- dictionary file to read, one "WORD PH1 PH2 ..." entry per line.
            Blank lines are ignored.

    Each entry is tested against the rules in order and reported under the
    first one it violates; a summary with the counts is printed at the end.

    The following combinations are against the rule:
      1. CIE
      2. IE that sound like EY[012]
      3. EI that do not sound like EY[012] and have no C in front

    What about:
      EIE like in German names, which are on the list
      AIE like MAIER (M EY1 ER0) where the EY1 is more for AI than IE
      DIEGO (D IY0 EY1 G OW0) I and E are separate sounds
      ATHEIST where E and I are separate sounds
      DOSSIER (D AO2 S Y EY10)
      FRASIER'S F R EY1 ZH ER0 Z EY1 is for A
    """
    words = 0
    violate1 = violate2 = violate3 = 0
    with open(path) as f:
        for raw in f:
            elements = raw.split()
            if not elements:
                continue
            words += 1
            # Normalise the entry to single-space separation for printing.
            line = " ".join(elements)
            word = elements[0]
            if "CIE" in word:
                print("rule 1:", line)
                violate1 += 1
            elif against_rule2(line):
                print("rule 2:", line)
                violate2 += 1
            elif against_rule3(line):
                print("rule 3:", line)
                violate3 += 1
    print("number of words", words)
    print("CIE ", violate1)
    print("IE that sound like EY[012]", violate2)
    print("EI that do not sound like EY[012] and have no C in front", violate3)
    print("total number violations", violate1 + violate2 + violate3)


if __name__ == "__main__":
    ibeforee_exceptions()