# ibeforee

Dec 3rd, 2013
1. from __future__ import division
2.
3. import itertools as IT
4.
# Letters classified as vowels when building signatures; note that "Y" is
# counted as a vowel here.
VOWELS = "AEIOUY"
# All remaining uppercase ASCII letters are classified as consonants.
CONSONANTS = "BCDFGHJKLMNPQRSTVWXZ"
7.
def join_S_V(res):
    """Join class markers into a string and absorb vowels next to 'S'.

    ``res`` is an iterable of one-character markers ('S', 'V', 'C').
    After joining, every 'V' that directly follows an 'S' is removed,
    then every 'V' that directly precedes an 'S' is removed, repeating
    until no such pair is left.  The resulting string is returned.
    """
    merged = "".join(res)
    # Order matters only for efficiency of reading: trailing vowels are
    # absorbed first, leading vowels second, exactly one pattern at a time.
    for pair in ("SV", "VS"):
        while pair in merged:
            merged = merged.replace(pair, "S")
    return merged
13.
def sign(word, special):
    """Return the collapsed letter-class signature of ``word``.

    The word is scanned left to right.  Wherever the next two characters
    equal ``special`` an 'S' marker is emitted and both characters are
    consumed; otherwise a consonant yields 'C' and a vowel yields 'V'.
    Vowel markers adjacent to 'S' are absorbed by ``join_S_V`` and runs of
    equal markers are collapsed to a single marker.

    Characters that are neither in ``CONSONANTS`` nor in ``VOWELS``
    (apostrophes, digits, parentheses, ...) are skipped.  Previously such a
    character re-used the marker of the preceding character, or raised
    because no marker existed yet when it came first; since repeated
    markers are collapsed anyway, skipping gives the same signature for
    every input that used to work.

    Returns a list of one-character strings.
    """
    res = []
    i = 0
    while i < len(word):
        if word[i:i + 2] == special:
            res.append('S')
            # Consume both characters of the special pair.
            i += 2
            continue
        ch = word[i]
        i += 1
        if ch in CONSONANTS:
            res.append('C')
        elif ch in VOWELS:
            res.append('V')
        # Any other character contributes no marker.
    res = join_S_V(res)
    return [k for k, _ in IT.groupby(res)]
29.
def ph_sign(phon, special="EY"):
    """Return the collapsed sound-class signature of a phoneme sequence.

    ``phon`` is an iterable of phoneme symbols (strings).  Only the first
    two characters of each symbol are inspected, so a trailing digit as in
    "EY1" is ignored.  A symbol whose two-character prefix equals
    ``special`` yields 'S'; a symbol starting with a consonant letter, or
    the symbol "ER", yields 'C'; any other symbol starting with a vowel
    letter yields 'V'.  Vowel markers adjacent to 'S' are absorbed by
    ``join_S_V`` and runs of equal markers are collapsed.

    Empty symbols and symbols starting with a character that is neither a
    vowel nor a consonant are skipped.  Previously an empty symbol raised
    IndexError and an unknown symbol re-used the previous marker (or raised
    when it came first); because repeated markers are collapsed, skipping
    leaves the result unchanged for inputs that used to work.

    Returns a list of one-character strings.
    """
    res = []
    for p in phon:
        c = p[:2]
        if not c:
            continue
        if c == special:
            res.append('S')
        elif c[0] in CONSONANTS or c == "ER":
            # "ER" starts with a vowel letter but is classed as a consonant.
            res.append('C')
        elif c[0] in VOWELS:
            res.append('V')
        # Any other symbol contributes no marker.
    res = join_S_V(res)
    return [k for k, _ in IT.groupby(res)]
43.
def match(word, phon, special):
    """Tell whether spelling and pronunciation line up on the special pair.

    The spelling signature of ``word`` (built with ``special``) and the
    sound signature of ``phon`` (built with the default special sound of
    ``ph_sign``) are compared position by position; the result is True when
    some position holds 'S' in both signatures.  A leading "MC" is expanded
    to "MAC" before the spelling signature is computed.
    """
    if word.startswith("MC"):
        word = "MAC" + word[2:]
    spelled = sign(word, special)
    spoken = ph_sign(phon)
    return any(a == 'S' and b == 'S' for a, b in zip(spelled, spoken))
49.
def check_pos(word, phon, special):
    """Check that an "EY" phoneme lies near ``special`` inside ``word``.

    The index of every phoneme starting with "EY" is rescaled from the
    phoneme sequence length to the word length and compared with the
    midpoint of the first occurrence of ``special`` in ``word``.  Returns
    True when at least one rescaled index is within 2 letters of that
    midpoint, and False otherwise (including when no phoneme starts with
    "EY").

    Raises ValueError if ``special`` does not occur in ``word``.
    """
    first = word.index(special)
    n_letters = len(word)
    n_phon = len(phon)
    max_gap = 2
    target = first + 0.5
    ey_slots = [k for k, symbol in enumerate(phon) if symbol.startswith("EY")]
    # True division is intended here (the module enables it via __future__).
    # With no "EY" phoneme the generator is empty and any() yields False.
    return any(abs(k / n_phon * n_letters - target) <= max_gap
               for k in ey_slots)
61.
def sounds_like(line, special):
    """Decide whether ``special`` in the word of ``line`` sounds like "EY".

    ``line`` is one dictionary entry: the word followed by its phoneme
    symbols, separated by whitespace.  The result is True when an "EY"
    phoneme lies close to the position of ``special`` in the word
    (``check_pos``) and the spelling and sound signatures agree on it
    (``match``).

    Returns False for a blank line or when ``special`` does not occur in
    the word; these cases previously raised IndexError / ValueError.  The
    former separate test for the presence of an "EY" phoneme was dropped
    because ``check_pos`` already returns False in that situation.
    """
    fields = line.split()
    if not fields:
        return False
    word, phon = fields[0], fields[1:]
    if special not in word:
        return False
    if not check_pos(word, phon, special):
        return False
    return match(word, phon, special)
71.
def against_rule2(line):
    """Return True when the entry contains "IE" that sounds like "EY".

    ``line`` is a dictionary entry whose first whitespace-separated field
    is the word.  Entries without "IE" in the word give False.
    """
    word = line.split()[0]
    return "IE" in word and sounds_like(line, "IE")
78.
def against_rule3(line):
    """Return True when the entry has "EI" that does not sound like "EY".

    ``line`` is a dictionary entry whose first whitespace-separated field
    is the word.  Entries without "EI", and entries whose word contains
    "CEI", give False.
    """
    word = line.split()[0]
    if "EI" not in word or "CEI" in word:
        return False
    return not sounds_like(line, "EI")
87.
def ibeforee_exceptions(path="cmudict_ie.txt"):
    """Print every dictionary entry that breaks the "i before e" rule.

    The following combinations are counted as being against the rule:
       1. CIE
       2. IE that sound like EY[012]
       3. EI that do not sound like EY[012] and have no C in front

    Special cases noted by the original author:
           EIE like in German names, which are on the list
           AIE like MAIER (M EY1 ER0) where the EY1 is more for AI than IE
           DIEGO (D IY0 EY1 G OW0) I and E are separate sounds
           ATHEIST where E and I are separate sounds
           DOSSIER (D AO2 S Y EY10)
           FRASIER'S  F R EY1 ZH ER0 Z EY1 is for A

    ``path`` names the dictionary file to read; each non-blank line holds a
    word followed by its phoneme symbols, separated by whitespace.  Each
    violating entry is printed with the number of the rule it breaks (an
    entry is reported for the first matching rule only), followed by a
    summary of the counts.  Returns None.

    Every print call passes one pre-formatted string, so the output is the
    same whether ``print`` is the Python 2 statement or the Python 3
    function.
    """
    words = 0
    violate1 = violate2 = violate3 = 0
    # Open the file exactly once; the context manager closes it.
    with open(path) as f:
        for raw in f:
            # split() discards the newline and surrounding whitespace, so
            # no character is chopped off a final line lacking a newline.
            elements = raw.split()
            if not elements:
                # Blank lines carry no entry.
                continue
            words += 1
            line = " ".join(elements)
            word = elements[0]
            if "CIE" in word:
                print("%s %s" % ("rule 1:", line))
                violate1 += 1
            elif against_rule2(line):
                print("%s %s" % ("rule 2:", line))
                violate2 += 1
            elif against_rule3(line):
                print("%s %s" % ("rule 3:", line))
                violate3 += 1
    print("%s %s" % ("number of words", words))
    print("%s %s" % ("CIE            ", violate1))
    print("%s %s" % ("IE that sound like EY[012]", violate2))
    print("%s %s" % ("EI that do not sound like EY[012] and have no C in front", violate3))
    print("%s %s" % ("total number violations", violate1 + violate2 + violate3))
125.
# Script body: run the analysis and print the report.
ibeforee_exceptions()
