Guest User

Untitled

a guest
Oct 21st, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.36 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. from itertools import izip_longest
  3. import io
  4. import re
  5.  
  6. with io.open(r'C:...python1k_erk.txt', 'r', encoding = 'utf8') as erkList,
  7. io.open(r'C:...python1k_wb.txt', 'r', encoding = 'utf8') as wbList,
  8. io.open(r'C:...pythonerrors.txt', 'w', encoding = 'utf8') as errorList:
  9.  
  10. #initialize numerator and denominator values for calculating accuracy
  11. nTotal = 1000
  12. nCorrect = nTotal
  13.  
  14. counter = 0
  15.  
  16. for erkLine, wbLine in izip_longest(erkList, wbList):
  17.  
  18. erkWord = erkLine.strip()
  19. wbWord = wbLine.strip()
  20.  
  21. # Rule 1: Word-initial V
  22. # Rule 1.1: Word-initial ^VCV -> ^V[=]CV
  23. match = re.match(u'^[AEIOUYaeiouy]([bcćdfghjklłmnńprsśtwzżź]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy].*(.*[=].*)*', wbWord)
  24. result = match.group() if match else None
  25.  
  26. if result == wbWord:
  27. wbWord = re.sub(u'(?<=^[AEIOUYaeiouy])(?=([bcćdfghjklłmnńprsśtwzżź]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy])', u'[=]', wbWord)
  28.  
  29. # Rule 1.2: Word-initial ^VCCV -> ^VC[=]CV
  30. match = re.match(u'^[AEIOUYaeiouy][bcćdfghjklłmnńprsśtwzżź](?:[bcćdfgjklłmnńprsśtw]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy].*(.*[=].*)*', wbWord)
  31. result = match.group() if match else None
  32.  
  33. if result == wbWord:
  34. wbWord = re.sub(u'(?<=^[AEIOUYaeiouy][bcćdfghjklłmnńprsśtwzżź])(?=([bcćdfgjklłmnńprsśtw]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy])', u'[=]', wbWord)
  35.  
  36. # Rule 2: V obstruent_1 obstruent_2 V -> V obstruent_1 [=] obstruent_2 V
  37. # Rule 2.1: sibilants + obstruent clusters
  38. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][=][ćsśzżźwf][ptkbdg][aąeęioóuyrfw].*(.*[=].*)*', wbWord)
  39. result = match.group() if match else None
  40.  
  41. if result == wbWord:
  42. #remove erroneous syllable break
  43. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?=[ćsśzżźwf][ptkbdg][aąeęioóuyrfw])', u'', wbWord)
  44. #add correct syllable break
  45. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][ćsśzżźwf])(?=[ptkbdg][aąeęioóuyrfw])', u'[=]', wbWord)
  46.  
  47. # Rule 2.2: V[=]ściVC? -> Vś[=]ciVC?
  48. match = re.match(u'(.*[=].*)*[aąeęioóuyj][=]ści[aąeęou]?.*',wbWord)
  49. result = match.group() if match else None
  50.  
  51. if result == wbWord:
  52.  
  53. wbWord = re.sub(u'(?<=[aąeęioóuyj])[=](?=ści[aąeęou]?)', u'', wbWord)
  54. wbWord = re.sub(u'(?<=[aąeęioóuyj]ś)(?=ci[aąeęou]?)', u'[=]', wbWord)
  55.  
  56. # Rule 2.3.1: V [=] fricative_1 fricative_2 V -> V fricative_1 [=] fricative_2 V (labiodentals before sibilants)
  57. match = re.match(u'(.*[=].*)*w?w?[AEIOUYaąeęioóuy][=][wf][szżź][aąeęioóuy].*(.*[=].*)*', wbWord)
  58. result = match.group() if match else None
  59.  
  60. if result == wbWord:
  61.  
  62. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?=[wf][szżź][aąeęioóuy])', u'', wbWord)
  63. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][wf])(?=[szżź][aąeęioóuy])', u'[=]', wbWord)
  64.  
  65. # Rule 2.3.2: V [=] fricative_1 fricative_2 V -> V fricative_1 [=] fricative_2 V (sibilants before labiodentals)
  66. match = re.match(u'(.*[=].*)*w?w?[AEIOUYaąeęioóuy][=][cćsśzżź][wf][aąeęioóuy].*(.*[=].*)*', wbWord)
  67. result = match.group() if match else None
  68.  
  69. if result == wbWord:
  70.  
  71. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?=[cćsśzżź][wf][aąeęioóuy])', u'', wbWord)
  72. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][cćsśzżź])(?=[wf][aąeęioóuy])', u'[=]', wbWord)
  73.  
  74. # Rule 2.4.1 w [=] digraph digraph w -> w digraph [=] digraph w
  75. match = re.match(u'(.*[=].*)*w?w?w?w[=][cdsr][hzż][cdsr][hzż]w?w?w?(.*[=].*)*', wbWord)
  76. result = match.group() if match else None
  77.  
  78. if result == wbWord:
  79.  
  80. wbWord = re.sub(u'(?<=w)[=](?=[cdsr][hzż][cdsr][hzż])', u'', wbWord)
  81. wbWord = re.sub(u'(?<=w[cdsr][hzż])(?=[cdsr][hzż])', u'[=]', wbWord)
  82.  
  83. # Rule 2.4.2 w digraph digraph [=] w-> w digraph [=] digraph w
  84. match = re.match(u'(.*[=].*)*w?w?w?[cdsr][hzż][cdsr][hzż][=]w?w?w?(.*[=].*)*', wbWord)
  85. result = match.group() if match else None
  86.  
  87. if result == wbWord:
  88.  
  89. wbWord = re.sub(u'(?<=[cdsr][hzż][cdsr][hzż])[=]', u'', wbWord)
  90. wbWord = re.sub(u'(?<=[cdsr][hzż])(?=[cdsr][hzż])', u'[=]', wbWord)
  91.  
  92. # Rule 3: V obstruent [=] sonorant V -> V [=] obstruent sonorant V
  93. # Rule 3.1: V monograph-obstruent [=] sonorant V -> V [=] monograph-obstruent sonorant V
  94. match = re.match(u'(.*[=].*)*w?w?[AEIOUYaąeęioóuy][bcćdfghkpsśtwzżź][=][jlłmnr][zżź]?[aąeęioóuy](.*[=].*)*', wbWord)
  95. result = match.group() if match else None
  96.  
  97. if result == wbWord:
  98.  
  99. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][bcćdfghkpsśtwzżź])[=](?=[jlłmnr][zżź]?[aąeęioóuy])', u'', wbWord)
  100. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])(?=[bcćdfghkpsśtwzżź][jlłmnr][zżź]?[aąeęioóuy])', u'[=]', wbWord)
  101.  
  102. # Rule 3.2.1: V digraph-obstruent [=] sonorant V -> V [=] digraph-obstruent sonorant V ==> all bigraphs except ch, namely sz cz dz rz
  103. match = re.match(u'(.*[=].*)*w?w?[AEIOUYaąeęioóuy][scr][zżź][=][lłmnr][aąeęioóuy](.*[=].*)*', wbWord)
  104. result = match.group() if match else None
  105.  
  106. if result == wbWord:
  107.  
  108. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][scdr][zżź])[=](?=[lłmnr][aąeęioóuy])', u'', wbWord)
  109. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])(?=[scdr][zżź][lłmnr][aąeęioóuy])', u'[=]', wbWord)
  110.  
  111. # Rule 3.2.2: V digraph-obstruent [=] sonorant V -> V [=] digraph-obstruent sonorant V ===> only ch
  112. match = re.match(u'(.*[=].*)*w?w?[AEIOUYaąeęioóuy]ch[=][lłmnr][aąeęioóuy](.*[=].*)*',wbWord)
  113. result = match.group() if match else None
  114.  
  115. if result == wbWord:
  116.  
  117. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy]ch)[=](?=[lłmnr][aąeęioóuy])', u'', wbWord)
  118. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])(?=ch[lłmnr][aąeęioóuy])', u'[=]', wbWord)
  119.  
  120. #Rule 4: V [=] sonorant obstruent -> V sonorant [=] obstruent V
  121. match = re.match(u'(.*[=].*)*.*[AEIOUYaąeęioóuy][=][jlłmn][bcdfghkpstvwzż][aąeęioóuy].*(.*[=].*)*', wbWord)
  122. result = match.group() if match else None
  123.  
  124. if result == wbWord:
  125.  
  126. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?=[jlłmn][bcdfghkpstvwzż][aąeęioóuy])', u'', wbWord)
  127. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][jlłmn])(?=[bcdfghkpstvwzż][aąeęioóuy])', u'[=]', wbWord)
  128.  
  129. #triconsonantal clusters
  130. #Rule 5.1.1: V [=] C1 C2 C3 V -> V C1 [=] C2 C3 V
  131. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][=][wszmn][ptkbdg][ptkbdgcrmn][zh]?[aąeęioóuy]w?w?w?(.*[=].*)*', wbWord)
  132. result = match.group() if match else None
  133.  
  134. if result == wbWord:
  135.  
  136. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?=[wszmn][ptkbdg][ptkbdgcrmn][zh]?[aąeęioóuy])', u'', wbWord)
  137. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][wszmn])(?=[ptkbdg][ptkbdgcrmn][zh]?[aąeęioóuy])', u'[=]', wbWord)
  138.  
  139. #Rule 5.1.2: V C1 C2 [=] C3 V -> V C1 [=] C2 C3 V
  140. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][wszmn][ptkbdg][=][ptkbdgrmn]z?[aąeęioóuy]w?w?w?(.*[=].*)*',wbWord)
  141. result = match.group() if match else None
  142.  
  143. if result == wbWord:
  144.  
  145. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][wsśzmn][ptkbdg])[=](?:[ptkbdgrmn]z?[aąeęioóuy])', u'', wbWord)
  146. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][wsśzmn])(?=[ptkbdg][ptkbdgrmn]z?[aąeęioóuy])', u'[=]', wbWord)
  147.  
  148. #Rule 5.2 stop stop sonorant
  149. #Rule 5.2.1: V [=] C1 C2 C3 V -> V C1 [=] C2 C3 V
  150. match = re.match(u'(.*[=].*)*w?w?[AEIOUYaąeęioóuy][=][ptkbdg][ptkbdg][crslł][zh]?[aąeęioóuy]w?w?(.*[=].*)*', wbWord)
  151. result = match.group() if match else None
  152.  
  153. if result == wbWord:
  154.  
  155. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?=[ptkbdg][ptkbdg][crslł][zh]?[aąeęioóuy])', u'', wbWord)
  156. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][pthbdg])(?=[ptkbdg][crslł][zh]?[aąeęioóuy])', u'[=]', wbWord)
  157.  
  158. #Rule 5.2.2: V C1 C2 [=] C3 V -> V C1 [=] C2 C3 V
  159. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][ptkbdg][ptkbdg][=][rlł]z?[aąeęioóuy]w?w?w?(.*[=].*)*', wbWord)
  160. result = match.group() if match else None
  161.  
  162. if result == wbWord:
  163.  
  164. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][ptkbdg][ptkbdg])[=](?:[rlł]z?[aąeęioóuy])', u'', wbWord)
  165. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][ptkbdg])(?=[ptkbdg][rlł]z?[aąeęioóuy])', u'[=]', wbWord)
  166.  
  167.  
  168. #Rule 5.3.1 sibilant/nasal stop sonorant
  169. #5.3.1: V C1 C2 C3 V -> V C1 [=] C2 C3 V
  170. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][=][ptkbdg][ptkbdg][rlł]z?[aąeęioóuy]w?w?w?(.*[=].*)*', wbWord)
  171. result = match.group() if match else None
  172.  
  173. if result == wbWord:
  174.  
  175. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])[=](?:[ptkbdg][ptkbdg][rlł]z?[aąeęioóuy])', u'', wbWord)
  176. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][ptkbdg])(?=[ptkbdg][rlł]z?[aąeęioóuy])', u'[=]', wbWord)
  177.  
  178. #Rule 5.3.2: V C1 C2 [=] C3 V -> V C1 [=] C2 C3 V
  179. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][wszmn][ptkbdg][=][łlr]z?[aąeęioóuy]w?w?w?(.*[=].*)*', wbWord)
  180. result = match.group() if match else None
  181.  
  182. if result == wbWord:
  183.  
  184. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy])(?=[wszmn][ptkbdg][=][łlr]z?[aąeęioóuy])', u'', wbWord)
  185. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][wszmn])(?=[ptkbdg][lłr]z?[aąeęioóuy])', u'[=]', wbWord)
  186.  
  187. #Rule 6: divide two adjacent vowels
  188. match = re.match(u'(.*[=].*)*w?w?w?[AEOUaeouy][aeoui]w?[=](.*[=].*)*', wbWord)
  189. result = match.group() if match else None
  190.  
  191. if result == wbWord:
  192.  
  193. wbWord = re.sub(u'(?<=[AEOUaeouy])(?=[aeoui]w?[=])', u'[=]', wbWord)
  194.  
  195. #Rule 7: 4-consonant clusters
  196. match = re.match(u'(.*[=].*)*w?w?w?[AEIOUYaąeęioóuy][bcćdfghkpstwzżźjlmnr][=][bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][aąeęioóuy]w?w?w?(.*[=].*)*',wbWord)
  197. result = match.group() if match else None
  198.  
  199. if result == wbWord:
  200.  
  201. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][bcćdfghkpstwzżźjlmnr])[=](?=[bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][aąeęioóuy])', u'', wbWord)
  202. wbWord = re.sub(u'(?<=[AEIOUYaąeęioóuy][bcćdfghkpstwzżźjlmnr][bcćdfgkpsśtwjlłmnr])(?=[bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][aąeęioóuy])', u'[=]', wbWord)
  203.  
  204. if erkWord != wbWord:
  205.  
  206. outLine = wbWord + "t" + erkWord + "n"
  207. errorList.write(outLine)
  208.  
  209. nCorrect -= 1
  210.  
  211. print float(nCorrect) / nTotal
Add Comment
Please, Sign In to add comment