Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""Post-correct syllable breaks in a word list and score them against a reference.

Each line of the "wb" file holds one word whose syllable breaks are written as
the literal three characters ``[=]``.  An ordered list of regex rules moves or
inserts breaks, the result is compared with the same line of the "erk" file,
every mismatch is written to the error file as ``<corrected>TAB<reference>``,
and the share of matching words is printed.

NOTE(review): the pasted source this was rebuilt from had evidently lost its
backslashes (no line continuations in the ``with`` statement, "t"/"n" instead
of tab/newline, ``[=]`` in patterns but a literal "[=]" in replacements, bare
``w?`` padding).  The patterns below restore ``\\[=\\]`` and ``\\w``; please
confirm them against the author's original script.  The file paths are kept
exactly as pasted and look elided as well.
"""
try:
    from itertools import izip_longest  # Python 2
except ImportError:  # Python 3 renamed it
    from itertools import zip_longest as izip_longest
import io
import re

# Paths exactly as they appear in the pasted source (presumably shortened).
ERK_PATH = r'C:...python1k_erk.txt'
WB_PATH = r'C:...python1k_wb.txt'
ERRORS_PATH = r'C:...pythonerrors.txt'

# Literal syllable-break marker inserted by the rules.
BREAK = u'[=]'


def _rule(guard, *edits):
    """Compile one rule: a whole-word guard plus its ordered (pattern, replacement) edits."""
    # re.UNICODE makes \w cover the Polish letters on Python 2 as well
    # (it is already the default for text patterns on Python 3).
    return (re.compile(guard, re.UNICODE),
            tuple((re.compile(pattern, re.UNICODE), replacement)
                  for pattern, replacement in edits))


# Rules run strictly in this order; later rules see the output of earlier ones.
_RULES = (
    # Rule 1: Word-initial V
    # Rule 1.1: Word-initial ^VCV -> ^V[=]CV
    _rule(u'^[AEIOUYaeiouy]([bcćdfghjklłmnńprsśtwzżź]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy].*(.*\\[=\\].*)*',
          (u'(?<=^[AEIOUYaeiouy])(?=([bcćdfghjklłmnńprsśtwzżź]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy])', BREAK)),
    # Rule 1.2: Word-initial ^VCCV -> ^VC[=]CV
    _rule(u'^[AEIOUYaeiouy][bcćdfghjklłmnńprsśtwzżź](?:[bcćdfgjklłmnńprsśtw]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy].*(.*\\[=\\].*)*',
          (u'(?<=^[AEIOUYaeiouy][bcćdfghjklłmnńprsśtwzżź])(?=([bcćdfgjklłmnńprsśtw]|rz|sz|cz|dz|dż|dź|ch)[aąeęioóuy])', BREAK)),

    # Rule 2: V obstruent_1 obstruent_2 V -> V obstruent_1 [=] obstruent_2 V
    # Rule 2.1: sibilants + obstruent clusters
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy]\\[=\\][ćsśzżźwf][ptkbdg][aąeęioóuyrfw].*(.*\\[=\\].*)*',
          # remove erroneous syllable break
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[ćsśzżźwf][ptkbdg][aąeęioóuyrfw])', u''),
          # add correct syllable break
          (u'(?<=[AEIOUYaąeęioóuy][ćsśzżźwf])(?=[ptkbdg][aąeęioóuyrfw])', BREAK)),
    # Rule 2.2: V[=]ściVC? -> Vś[=]ciVC?
    _rule(u'(.*\\[=\\].*)*[aąeęioóuyj]\\[=\\]ści[aąeęou]?.*',
          (u'(?<=[aąeęioóuyj])\\[=\\](?=ści[aąeęou]?)', u''),
          (u'(?<=[aąeęioóuyj]ś)(?=ci[aąeęou]?)', BREAK)),
    # Rule 2.3.1: V [=] fricative_1 fricative_2 V -> V fricative_1 [=] fricative_2 V
    # (labiodentals before sibilants)
    _rule(u'(.*\\[=\\].*)*\\w?\\w?[AEIOUYaąeęioóuy]\\[=\\][wf][szżź][aąeęioóuy].*(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[wf][szżź][aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][wf])(?=[szżź][aąeęioóuy])', BREAK)),
    # Rule 2.3.2: same, sibilants before labiodentals
    _rule(u'(.*\\[=\\].*)*\\w?\\w?[AEIOUYaąeęioóuy]\\[=\\][cćsśzżź][wf][aąeęioóuy].*(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[cćsśzżź][wf][aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][cćsśzżź])(?=[wf][aąeęioóuy])', BREAK)),
    # Rule 2.4.1: w [=] digraph digraph w -> w digraph [=] digraph w
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?\\w\\[=\\][cdsr][hzż][cdsr][hzż]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=\\w)\\[=\\](?=[cdsr][hzż][cdsr][hzż])', u''),
          (u'(?<=\\w[cdsr][hzż])(?=[cdsr][hzż])', BREAK)),
    # Rule 2.4.2: w digraph digraph [=] w -> w digraph [=] digraph w
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[cdsr][hzż][cdsr][hzż]\\[=\\]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[cdsr][hzż][cdsr][hzż])\\[=\\]', u''),
          (u'(?<=[cdsr][hzż])(?=[cdsr][hzż])', BREAK)),

    # Rule 3: V obstruent [=] sonorant V -> V [=] obstruent sonorant V
    # Rule 3.1: monograph obstruent
    _rule(u'(.*\\[=\\].*)*\\w?\\w?[AEIOUYaąeęioóuy][bcćdfghkpsśtwzżź]\\[=\\][jlłmnr][zżź]?[aąeęioóuy](.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy][bcćdfghkpsśtwzżź])\\[=\\](?=[jlłmnr][zżź]?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy])(?=[bcćdfghkpsśtwzżź][jlłmnr][zżź]?[aąeęioóuy])', BREAK)),
    # Rule 3.2.1: digraph obstruent, all digraphs except ch (sz cz dz rz).
    # The guard now uses [scdr] like its substitutions; the pasted guard had
    # [scr], so "dz" could never trigger the rule.
    _rule(u'(.*\\[=\\].*)*\\w?\\w?[AEIOUYaąeęioóuy][scdr][zżź]\\[=\\][lłmnr][aąeęioóuy](.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy][scdr][zżź])\\[=\\](?=[lłmnr][aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy])(?=[scdr][zżź][lłmnr][aąeęioóuy])', BREAK)),
    # Rule 3.2.2: digraph obstruent, only ch
    _rule(u'(.*\\[=\\].*)*\\w?\\w?[AEIOUYaąeęioóuy]ch\\[=\\][lłmnr][aąeęioóuy](.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy]ch)\\[=\\](?=[lłmnr][aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy])(?=ch[lłmnr][aąeęioóuy])', BREAK)),

    # Rule 4: V [=] sonorant obstruent V -> V sonorant [=] obstruent V
    _rule(u'(.*\\[=\\].*)*.*[AEIOUYaąeęioóuy]\\[=\\][jlłmn][bcdfghkpstvwzż][aąeęioóuy].*(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[jlłmn][bcdfghkpstvwzż][aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][jlłmn])(?=[bcdfghkpstvwzż][aąeęioóuy])', BREAK)),

    # Triconsonantal clusters
    # Rule 5.1.1: V [=] C1 C2 C3 V -> V C1 [=] C2 C3 V
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy]\\[=\\][wszmn][ptkbdg][ptkbdgcrmn][zh]?[aąeęioóuy]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[wszmn][ptkbdg][ptkbdgcrmn][zh]?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][wszmn])(?=[ptkbdg][ptkbdgcrmn][zh]?[aąeęioóuy])', BREAK)),
    # Rule 5.1.2: V C1 C2 [=] C3 V -> V C1 [=] C2 C3 V
    # The removal step uses a lookahead; the pasted (?:...) consumed and
    # deleted the letters following the break.
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy][wszmn][ptkbdg]\\[=\\][ptkbdgrmn]z?[aąeęioóuy]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy][wsśzmn][ptkbdg])\\[=\\](?=[ptkbdgrmn]z?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][wsśzmn])(?=[ptkbdg][ptkbdgrmn]z?[aąeęioóuy])', BREAK)),
    # Rule 5.2: stop stop sonorant
    # Rule 5.2.1: V [=] C1 C2 C3 V -> V C1 [=] C2 C3 V
    # The re-insert lookbehind uses [ptkbdg]; the pasted [pthbdg] dropped the
    # break altogether whenever C1 was "k".
    _rule(u'(.*\\[=\\].*)*\\w?\\w?[AEIOUYaąeęioóuy]\\[=\\][ptkbdg][ptkbdg][crslł][zh]?[aąeęioóuy]\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[ptkbdg][ptkbdg][crslł][zh]?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][ptkbdg])(?=[ptkbdg][crslł][zh]?[aąeęioóuy])', BREAK)),
    # Rule 5.2.2: V C1 C2 [=] C3 V -> V C1 [=] C2 C3 V  (lookahead fix as in 5.1.2)
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy][ptkbdg][ptkbdg]\\[=\\][rlł]z?[aąeęioóuy]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy][ptkbdg][ptkbdg])\\[=\\](?=[rlł]z?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][ptkbdg])(?=[ptkbdg][rlł]z?[aąeęioóuy])', BREAK)),
    # Rule 5.3.1: labelled "sibilant/nasal stop sonorant" in the source.
    # NOTE(review): its classes are stop-stop-sonorant, i.e. a subset of what
    # Rule 5.2.1 already handles -- possibly a copy/paste slip; left as found
    # apart from the lookahead fix.
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy]\\[=\\][ptkbdg][ptkbdg][rlł]z?[aąeęioóuy]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy])\\[=\\](?=[ptkbdg][ptkbdg][rlł]z?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][ptkbdg])(?=[ptkbdg][rlł]z?[aąeęioóuy])', BREAK)),
    # Rule 5.3.2: V C1 C2 [=] C3 V -> V C1 [=] C2 C3 V
    # The pasted removal step matched an empty string and replaced it with an
    # empty string, so the misplaced break was never removed; it now deletes
    # the break after C1 C2 like Rules 5.1.2 / 5.2.2 do.
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy][wszmn][ptkbdg]\\[=\\][łlr]z?[aąeęioóuy]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy][wszmn][ptkbdg])\\[=\\](?=[łlr]z?[aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][wszmn])(?=[ptkbdg][lłr]z?[aąeęioóuy])', BREAK)),

    # Rule 6: divide two adjacent vowels
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEOUaeouy][aeoui]\\w?\\[=\\](.*\\[=\\].*)*',
          (u'(?<=[AEOUaeouy])(?=[aeoui]\\w?\\[=\\])', BREAK)),

    # Rule 7: 4-consonant clusters
    _rule(u'(.*\\[=\\].*)*\\w?\\w?\\w?[AEIOUYaąeęioóuy][bcćdfghkpstwzżźjlmnr]\\[=\\][bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][aąeęioóuy]\\w?\\w?\\w?(.*\\[=\\].*)*',
          (u'(?<=[AEIOUYaąeęioóuy][bcćdfghkpstwzżźjlmnr])\\[=\\](?=[bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][aąeęioóuy])', u''),
          (u'(?<=[AEIOUYaąeęioóuy][bcćdfghkpstwzżźjlmnr][bcćdfgkpsśtwjlłmnr])(?=[bcćdfgkpsśtwjlłmnr][bcćdfgkpsśtwjlłmnr][aąeęioóuy])', BREAK)),
)


def syllabify(word):
    """Return *word* with every applicable rule applied, in rule order.

    A rule fires only when its guard pattern matches the whole word; its
    edits (usually "remove the misplaced break", then "insert the break in
    the right place") are then applied one after another.
    """
    for guard, edits in _RULES:
        found = guard.match(word)
        # Same test as the original script: the match must span the full word.
        if found is not None and found.group() == word:
            for pattern, replacement in edits:
                word = pattern.sub(replacement, word)
    return word


def score_pairs(erk_lines, wb_lines):
    """Compare corrected "wb" words with the "erk" reference, line by line.

    Returns ``(n_correct, n_total, error_records)`` where each error record
    is ``corrected + TAB + reference + NEWLINE``.  If one input is shorter,
    its missing lines are treated as empty words (and so count as errors)
    instead of crashing on ``None.strip()``.
    """
    nCorrect = 0
    nTotal = 0
    errorRecords = []
    for erkLine, wbLine in izip_longest(erk_lines, wb_lines, fillvalue=u''):
        nTotal += 1
        erkWord = erkLine.strip()
        wbWord = syllabify(wbLine.strip())
        if erkWord == wbWord:
            nCorrect += 1
        else:
            errorRecords.append(wbWord + u"\t" + erkWord + u"\n")
    return nCorrect, nTotal, errorRecords


def main(erk_path=ERK_PATH, wb_path=WB_PATH, errors_path=ERRORS_PATH):
    """Score the word lists, write the mismatches, print and return the accuracy."""
    with io.open(erk_path, 'r', encoding='utf8') as erkList, \
            io.open(wb_path, 'r', encoding='utf8') as wbList, \
            io.open(errors_path, 'w', encoding='utf8') as errorList:
        nCorrect, nTotal, errorRecords = score_pairs(erkList, wbList)
        errorList.writelines(errorRecords)
    # The denominator is the number of lines actually read (the script used a
    # fixed 1000); an empty input yields 0.0 rather than a division by zero.
    accuracy = float(nCorrect) / nTotal if nTotal else 0.0
    print(accuracy)
    return accuracy


if __name__ == '__main__':
    main()
Add Comment
Please, Sign In to add comment