# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
# (Pastebin page boilerplate captured together with the script; not part of the program.)
import os
import re
import datetime
from datetime import datetime, timedelta  # rebinds the name `datetime` to the class

# Date stamp embedded in the output file name.  The clock is shifted back by
# four hours before formatting, so a run before 04:00 gets yesterday's date.
date = datetime.strftime(datetime.now() + timedelta(hours=-4), "%Y-%m-%d")

# HOME is frequently unset on Windows; fall back to the platform's home
# directory instead of failing on the environment lookup itself.
_home = os.environ.get('HOME') or os.path.expanduser('~')

# Known home directories mapped to the Dropbox root on that machine.
# An unknown home directory still raises KeyError, as before.
dropboxfolder = {
    r'/home/hellerick': r'/home/hellerick/Dropbox',
    r'C:\Users\OTK': r'D:\Dropbox',
}[_home]

pyscriptfolder = os.path.join(dropboxfolder, 'Programming', 'Python')
inputfilename = os.path.join(
    dropboxfolder, 'KSV-shared', 'Lib', 'Jefremov, Ivan Anatoljevich',
    'Tumannostj Andromedy', 'EFRITUMA.htm')
# Insert ".lat.<date>" in front of the last extension of the input name.
outputfilename = re.sub(r'(\.[^.]+\Z)', r'.lat.' + date + r'\1', inputfilename)

# hyphenate: read by hyphen() and makemaindict().
# simplify: gates the dictionary loading below and the dictionary-driven
#           rewriting inside makemaindict().
hyphenate = True
simplify = True

# Lower-case Cyrillic word -> romanized word; rebuilt by makemaindict().
maindict = dict()

if simplify:
    _dictfolder = os.path.join(pyscriptfolder, 'RussianRomanization')

    # One entry per non-empty line.
    with open(os.path.join(_dictfolder, 'KeepDuplicationDictionary.txt'), mode='rt', encoding='utf-8') as f:
        keepdupldict = [line for line in f.read().split('\n') if line != '']

    # Tab-separated "key<TAB>value" lines; lines without a tab are skipped and
    # any fields after the second are ignored.
    with open(os.path.join(_dictfolder, 'SpecialCasesDictionary.txt'), mode='rt', encoding='utf-8') as f:
        specdict = f.read().split('\n')
    specdict = {
        fields[0]: fields[1]
        for fields in (line.split('\t') for line in specdict)
        if len(fields) > 1
    }
def checkspecialspelling(w):
    """Apply every matching special-case spelling from ``specdict`` to *w*.

    Each key of the module-level ``specdict`` is treated as a regular
    expression in which ``*`` stands for "any run of characters".  A key is
    considered applicable when it matches at the start of *w* (``re.match``
    semantics, so the end of *w* is not anchored).  For an applicable key,
    the key with its ``*`` removed is replaced everywhere in *w* by the
    associated dictionary value.

    Keys are not regex-escaped, so any other regex metacharacters they contain
    keep their regex meaning.

    Returns the rewritten string.
    """
    for key, spelling in specdict.items():
        # str.replace avoids the invalid '\*' escape the non-raw pattern used.
        if re.match(key.replace('*', '.*'), w):
            w = re.sub(key.replace('*', ''), spelling, w)
    return w
def keepduplication(w):
    """Re-apply the doubled-consonant form of matching ``keepdupldict`` entries.

    Each entry of the module-level ``keepdupldict`` is treated as a regular
    expression in which ``*`` stands for "any run of characters".  When the
    entry matches at the start of *w*, the entry without its wildcards is
    substituted in *w* by a copy of itself in which each doubled consonant has
    been rewritten.

    NOTE(review): the rewrite replaces a doubled consonant with ``\\1\\1``,
    i.e. with the same two letters, so for plain-word entries the text comes
    back unchanged.  Possibly a separator between the two letters was lost
    when this script was copied -- confirm against the author's version.

    Returns the (possibly rewritten) string.
    """
    for entry in keepdupldict:
        pattern = entry.replace('*', '.*')
        if re.match(pattern, w):
            # Strip the wildcards again to get the literal core of the entry.
            core = pattern.replace('.*', '')
            doubled = re.sub(r'([бвгджзклмнпрстфхцчшщ])\1', r'\1\1', core)
            w = re.sub(core, doubled, w)
    return w
# Context-dependent rewrites, applied in this exact order before the final
# letter-for-letter mapping.  Later rules rely on the output of earlier ones.
_TRANSLIT_RULES = (
    # "и" directly before "а"/"у" becomes "ï" ...
    (r'и([ау])', r'ï\1'),
    # ... except after "ц", where it becomes a plain Latin "i".
    (r'цï([ау])', r'цi\1'),
    # "л" before я/ю/е/и/ï, and "ль", become Latin "l"
    # (any remaining "л" is mapped to "ł" by the final table).
    (r'ля', r'lа'),
    (r'лю', r'lу'),
    (r'л([еиï])', r'l\1'),
    (r'ль([еёио])', r'lй\1'),
    (r'ль', r'l'),
    (r'щ', r'сч'),
    # "ю"/"я" after one of the listed consonants -> "иу"/"иа", otherwise "йу"/"йа".
    (r'([б-джзкмнп-тф-ш])ю', r'\1иу'),
    (r'([б-джзкмнп-тф-ш])я', r'\1иа'),
    (r'ю', r'йу'),
    (r'я', r'йа'),
    (r'ъ', r'й'),
    # Collapse "йй"/"ьй" produced by the rules above.
    (r'[йь]й', r'й'),
    # A Cyrillic "Л"/"л" directly before an already converted "L"/"l"
    # becomes Latin as well.
    (r'ЛL', r'LL'),
    (r'Лl', r'Ll'),
    (r'лl', r'll'),
    # Drop a word-final "ь" after ж/ч/ш.
    (r'([жчш])ь\b', r'\1'),
)

# Plain one-to-one mapping for every letter still in Cyrillic after the rules.
_TRANSLIT_TABLE = str.maketrans(
    'абцчдеёэфгхийьклмнопрсштувызж',
    'abcčdeëèfghijjkłmnoprsštuvyzž',
)


def translit(w):
    """Romanize lower-case Cyrillic text.

    The ordered regex rules in ``_TRANSLIT_RULES`` are applied first; the
    remaining Cyrillic letters are then mapped one-to-one in a single
    ``str.translate`` pass.  Characters covered by neither step (spaces,
    Latin letters, punctuation, upper-case Cyrillic other than the "Л" rules)
    pass through unchanged.

    Args:
        w: text to convert; may contain several space-separated words.

    Returns:
        The romanized text.
    """
    for pattern, replacement in _TRANSLIT_RULES:
        w = re.sub(pattern, replacement, w)
    return w.translate(_TRANSLIT_TABLE)
def hyphen(w):
    """Run the syllable-break rule chain over romanized text *w*.

    When the module-level ``hyphenate`` flag is false the text is returned
    as is.  Otherwise word starts/ends are tagged with ``<<``/``>>``, a ``_``
    break mark is placed between all adjacent letters, and an ordered list of
    rules removes or moves the marks that are not wanted.  The last two rules
    delete the ``<<``/``>>`` tags and every ``_``.

    NOTE(review): because the final rule replaces ``_`` with the empty string,
    no break marks survive in the returned text, and literal ``-`` characters
    (turned into ``_`` along the way) are removed too.  Possibly the intended
    replacement character was lost when the script was copied -- confirm.

    Returns the processed text.
    """
    if not hyphenate:
        return w

    vowel = '[aeëèiïouy]'
    consonant = '[bcčdfghjklłmnprsštvzž]'
    # Splice the two bracket expressions into one class matching any of the
    # letters above (lower-case only).
    letter = vowel[:-1] + consonant[1:]

    rules = [
        # Tag the first and last letter of every word.
        (r'\b(' + letter + ')', r'<<\1'),
        ('(' + letter + r')\b', r'\1>>'),
        # Break between every pair of adjacent letters.  Matches cannot
        # overlap, so a second pass fills in the gaps left by the first.
        ('(' + letter + ')(' + letter + ')', r'\1_\2'),
        ('(' + letter + ')(' + letter + ')', r'\1_\2'),
        # Handling of literal hyphens next to a single letter.
        ('_(' + letter + ')-', r'\1-'),
        ('-(' + letter + ')_', r'\1'),
        ('-', r'_'),
        # Never break between a consonant and the vowel that follows it.
        ('(' + consonant + ')_(' + vowel + ')', r'\1\2'),
        # Do not split off a single letter at either end of a word.
        ('(<<' + letter + ')_', r'\1'),
        ('_(' + letter + '>>)', r'\1'),
        # Keep leading / trailing consonant runs together (applied twice).
        ('(<<' + consonant + '+)_', r'\1'),
        ('(<<' + consonant + '+)_', r'\1'),
        ('_(' + consonant + '+>>)', r'\1'),
        ('_(' + consonant + '+>>)', r'\1'),
        # Letter groups that stay together.
        ('i_([au])', r'i\1'),
        ('s_č', r'sč'),
        ('<<ra_sč', r'<<ras_č'),
        ('s_t', r'st'),
        ('(' + vowel + ')_j', r'\1j'),
        ('(' + consonant + ')_(j_' + consonant + ')', r'\1\2'),
        # Around a doubled consonant keep only the break between the pair.
        ('_(' + consonant + r')(_\1)', r'\1\2'),
        ('(' + consonant + r')(_\1)_', r'\1\2'),
        ('_(l_[cnš]|ł_[čkž]|m_[čp]|n_t_r|r_[mnt]|s_t_s|t_s)', r'\1'),
        ('_(' + consonant + 'j)', r'\1'),
    ]

    # Three-letter prefixes: remove the break in front of their last letter.
    for prefix in ('nad', 'nis', 'niz', 'pod', 'ras', 'raz', 'ros', 'roz', 'vos', 'voz'):
        first, second, third = prefix
        rules.append((
            '<<([' + first + first.upper() + '][' + second + second.upper() + '])_([' + third + third.upper() + '])',
            r'\1\2',
        ))

    # Finally drop the word tags and all break marks.
    rules.append((r'(<<|>>)', r''))
    rules.append((r'_', r''))

    for pattern, replacement in rules:
        w = re.sub(pattern, replacement, w)
    return w
def transword(w):
    """Return the romanized form of one Cyrillic word, restoring its casing.

    The lower-cased word is looked up in the module-level ``maindict``
    (KeyError if it is missing).  Casing is judged from the original word:
    an upper-case second letter means "all capitals"; otherwise an upper-case
    first letter means "capitalized"; otherwise the result stays as stored.
    """
    shouted = len(w) > 1 and w[1].isupper()
    titled = not shouted and w[0].isupper()

    latin = maindict[w.lower()]
    if shouted:
        return latin.upper()
    if titled:
        return latin.capitalize()
    return latin
def transtext(t):
    """Romanize every run of Cyrillic letters in *t*, leaving the rest as is."""
    if t == '':
        return ''
    # Splitting on a capturing group keeps the separators: even indices hold
    # the non-Cyrillic text, odd indices hold the Cyrillic words.
    pieces = re.split(r"([а-яёА-ЯЁ]+)", t)
    pieces[1::2] = [transword(word) for word in pieces[1::2]]
    return ''.join(pieces)
# Temporary mark placed between doubled consonants that must survive the
# de-duplication step.  It is a word character (so the \b anchors used below
# are unaffected) and is removed again before transliteration.
_KEEP_MARK = '\u02bc'


def _wildcard_rule(key, core):
    """Build the (pattern, replacement) pair for one dictionary entry.

    ``*`` in *key* becomes a ``(\\S*)`` group and the whole key is anchored
    with ``\\b`` on both sides.  A leading and/or trailing ``*`` is carried
    over into the replacement as a back-reference around *core*.
    """
    # str.replace is used on purpose: r'(\S*)' is not a valid re.sub
    # replacement template ("bad escape \S" since Python 3.7).
    pattern = r'\b' + key.replace('*', r'(\S*)') + r'\b'
    leading, trailing = key[0] == '*', key[-1] == '*'
    if leading and trailing:
        core = r'\1' + core + r'\2'
    elif leading:
        core = r'\1' + core
    elif trailing:
        core = core + r'\1'
    return pattern, core


def makemaindict(c):
    """Rebuild the global word dictionary for the text *c*.

    Collects the distinct lower-case Cyrillic words of *c*, romanizes them all
    in one space-joined string and stores the result in three module globals:
    ``cyrlist`` (sorted Cyrillic words), ``latlist`` (their romanized forms)
    and ``maindict`` (``cyrlist`` zipped with ``latlist``).

    With ``simplify`` enabled the words are first rewritten with ``specdict``,
    then doubled consonants are collapsed to single ones, except inside words
    matched by ``keepdupldict``.

    The pairing relies on the romanized string splitting into exactly as many
    words as ``cyrlist`` holds; ``zip`` stops at the shorter list, so a
    dictionary value that contains whitespace or is empty would shift it.
    """
    global maindict, cyrlist, latlist
    cyrlist = sorted(set(re.sub(r'[^а-яё]+', r' ', c.lower()).split()))
    t = ' '.join(cyrlist)

    if simplify:
        for key, spelling in specdict.items():
            if not key:
                # An empty key has no first/last character to inspect.
                continue
            pattern, replacement = _wildcard_rule(key, spelling)
            t = re.sub(pattern, replacement, t)

        for key in keepdupldict:
            if not key:
                continue
            # Separate each protected pair with the mark so the collapse
            # below cannot see it as a double.  (Replacing a pair with the
            # bare pair, as before, left nothing protected.)
            protected = re.sub(
                r'([бвгджзклмнпрстфхцчшщ])\1',
                r'\g<1>' + _KEEP_MARK + r'\g<1>',
                key.replace('*', ''),
            )
            pattern, replacement = _wildcard_rule(key, protected)
            t = re.sub(pattern, replacement, t)

        # Collapse every remaining doubled consonant, then drop the marks so
        # the protected doubles are adjacent again.
        t = re.sub(r'([бвгдзйклмнпрстфхцчшщ])\1', r'\1', t)
        t = t.replace(_KEEP_MARK, '')

    t = translit(t)
    if hyphenate:
        t = hyphen(t)
    latlist = t.split()
    maindict = dict(zip(cyrlist, latlist))
def transcode(c):
    """Romanize the Cyrillic text of the markup string *c*, skipping tags.

    The word dictionary is rebuilt from the whole of *c* first (also when *c*
    is empty).  The string is then split on ``<...>`` tags; the tags are kept
    verbatim and only the text between them goes through transtext().
    """
    makemaindict(c)
    if c == '':
        return ''
    # The capturing group keeps the tags in the list at the odd indices;
    # even indices are the text between tags.
    chunks = re.split(r"(<[^>]+>)", c)
    chunks[0::2] = [transtext(chunk) for chunk in chunks[0::2]]
    return ''.join(chunks)
def convertfile():
    """Read ``inputfilename``, romanize it and write it to ``outputfilename``.

    Both files are handled as UTF-8 text.  The character count of the input
    and of the converted text is printed along the way.
    """
    with open(inputfilename, mode='rt', encoding='utf-8') as source:
        original = source.read()
    print ('Input text length', len(original), 'characters.')

    converted = transcode(original)

    with open(outputfilename, mode='wt', encoding='utf-8') as target:
        target.write(converted)
    print ('Output text length', len(converted), 'characters.')
def p(t):
    """Print the romanized form of *t* (shortcut around transcode())."""
    romanized = transcode(t)
    print (romanized)
- convertfile()
# (Pastebin page footer captured together with the script: "Advertisement",
# "Add Comment", "Please, Sign In to add comment"; not part of the program.)