Hellerick_Ferlibay

Dinjdinj transliterator

Jan 14th, 2016
235
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.69 KB | None | 0 0
  1. import re, datetime, os
  2. from datetime import datetime, timedelta
  3.  
  4. date = datetime.strftime(datetime.now()+timedelta(hours=-4), "%Y-%m-%d")
  5.  
  6. dropboxfolder = {
  7.     r'/home/hellerick':r'/home/hellerick/Dropbox',
  8.     r'C:\Users\OTK':r'D:\Dropbox',
  9.     }[os.environ['HOME']]
  10.  
  11. pyscriptfolder = os.path.join(dropboxfolder, 'Programming', 'Python')
  12.  
  13. inputfilename = os.path.join(dropboxfolder, 'KSV-shared', 'Lib', 'Jefremov, Ivan Anatoljevich', 'Tumannostj Andromedy', 'EFRITUMA.htm')
  14.  
  15. outputfilename = re.sub(r'(\.[^.]+\Z)', r'.lat.'+date+r'\1', inputfilename)
  16.  
  17. hyphenate = True
  18. simplify = True
  19. maindict = dict()
  20.  
  21. if simplify:
  22.     with open(os.path.join(pyscriptfolder, 'RussianRomanization', 'KeepDuplicationDictionary.txt'), mode='rt', encoding='utf-8') as f:
  23.         keepdupldict = f.read()
  24.     keepdupldict = keepdupldict.split('\n')
  25.     keepdupldict = [i for i in keepdupldict if i!='']
  26.     with open(os.path.join(pyscriptfolder, 'RussianRomanization', 'SpecialCasesDictionary.txt'), mode='rt', encoding='utf-8') as f:
  27.         specdict = f.read()
  28.     specdict = specdict.split('\n')
  29.     specdict = {j.split('\t')[0]:j.split('\t')[1]
  30.         for j in specdict if len(j.split('\t'))>1
  31.         }#  if len(j)>=2
  32.  
  33. def checkspecialspelling(w):
  34.     for i in specdict:
  35.         if re.match(re.sub(r'\*','.*',i), w):
  36.             i1 = re.sub('\*','',i)
  37.             i2 = specdict[i]
  38.             w = re.sub(i1, i2, w)
  39.     return w
  40.  
  41. def keepduplication(w):
  42.     for i in keepdupldict:
  43.         i = re.sub(r'\*','.*',i)
  44.         if re.match(i, w):
  45.             i1 = re.sub('\.\*','',i)
  46.             i2 = re.sub(r'([бвгджзклмнпрстфхцчшщ])\1',r'\1­\1', i1)
  47.             w = re.sub(i1, i2, w)
  48.     return w
  49.  
  50. def translit(w):
  51.     w = re.sub(r'и([ау])',r\1', w)
  52.     w = re.sub(r'цï([ау])',r'цi\1', w)
  53.     w = re.sub(r'ля',r'lа', w)
  54.     w = re.sub(r'лю',r'lу', w)
  55.     w = re.sub(r'л([еиï])',r'l\1', w)
  56.     w = re.sub(r'ль([еёио])',r'lй\1', w)
  57.     w = re.sub(r'ль',r'l', w)
  58.     w = re.sub(r'щ',r'сч', w)
  59.     w = re.sub(r'([б-джзкмнп-тф-ш])ю',r'\1иу', w)
  60.     w = re.sub(r'([б-джзкмнп-тф-ш])я',r'\1иа', w)
  61.     w = re.sub(r'ю',r'йу', w)
  62.     w = re.sub(r'я',r'йа', w)
  63.     w = re.sub(r'ъ',r'й', w)
  64.     w = re.sub(r'[йь]й',r'й', w)
  65.     w = re.sub(r'ЛL',r'LL', w)
  66.     w = re.sub(r'Лl',r'Ll', w)
  67.     w = re.sub(r'лl',r'll', w)
  68.     w = re.sub(r'([жчш])ь\b',r'\1', w)
  69.     for c in enumerate(  'абцчдеёэфгхийьклмнопрсштувызж'):
  70.         w = re.sub(c[1], 'abcčdeëèfghijjkłmnoprsštuvyzž'[c[0]], w)
  71.     return w
  72.  
  73. def hyphen(w):
  74.     if hyphenate:
  75.        
  76.         vow='[aeëèiïouy]'
  77.         con='[bcčdfghjklłmnprsštvzž]'
  78.         let=vow[:-1]+con[1:] # 'ABCČDEËÈFGHIJKLŁMNOPRSŠTUVYZŽabcčdeëèfghijklłmnoprsštuvyzž'
  79.         w = re.sub(r'\b('+let+')', r'<<\1', w)
  80.         w = re.sub(r'('+let+r')\b', r'\1>>', w)
  81.         w = re.sub('('+let+')('+let+')', r'\1_\2', w)
  82.         w = re.sub('('+let+')('+let+')', r'\1_\2', w)
  83.         w = re.sub('_('+let+')-', r'\1-', w)
  84.         w = re.sub('-('+let+')_', r'\1', w)
  85.         w = re.sub('-', r'_', w)
  86.         w = re.sub('('+con+')_('+vow+')', r'\1\2', w)
  87.         w = re.sub('(<<'+let+')_', r'\1', w)
  88.         w = re.sub('_('+let+'>>)', r'\1', w)
  89.         w = re.sub('(<<'+con+'+)_', r'\1', w)
  90.         w = re.sub('(<<'+con+'+)_', r'\1', w)
  91.         w = re.sub('_('+con+'+>>)', r'\1', w)
  92.         w = re.sub('_('+con+'+>>)', r'\1', w)
  93.         w = re.sub('i_([au])', r'i\1', w)
  94.         w = re.sub('s_č', r'sč', w)
  95.         w = re.sub('<<ra_sč', r'<<ras_č', w)
  96.         w = re.sub('s_t', r'st', w)
  97.         w = re.sub('('+vow+')_j', r'\1j', w)
  98.         w = re.sub('('+con+')_(j_'+con+')', r'\1\2', w)
  99.         w = re.sub('_('+con+r')(_\1)', r'\1\2', w)
  100.         w = re.sub('('+con+r')(_\1)_', r'\1\2', w)
  101.         w = re.sub('_(l_[cnš]|ł_[čkž]|m_[čp]|n_t_r|r_[mnt]|s_t_s|t_s)', r'\1', w)
  102.         w = re.sub('_('+con+'j)', r'\1', w)
  103.         for p in ('nad', 'nis', 'niz', 'pod', 'ras', 'raz', 'ros', 'roz', 'vos', 'voz'):
  104.             w = re.sub('<<(['+p[0]+p[0].upper()+']['+p[1]+p[1].upper()+'])_(['+p[2]+p[2].upper()+'])', r'\1\2', w)
  105.         w = re.sub(r'(<<|>>)',r'', w)
  106.         w = re.sub(r'_',r'­', w)
  107.     else:
  108.         w = re.sub('­', '', w)
  109.     return w
  110.    
  111. def transword(w):
  112.     firstcap = allcaps = False
  113.     if len(w)>1 and w[1].isupper():
  114.         allcaps = True
  115.     elif w[0].isupper():
  116.         firstcap = True
  117.        
  118.     w = w.lower()
  119.  
  120.     out = maindict[w]
  121.        
  122.     if allcaps:
  123.         out = out.upper()
  124.     elif firstcap:
  125.         out = out.capitalize()
  126.  
  127.     return out
  128.        
  129.  
  130. def transtext(t):
  131.     if t=='': return ''
  132.     t = re.split(r"([а-яёА-ЯЁ]+)", t)
  133.     for i,s in enumerate(t):
  134.         if i%2!=0:
  135.             t[i] = transword(t[i])
  136.     t = ''.join(t)
  137.     return t
  138.  
  139. def makemaindict(c):
  140.     global maindict, cyrlist, latlist, watch
  141.     cyrlist=sorted(list(set(re.sub(r'[^а-яё]+', r' ', c.lower()).split())))
  142.     t = ' '.join(cyrlist)
  143.     if simplify:
  144.     #
  145.         for i in specdict:
  146.             affixes = [i[0]=='*', i[-1]=='*']
  147.             i1 = re.sub(r'\*',r'(\S*)',i)
  148.             i2 = specdict[i]
  149.             if affixes == [True, True]: i2 = r'\1'+i2+r'\2'
  150.             elif affixes == [True, False]: i2 = r'\1'+i2
  151.             elif affixes == [False, True]: i2 = i2+r'\1'
  152.             t = re.sub(r'\b'+i1+r'\b', i2, t)
  153.         for i in keepdupldict:
  154.             affixes = [i[0]=='*', i[-1]=='*']
  155.             i1 = re.sub(r'\*',r'(\S*)',i)
  156.             i2 = re.sub(r'([бвгджзклмнпрстфхцчшщ])\1', r'\1­\1', re.sub(r'\*','',i))
  157.             if affixes == [True, True]: i2 = r'\1'+i2+r'\2'
  158.             elif affixes == [True, False]: i2 = r'\1'+i2
  159.             elif affixes == [False, True]: i2 = i2+r'\1'
  160.             t = re.sub(r'\b'+i1+r'\b', i2, t)
  161.         t = re.sub(r'([бвгдзйклмнпрстфхцчшщ])\1',r'\1', t)
  162.     #
  163.     t = translit(t)
  164.     if hyphenate: t = hyphen(t)
  165.     latlist = t.split()
  166.     maindict=dict(zip(cyrlist,latlist))
  167.  
  168. def transcode(c):
  169.     makemaindict(c)
  170.     if c=='': return ''
  171.     c = re.split(r"(<[^>]+>)", c)
  172.     for i,s in enumerate(c):
  173.         if i%2!=1:
  174.             c[i] = transtext(c[i])
  175.     c = ''.join(c)
  176.     return c
  177.  
  178. def convertfile():
  179.     with open(inputfilename, mode='rt', encoding='utf-8') as f:
  180.         text = f.read()
  181.         print ('Input text length', len(text), 'characters.')
  182.     text = transcode(text)
  183.     with open(outputfilename, mode='wt', encoding='utf-8') as f:
  184.         f.write(text)
  185.         print ('Output text length', len(text), 'characters.')
  186.  
  187. def p(t):
  188.     print (transcode(t))
  189.  
# Top-level call: the conversion runs whenever this file is executed or
# imported.
convertfile()
Advertisement
Add Comment
Please, Sign In to add comment