Guest User

Untitled

a guest
Jan 16th, 2018
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.66 KB | None | 0 0
# KrutidevToUnicode
# coding=utf-8
  3.  
  4. class KrutidevToUnicode:
  5.  
  6. CHARS_KD = [
  7. "ñ", "Q+Z", "sas", "aa", ")Z", "ZZ", "‘", "’", "“", "”",
  8.  
  9. "å", "ƒ", "„", "…", "†", "‡", "ˆ", "‰", "Š", "‹",
  10.  
  11. "¶+", "d+", "[+k", "[+", "x+", "T+", "t+", "M+", "<+", "Q+", ";+", "j+", "u+",
  12. "Ùk", "Ù", "ä", "–", "—", "é", "™", "=kk", "f=k",
  13.  
  14. "à", "á", "â", "ã", "ºz", "º", "í", "{k", "{", "=", "«",
  15. "Nî", "Vî", "Bî", "Mî", "<î", "|", "K", "}",
  16. "J", "Vª", "Mª", "<ªª", "Nª", "Ø", "Ý", "nzZ", "æ", "ç", "Á", "xz", "#", ":",
  17.  
  18. "v‚", "vks", "vkS", "vk", "v", "b±", "Ã", "bZ", "b", "m", "Å", ",s", ",", "_",
  19.  
  20. "ô", "d", "Dk", "D", "[k", "[", "x", "Xk", "X", "Ä", "?k", "?", "³",
  21. "pkS", "p", "Pk", "P", "N", "t", "Tk", "T", ">", "÷", "¥",
  22.  
  23. "ê", "ë", "V", "B", "ì", "ï", "M+", "<+", "M", "<", ".k", ".",
  24. "r", "Rk", "R", "Fk", "F", ")", "n", "/k", "èk", "/", "Ë", "è", "u", "Uk", "U",
  25.  
  26. "i", "Ik", "I", "Q", "¶", "c", "Ck", "C", "Hk", "H", "e", "Ek", "E",
  27. ";", "¸", "j", "y", "Yk", "Y", "G", "o", "Ok", "O",
  28. "'k", "'", "\"k", "\"", "l", "Lk", "L", "g",
  29.  
  30. "È", "z",
  31. "Ì", "Í", "Î", "Ï", "Ñ", "Ò", "Ó", "Ô", "Ö", "Ø", "Ù", "Ük", "Ü",
  32.  
  33. "‚", "ks", "kS", "k", "h", "q", "w", "`", "s", "S",
  34. "a", "¡", "%", "W", "•", "·", "∙", "·", "~j", "~", "\\", "+", " ः",
  35. "^", "*", "Þ", "ß", "(", "¼", "½", "¿", "À", "¾", "A", "-", "&", "&", "Œ", "]", "~ ", "@"
  36. ]
  37.  
  38. CHARS_UNICODE = [
  39. "॰", "QZ+", "sa", "a", "र्द्ध", "Z", "\"", "\"", "'", "'",
  40.  
  41. "०", "१", "२", "३", "४", "५", "६", "७", "८", "९",
  42.  
  43. "फ़्", "क़", "ख़", "ख़्", "ग़", "ज़्", "ज़", "ड़", "ढ़", "फ़", "य़", "ऱ", "ऩ",
  44. "त्त", "त्त्", "क्त", "दृ", "कृ", "न्न", "न्न्", "=k", "f=",
  45.  
  46. "ह्न", "ह्य", "हृ", "ह्म", "ह्र", "ह्", "द्द", "क्ष", "क्ष्", "त्र", "त्र्",
  47. "छ्य", "ट्य", "ठ्य", "ड्य", "ढ्य", "द्य", "ज्ञ", "द्व",
  48. "श्र", "ट्र", "ड्र", "ढ्र", "छ्र", "क्र", "फ्र", "र्द्र", "द्र", "प्र", "प्र", "ग्र", "रु", "रू",
  49.  
  50. "ऑ", "ओ", "औ", "आ", "अ", "ईं", "ई", "ई", "इ", "उ", "ऊ", "ऐ", "ए", "ऋ",
  51.  
  52. "क्क", "क", "क", "क्", "ख", "ख्", "ग", "ग", "ग्", "घ", "घ", "घ्", "ङ",
  53. "चै", "च", "च", "च्", "छ", "ज", "ज", "ज्", "झ", "झ्", "ञ",
  54.  
  55. "ट्ट", "ट्ठ", "ट", "ठ", "ड्ड", "ड्ढ", "ड़", "ढ़", "ड", "ढ", "ण", "ण्",
  56. "त", "त", "त्", "थ", "थ्", "द्ध", "द", "ध", "ध", "ध्", "ध्", "ध्", "न", "न", "न्",
  57.  
  58. "प", "प", "प्", "फ", "फ्", "ब", "ब", "ब्", "भ", "भ्", "म", "म", "म्",
  59. "य", "य्", "र", "ल", "ल", "ल्", "ळ", "व", "व", "व्",
  60. "श", "श्", "ष", "ष्", "स", "स", "स्", "ह",
  61.  
  62. "ीं", "्र",
  63. "द्द", "ट्ट", "ट्ठ", "ड्ड", "कृ", "भ", "्य", "ड्ढ", "झ्", "क्र", "त्त्", "श", "श्",
  64.  
  65. "ॉ", "ो", "ौ", "ा", "ी", "ु", "ू", "ृ", "े", "ै",
  66. "ं", "ँ", "ः", "ॅ", "ऽ", "ऽ", "ऽ", "ऽ", "्र", "्", "?", "़", ":",
  67. "‘", "’", "“", "”", ";", "(", ")", "{", "}", "=", "।", ".", "-", "µ", "॰", ",", "् ", "/"
  68. ]
  69.  
  70. @staticmethod
  71. def do_convert(krutidevPart):
  72. processPart = unicode(krutidevPart, 'utf-8')
  73. if processPart != "":
  74. for input_symbol_idx in range(0, len(KrutidevToUnicode.CHARS_KD)):
  75. idx = 0
  76. while idx > -1:
  77. processPart = processPart.replace(unicode(KrutidevToUnicode.CHARS_KD[input_symbol_idx], 'utf-8'), unicode(KrutidevToUnicode.CHARS_UNICODE[input_symbol_idx], 'utf-8'))
  78. idx = processPart.find(unicode(KrutidevToUnicode.CHARS_KD[input_symbol_idx], 'utf-8'))
  79.  
  80. # Code for Replacing five Special glyphs
  81.  
  82. # Code for Glyph1 : ± (reph+anusvAr)
  83.  
  84. processPart = processPart.replace(u'±', u"Zं")
  85.  
  86. # Glyp2: Æ
  87. # code for replacing "f" with "ि" and correcting its position too. (moving it one position forward)
  88.  
  89. processPart = processPart.replace(u'Æ', u"र्f")
  90.  
  91. position_of_i = processPart.find(u'f')
  92. while position_of_i > -1:
  93. charecter_next_to_i = processPart[position_of_i + 1]
  94. charecter_to_be_replaced = u"f" + charecter_next_to_i
  95. processPart = processPart.replace(charecter_to_be_replaced, charecter_next_to_i + u"ि")
  96. position_of_i = processPart.find(u'f', position_of_i + 1)
  97.  
  98. # Glyph3 & Glyph4: Ç É
  99. # code for replacing "fa" with "िं" and correcting its position too.(moving it two positions forward)
  100.  
  101. processPart = processPart.replace(u'Ç', u"fa")
  102. processPart = processPart.replace(u'É', u"र्fa")
  103.  
  104. position_of_i = processPart.find(u'fa')
  105. while position_of_i > -1:
  106. charecter_next_to_ip2 = processPart[position_of_i + 2]
  107. charecter_to_be_replaced = u"fa" + charecter_next_to_ip2
  108. processPart = processPart.replace(charecter_to_be_replaced, charecter_next_to_ip2 + u"िं")
  109. position_of_i = processPart.find(u'fa', position_of_i + 1)
  110.  
  111. # Glyph5: Ê
  112. # code for replacing "h" with "ी" and correcting its position too.(moving it one positions forward)
  113.  
  114. processPart = processPart.replace(u'Ê', u"ीZ")
  115.  
  116. # End of Code for Replacing four Special glyphs
  117.  
  118. # following loop to eliminate 'chhotee ee kee maatraa' on half-letters as a result of above transformation.
  119. position_of_wrong_ee = processPart.find(u"ि्")
  120. while position_of_wrong_ee > -1:
  121. consonent_next_to_wrong_ee = processPart[position_of_wrong_ee + 2]
  122. charecter_to_be_replaced = u"ि्" + consonent_next_to_wrong_ee
  123. processPart = processPart.replace(charecter_to_be_replaced, u"्" + consonent_next_to_wrong_ee + u"ि")
  124. position_of_wrong_ee = processPart.find(u"ि्", position_of_wrong_ee + 2)
  125.  
  126. # Eliminating reph "Z" and putting 'half - r' at proper position for this.
  127. set_of_matras = u"अ आ इ ई उ ऊ ए ऐ ओ औ ा ि ी ु ू ृ े ै ो ौ ं : ँ ॅ"
  128. position_of_R = processPart.find(u"Z")
  129. while position_of_R > -1:
  130. probable_position_of_half_r = position_of_R - 1
  131. charecter_at_probable_position_of_half_r = processPart[probable_position_of_half_r]
  132.  
  133. # trying to find non-maatra position left to current O (ie, half -r).
  134. while set_of_matras.find(charecter_at_probable_position_of_half_r) > -1:
  135. probable_position_of_half_r = probable_position_of_half_r - 1
  136. charecter_at_probable_position_of_half_r = processPart[probable_position_of_half_r]
  137.  
  138. charecter_to_be_replaced = processPart[probable_position_of_half_r, (position_of_R - probable_position_of_half_r)]
  139. new_replacement_string = u"र्" + charecter_to_be_replaced
  140. charecter_to_be_replaced = charecter_to_be_replaced + u"Z"
  141. processPart = processPart.replace(charecter_to_be_replaced, new_replacement_string)
  142. position_of_R = processPart.find(u"Z")
  143.  
  144. return processPart.encode('utf-8')
  145.  
  146. @staticmethod
  147. def convert_to_unicode(krutidevString):
  148. unicodeString = ''
  149.  
  150. text_size = len(krutidevString)
  151. sthiti1 = 0
  152. sthiti2 = 0
  153. chale_chalo = 1
  154. max_text_size = 6000
  155.  
  156. while chale_chalo == 1:
  157. sthiti1 = sthiti2
  158.  
  159. if sthiti2 < (text_size - max_text_size):
  160. sthiti2 += max_text_size
  161. while krutidevString[sthiti2] != ' ':
  162. sthiti2 -= 1
  163. else:
  164. sthiti2 = text_size
  165. chale_chalo = 0
  166.  
  167. modifiedSubstring = krutidevString[sthiti1:sthiti2]
  168. unicodeString += KrutidevToUnicode.do_convert(modifiedSubstring)
  169.  
  170. return unicodeString
Add Comment
Please, Sign In to add comment