Advertisement
beng

Myanmar / Burmese Unicode to Zawgyi ICU Transliterator

Nov 30th, 2016
165
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.93 KB | None | 0 0
  1. $anusvara = [\u1032\u1036];  # Variable definitions for the Unicode (input) side start here; Zawgyi (output) code points are defined further down
  2. $asat = \u103a;
  3. $consonant = [\u1000-\u1020\u103f\u104e];
  4. $consonant_na = \u1014;  # falls inside the $consonant range
  5. $consonant_nya = \u1009;  # falls inside the $consonant range
  6. $consonant_nnya = \u100a;  # falls inside the $consonant range
  7. $digit_except_zero = [\u1041-\u1049];
  8. $digit_zero = \u1040;  # not referenced by the rules in this listing
  9. $dot_below = \u1037;
  10. $dot_below_cluster = $dot_below $asat?;  # dot below with an optional trailing asat
  11. $generic_base = [\u00A0\u00D7\u2012-\u2015\u2022\u25CC\u25FB-\u25FE];
  12. $halant = \u1039;
  13. $independent_vowel = [\u1021-\u102A];
  14. $joiner = [\u200C\u200D];
  15. $kinzi_start = \u1004;
  16. $kinzi = $kinzi_start \u103A \u1039;  # $kinzi_start followed by the $asat and $halant code points
  17. $medial_consonant_h = \u103E;
  18. $medial_consonant_r = \u103C;
  19. $medial_consonant_w = \u103D;
  20. $medial_consonant_y = \u103B;
  21. $medial_consonant_w_cluster = $medial_consonant_w $asat?;  # medial w with an optional trailing asat
  22. $medial_consonant_h_cluster = $medial_consonant_w? $medial_consonant_h $asat?;  # optional medial w, medial h, optional asat
  23. $common = [\p{Common}];  # not referenced by the rules in this listing
  24. $punctuation = [\u104A\u104B];  # not referenced by the rules in this listing
  25. $reserved = [\uAA7C-\uAA7F];  # not referenced by the rules in this listing
  26. $symbol = [\u104C\u104D\u104F];  # not referenced by the rules in this listing
  27. $visarga = \u1038;
  28. $vowel_above = [\u102D\u102E];
  29. $vowel_below = [\u102F\u1030];  # the same two code points as $vowel_u and $vowel_uu
  30. $vowel_pre = \u1031;
  31. $vowel_post = [\u102B\u102C];
  32. $vowel_post_prefix = $vowel_post $medial_consonant_h? $asat*
  33. $vowel_above* $anusvara*;  # post vowel, then optional medial h, asats, above vowels and anusvaras
  34. $vowel_post_cluster = $vowel_post_prefix $dot_below? $asat?;  # $vowel_post_prefix plus an optional dot below and asat
  35. $vowel_u = \u102F;
  36. $vowel_uu = \u1030;
  37. $variation_selector = [\uFE00-\uFE0F];
  38. $word_joiner = \u2060;  # not referenced by the rules in this listing
  39. $ws = [\p{Whitespace}];  # not referenced by the rules in this listing
  40. $cluster_base = [$consonant $independent_vowel $digit_except_zero
  41. $generic_base];  # union of consonants, independent vowels, non-zero digits and generic bases
  42. $cluster_base_vs = $cluster_base $variation_selector?;  # cluster base with an optional variation selector
  43. $halant_cluster = $halant [$consonant $independent_vowel]
  44. $variation_selector?;  # halant + consonant or independent vowel, with an optional variation selector
  45. $cluster_base_short_narrow = [\u1001\u1002\u1004\u1005\u1007\u100e
  46. \u1012\u1013\u1015\u1016\u1017\u1019\u101d];
  47. $cluster_base_short_wide = [\u1000\u1003\u1006\u100f\u1010\u1011
  48. \u1018\u101a\u101c\u101e\u101f\u1021];
  49. $cluster_base_tall_narrow = [\u100b\u100c\u100d\u1014\u101b\u1020
  50. \u1025\u1026\u1028];
  51. $cluster_base_tall_wide = [\u1008\u1009\u100a\u1023\u1024\u1029\u102a];
  52. $cluster_base_tall = [$cluster_base_tall_narrow $cluster_base_tall_wide];  # every tall base, narrow or wide
  53. $cluster_base_wide = [$cluster_base_short_wide $cluster_base_tall_wide];  # every wide base, short or tall
  54. $cluster_base_narrow = [$cluster_base_short_narrow $cluster_base_tall_narrow];  # every narrow base, short or tall
  55.  
  56. # PUA placeholders for the medial consonants, whose Unicode code points are re-used with different meanings in Zawgyi (e.g. U+103B, U+103C and U+103D are all Zawgyi output code points below)
  57. $placeholder_medial_consonant_r = \uE100;
  58. $placeholder_medial_consonant_y = \uE101;
  59. $placeholder_medial_consonant_w = \uE102;
  60. $placeholder_medial_consonant_h = \uE103;
  61.  
  62. $placeholder_medial_consonant_w_cluster = $placeholder_medial_consonant_w $asat?;  # placeholder counterpart of $medial_consonant_w_cluster
  63. $placeholder_medial_consonant_h_cluster = $placeholder_medial_consonant_w? $placeholder_medial_consonant_h $asat?;  # placeholder counterpart of $medial_consonant_h_cluster
  64.  
  65. # Zawgyi versions of Unicode code points (output side); several reuse code points that mean something else on the Unicode side
  66. $zawgyi_asat = \u1039;  # same code point as Unicode $halant
  67.  
  68. $zawgyi_consonant_na_without_tail = \u108f;
  69. $zawgyi_consonant_nya_without_tail = \u106a;
  70. $zawgyi_consonant_nnya_without_tail = \u106b;
  71.  
  72. $zawgyi_medial_r_wide = \u107e;
  73. $zawgyi_medial_r_narrow = \u103b;  # same code point as Unicode $medial_consonant_y
  74. $zawgyi_medial_r_wide_short_top = \u1080;
  75. $zawgyi_medial_r_narrow_short_top = \u107f;
  76. $zawgyi_medial_r_wide_short_bottom = \u1082;
  77. $zawgyi_medial_r_narrow_short_bottom = \u1081;
  78.  
  79. # TODO: Zawgyi has two versions of medial Y (U+107D, used here, and U+103A); figure out which ya pin to use
  80. $zawgyi_medial_consonant_y = \u107d;
  81. $zawgyi_medial_consonant_w = \u103c;  # same code point as Unicode $medial_consonant_r
  82. $zawgyi_medial_consonant_h = \u103d;  # same code point as Unicode $medial_consonant_w
  83.  
  84. $zawgyi_vowel_u_post = \u1033;
  85. $zawgyi_vowel_uu_post = \u1034;
  86.  
  87. $zawgyi_dot_below_right = \u1094;
  88.  
  89. # Logical to visual order: Cluster terminating in halant (kinzi, if present, moves from before the base to after it)
  90. ($kinzi?) ($cluster_base_vs) ($halant_cluster*) ($halant) > $2 $1 $3 $4;
  91.  
  92. # Logical to visual order: Complex cluster (the rule below captures 16 segments and re-emits them in the visual order listed here)
  93.  
  94. # VISUAL ORDER:
  95. #
  96. # vowel_pre medial_consonant_r cluster_base kinzi? halant_cluster*
  97. # asat medial_consonant_y medial_consonant_w_cluster medial_consonant_h_cluster
  98. # vowel_above anusvara vowel_below dot_below_cluster vowel_post_cluster visarga
  99. # joiner
  100.  
  101. ($kinzi?) ($cluster_base_vs) ($halant_cluster*)
  102. ($asat?) ($medial_consonant_y?) ($medial_consonant_r?)
  103. ($medial_consonant_w_cluster)? ($medial_consonant_h_cluster)?
  104. ($vowel_pre*) ($vowel_above*) ($vowel_below*) ($anusvara*) ($dot_below_cluster)?
  105. ($vowel_post_cluster)* ($visarga*) ($joiner?) > $9 $6 $2 $1 $3 $4 $5 $7 $8 $10 $12 $11 $13 $14 $15 $16;  # $9 (pre vowel) and $6 (medial r) move in front of the base $2; $12 (anusvara) is emitted before $11 (vowel below)
  106.  
  107. ::Null;  # pass boundary: the rules below run over the complete output of the rules above
  108.  
  109. $medial_consonant_r > $placeholder_medial_consonant_r;  # U+103C -> U+E100
  110. $medial_consonant_y > $placeholder_medial_consonant_y;  # U+103B -> U+E101
  111. $medial_consonant_w > $placeholder_medial_consonant_w;  # U+103D -> U+E102
  112. $medial_consonant_h > $placeholder_medial_consonant_h;  # U+103E -> U+E103; from here until the final pass, medials appear only as placeholders
  113.  
  114. ::Null;
  115.  
  116. # Na loses its tail when a halant, medial consonant, or below-vowel follows (a kinzi and one above-vowel may come in between)
  117. $consonant_na } $kinzi? $vowel_above? [$halant $placeholder_medial_consonant_r $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_below] > $zawgyi_consonant_na_without_tail;
  118.  
  119. # Nya + lower diacritic loses right side of tail (same following context as the Na rule)
  120. $consonant_nya } $kinzi? $vowel_above? [$halant $placeholder_medial_consonant_r $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_below] > $zawgyi_consonant_nya_without_tail;
  121.  
  122. # Nnya + lower diacritic loses right side of tail (same following context as the Na rule)
  123. $consonant_nnya } $kinzi? $vowel_above? [$halant $placeholder_medial_consonant_r $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_below] > $zawgyi_consonant_nnya_without_tail;
  124.  
  125. ::Null;
  126.  
  127. # Cluster starts with medial r, which surrounds the base: replace vowel u / uu with the Zawgyi post-base forms (U+1033 / U+1034)
  128. $placeholder_medial_consonant_r $cluster_base_vs $kinzi? $halant_cluster* $asat? $placeholder_medial_consonant_y? $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above? $anusvara? ($dot_below_cluster)? { $vowel_u > $zawgyi_vowel_u_post;
  129. $placeholder_medial_consonant_r $cluster_base_vs $kinzi? $halant_cluster* $asat? $placeholder_medial_consonant_y? $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above? $anusvara? ($dot_below_cluster)? { $vowel_uu > $zawgyi_vowel_uu_post;
  130.  
  131. # Tall cluster base would collide with the vowel: replace vowel u / uu with the same post-base forms
  132. $cluster_base_tall $variation_selector? $kinzi? $halant_cluster* $asat? $placeholder_medial_consonant_y? $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above? $anusvara? ($dot_below_cluster)? { $vowel_u > $zawgyi_vowel_u_post;
  133. $cluster_base_tall $variation_selector? $kinzi? $halant_cluster* $asat? $placeholder_medial_consonant_y? $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above? $anusvara? ($dot_below_cluster)? { $vowel_uu > $zawgyi_vowel_uu_post;
  134.  
  135. ::Null;
  136.  
  137. # Wide ya yit: Wide base, and the next code point is not a kinzi start, halant, medial, above/below vowel, anusvara or dot below. NOTE(review): the negated set needs a following code point, so a cluster at the very end of input presumably falls through to the narrow default in the final pass - confirm
  138. $placeholder_medial_consonant_r } $cluster_base_wide [^ $kinzi_start $halant $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_above $vowel_below $anusvara $dot_below] > $zawgyi_medial_r_wide;
  139.  
  140. # Narrow ya yit: Narrow base, nothing above or below (same following-context test as the wide rule)
  141. $placeholder_medial_consonant_r } $cluster_base_narrow [^ $kinzi_start $halant $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_above $vowel_below $anusvara $dot_below] > $zawgyi_medial_r_narrow;
  142.  
  143. # Wide ya yit with short top: Wide base, kinzi above, nothing below (only $kinzi_start is tested, not the full $kinzi sequence)
  144. $placeholder_medial_consonant_r } $cluster_base_wide $kinzi_start [^ $halant $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_below $dot_below] > $zawgyi_medial_r_wide_short_top;
  145.  
  146. # Wide ya yit with short top: Wide base, one or more above vowels or anusvaras, nothing below
  147. $placeholder_medial_consonant_r } $cluster_base_wide [$vowel_above $anusvara]+ [^$vowel_below $anusvara $dot_below] > $zawgyi_medial_r_wide_short_top;
  148.  
  149. # Narrow ya yit with short top: Narrow base, kinzi above, nothing below (only $kinzi_start is tested, not the full $kinzi sequence)
  150. $placeholder_medial_consonant_r } $cluster_base_narrow $kinzi_start [^ $halant $placeholder_medial_consonant_y $placeholder_medial_consonant_w $placeholder_medial_consonant_h $vowel_below $dot_below] > $zawgyi_medial_r_narrow_short_top;
  151.  
  152. # Narrow ya yit with short top: Narrow base, one or more above vowels or anusvaras, nothing below
  153. $placeholder_medial_consonant_r } $cluster_base_narrow [$vowel_above $anusvara]+ [^$vowel_below $anusvara $dot_below] > $zawgyi_medial_r_narrow_short_top;
  154.  
  155. # Narrow ya yit with short bottom: Narrow base, nothing above, medial w directly after the base
  156. $placeholder_medial_consonant_r } $cluster_base_narrow $placeholder_medial_consonant_w > $zawgyi_medial_r_narrow_short_bottom;
  157.  
  158. # Wide ya yit with short bottom: Wide base, nothing above, medial w directly after the base
  159. $placeholder_medial_consonant_r } $cluster_base_wide $placeholder_medial_consonant_w > $zawgyi_medial_r_wide_short_bottom;
  160.  
  161. # Na with dot below: the dot below cluster is replaced by U+1094, emitted after any post vowel cluster captured as $1
  162. $consonant_na $kinzi? $asat? $placeholder_medial_consonant_y? $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above* $anusvara* $vowel_below? { $dot_below_cluster ($vowel_post_cluster?) > $1 $zawgyi_dot_below_right;
  163.  
  164. # Consonant cluster (one or more stacked consonants) with dot below: dot moves to right
  165. $halant_cluster+ $asat? $placeholder_medial_consonant_y? $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above* $anusvara* $vowel_below? { $dot_below_cluster ($vowel_post_cluster?) > $1 $zawgyi_dot_below_right;
  166.  
  167. # Anything else below (medial y, w or h, or a below vowel) with dot below: dot moves to right
  168. $placeholder_medial_consonant_y $placeholder_medial_consonant_w_cluster? $placeholder_medial_consonant_h_cluster? $vowel_above* $anusvara* $vowel_below? { $dot_below_cluster ($vowel_post_cluster?) > $1 $zawgyi_dot_below_right;
  169. $placeholder_medial_consonant_w_cluster $placeholder_medial_consonant_h_cluster? $vowel_above* $anusvara* $vowel_below? { $dot_below_cluster ($vowel_post_cluster?) > $1 $zawgyi_dot_below_right;
  170. $placeholder_medial_consonant_h_cluster $vowel_above* $anusvara* $vowel_below? { $dot_below_cluster ($vowel_post_cluster?) >
  171. $1 $zawgyi_dot_below_right;
  172.  
  173. $vowel_below { $dot_below_cluster ($vowel_post_cluster?) > $1 $zawgyi_dot_below_right;
  174.  
  175. # Post vowel with dot below: the dot below cluster after $vowel_post_prefix is replaced in place by U+1094
  176. $vowel_post_prefix { $dot_below_cluster > $zawgyi_dot_below_right;
  177.  
  178. ::Null;
  179.  
  180. # Pre-defined ligatures: fixed code point sequences that map to a single output code point
  181. \u103F > \u1086;
  182. \u104E\u1004\u103A\u1038 > \u104E;  # the four-code-point sequence collapses to U+104E alone
  183. \u100B\u1039\u100B > \u1097;
  184. \u100B\u1039\u100C > \u1092;
  185. \u100F\u1039\u100D > \u1091;
  186. \u100D\u1039\u100E > \u106F;
  187. \u100D\u1039\u100D > \u106E;
  188.  
  189. # Ha hto + u/uu ligatures (medial h is matched through its placeholder in this pass)
  190. $placeholder_medial_consonant_h $vowel_u > \u1088;
  191. # TODO bhamilton Is this really a thing??
  192. $placeholder_medial_consonant_h $vowel_uu > \u1089;
  193.  
  194. # Stacked Consonants: $halant followed by a consonant is replaced by a single stacked-consonant output code point
  195. $halant \u101C > \u1085;
  196. $halant \u1019 > \u107C;
  197. # XXX TODO bhamilton which one?? U+107B and U+1093 look the same in zawgyi
  198. $halant \u1018 > \u1093;
  199. #$halant \u1018 > \u107B;
  200. $halant \u1017 > \u107A;
  201. $halant \u1016 > \u1079;
  202. $halant \u1015 > \u1078;
  203. $halant \u1014 > \u1077;
  204. $halant \u1013 > \u1076;
  205. $halant \u1012 > \u1075;
  206. # XXX TODO bhamilton U+1073 and U+1074 look the same, which to use?
  207. $halant \u1011 > \u1074;
  208. #$halant \u1011 > \u1073;
  209. $halant \u1010 $placeholder_medial_consonant_w > \u1096;  # medial w was rewritten to its placeholder (U+E102) in an earlier pass, so a literal \u103D can never match here; must stay ahead of the plain \u1010 rule so the longer sequence wins
  210. # XXX TODO bhamilton U+1072 and U+1071 look the same
  211. $halant \u1010 > \u1072;
  212. #$halant \u1010 > \u1071;
  213. $halant \u100F > \u1070;
  214. $halant \u100C > \u106D;
  215. $halant \u100B > \u106C;
  216. $halant \u1008 > \u1069;
  217. $halant \u1007 > \u1068;
  218. # XXX TODO bhamilton U+1067 U+1066 look the same
  219. $halant \u1006 > \u1067;
  220. #$halant \u1006 > \u1066;
  221. $halant \u1005 > \u1065;
  222. $halant \u1003 > \u1063;
  223. $halant \u1002 > \u1062;
  224. $halant \u1001 > \u1061;
  225. $halant \u1000 > \u1060;
  226.  
  227. # Special cases for 1025 vs 1009: U+1009 + halant + one of these consonants is written as U+1025 plus the stacked form. NOTE(review): an earlier pass rewrites U+1009 followed by $halant to $zawgyi_consonant_nya_without_tail (U+106A), so these rules may never see U+1009 here - confirm
  228. \u1009 \u1039 \u1016 > \u1025 \u1079;
  229. \u1009 \u1039 \u1017 > \u1025 \u107A;
  230. \u1009 \u1039 \u1015 > \u1025 \u1078;
  231. \u1009 \u1039 \u1013 > \u1025 \u1076;
  232. \u1009 \u1039 \u1007 > \u1025 \u1068;
  233. \u1009 \u1039 \u1005 > \u1025 \u1065;
  234. \u1009 \u1039 \u1002 > \u1025 \u1062;
  235. \u1009 \u1039 \u1001 > \u1025 \u1061;
  236.  
  237. ::Null;
  238.  
  239. # Final pass: replace the remaining Unicode code points with their Zawgyi versions
  240.  
  241. $asat > $zawgyi_asat;  # U+103A -> U+1039
  242.  
  243. # Restore placeholders: any placeholder not consumed by an earlier rule becomes its default Zawgyi code point
  244.  
  245. $placeholder_medial_consonant_r > $zawgyi_medial_r_narrow;  # fallback for a medial r that no ya yit rule above replaced
  246. $placeholder_medial_consonant_y > $zawgyi_medial_consonant_y;
  247. $placeholder_medial_consonant_w > $zawgyi_medial_consonant_w;
  248. $placeholder_medial_consonant_h > $zawgyi_medial_consonant_h;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement