Advertisement
Guest User

Marpa Lao syllables

a guest
Sep 6th, 2014
231
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 13.32 KB | None | 0 0
  1. use utf8;
  2. use strict;
  3. use Marpa::R2;
  4. use Data::Dumper::AutoEncode;
  5.  
  6. my $pg = Marpa::R2::Scanless::G->new({
  7.         bless_package => 'PennTags',
  8.         source         => \(<<'END_OF_SOURCE'),
  9.  
  10. :default ::= action => [ values ] bless => ::lhs
  11. lexeme default = action => [ value ] bless => ::name
  12.  
  13. # string of items
  14.  
  15. S   ::= I*                  bless => S
  16.  
  17. # each item is a Lao syllable or a string of non-Lao text
  18.  
  19. I   ::= SYL                 bless => I
  20.     |   LAONUM              bless => I
  21.     |   LAOABBR             bless => I
  22.     |   NONLAO              bless => I
  23.  
  24. # Lao syllable ordered by vowel number as on http://www.thailao.net/laovowel.htm
  25.  
  26. SYL ::= C1 VASF             bless => SYL #  1. ah (short)
  27.     |   C1 VASM C2          bless => SYL #  2. ah/uh (short)
  28.     |   C1 VASM T C2        bless => SYL #  2. ah/uh (short)
  29.     |   C1 VAL              bless => SYL #  3. ah (long)
  30.     |   C1 T VAL            bless => SYL #  3. ah (long)
  31.     |   C1 VAL C2           bless => SYL #  4. same with final consonant
  32.     |   C1 T VAL C2         bless => SYL #  4. same with final consonant
  33.                                          #  5. ai (long)
  34.                                          #  6. ao (long)
  35.     |   C1 VIS              bless => SYL #  7. ee/i (short)
  36.     |   C1 VIS T            bless => SYL #  7. ee/i (short)
  37.     |   C1 VIS C2           bless => SYL #  8. same with final consonant
  38.     |   C1 VIS T C2         bless => SYL #  8. same with final consonant
  39.     |   C1 VIL              bless => SYL #  9. ee (long)
  40.     |   C1 VIL T            bless => SYL #  9. ee (long)
  41.     |   C1 T VIL            bless => SYL       # tone and top vowel in wrong order
  42.     |   C1 VIL C2           bless => SYL # 10. same with final consonant
  43.     |   C1 VIL T C2         bless => SYL # 10. same with final consonant
  44.                                          # 11. iu (short)
  45.     |   C1 VEUS             bless => SYL # 12. eu (short)
  46.     |   C1 VEUS T           bless => SYL # 12. eu (short)
  47. #    |   C1 VEUS C2          bless => SYL
  48. #    |   C1 VEUS T C2        bless => SYL
  49.     |   C1 VEUL             bless => SYL # 13. eu (long)
  50.     |   C1 VEUL T           bless => SYL # 13. eu (long)
  51.     |   C1 VEUL C2          bless => SYL # 14. same with final consonant
  52.     |   C1 VEUL T C2        bless => SYL # 14. same with final consonant
  53.     |   C1 VUS              bless => SYL # 15. oo (short)
  54.     |   C1 VUS T            bless => SYL # 15. oo (short)
  55.     |   C1 VUS C2           bless => SYL # 16. same with final consonant
  56.     |   C1 VUS T C2         bless => SYL # 16. same with final consonant
  57.                                          # 17. ui (short)
  58.     |   C1 VUL              bless => SYL # 18. oo (long)
  59.     |   C1 VUL T            bless => SYL # 18. oo (long)
  60.     |   C1 VUL C2           bless => SYL # 19. same with final consonant
  61.     |   VE C1 VASM C2       bless => SYL # 20. eh/ay (short)
  62.     |   VE C1 VASM T C2     bless => SYL # 20. eh/ay (short)
  63.     |   VE C1 VASF          bless => SYL # 21. ay (short)
  64.     |   VE C1               bless => SYL # 22. ay (long)
  65.     |   VE C1 T             bless => SYL # 22. ay (long)
  66.     |   VE C1 C2            bless => SYL # 23. same with final consonant
  67.     |   VAE C1 VASF         bless => SYL # 24. ae (short)
  68.     |   VAE C1              bless => SYL # 25. ae (long)
  69.     |   VAE C1 T            bless => SYL # 25. ae (long)
  70.     |   VAE C1 C2           bless => SYL # 26. same with final consonant
  71.     |   VAE C1 T C2         bless => SYL # 26. same with final consonant
  72.                                          # 27. aeo
  73. #   |   VAE C1 VASM C2      bless => SYL # ??? TODO
  74.                                          # 27. aeo
  75.     |   C1 VOUM C2          bless => SYL # 28. oh (short) with final consonant
  76.     |   C1 VOUM T C2        bless => SYL # 28. oh (short) with final consonant
  77.                                          # 29. oh (short) without final consonant
  78.     |   VOL C1              bless => SYL # 30. oh (long)
  79.     |   VOL C1 C2           bless => SYL # 30. oh (long)
  80.                                          # 31. same with final consonant
  81.                                          # 32. oy (oh + ee)
  82.     |   VE C1 VAL VASF      bless => SYL # 33. aw (short)
  83.     |   C1 VOUC             bless => SYL # 34. aw (long) no final consonant
  84.     |   C1 VOUC T           bless => SYL # 34. aw (long) no final consonant
  85.     |   C1 VAW C2           bless => SYL # 35. aw (long) with final consonant
  86.     |   C1 T VAW C2         bless => SYL # 35. aw (long) with final consonant
  87.                                          # 36. oi (aw + ee)
  88.                                          # 37. euh (short)
  89.     |   VE C1 VIS C2        bless => SYL # 38. same with final consonant
  90.     |   VE C1 VIS T C2      bless => SYL # 38. same with final consonant
  91.     |   VE C1 VIL           bless => SYL # 39. euh (long)
  92.     |   VE C1 VIL T         bless => SYL # 39. euh (long)
  93.     |   VE C1 VIL C2        bless => SYL # 40. same with final consonant
  94.     |   VE C1 VIL T C2      bless => SYL # 40. same with final consonant
  95.                                          # 41. euy (euh + ee)
  96.                                          # 42. ia (short)
  97.                                          # 43. ia (long)
  98.     |   C1 VIE C2           bless => SYL # 44. ia (long) with final consonant
  99.     |   C1 T VIE C2         bless => SYL # 44. ia (long) with final consonant
  100.                                          # 45. io (long)
  101.     |   VE C1 VEUL VAW      bless => SYL # 46. eua (long)
  102.     |   VE C1 VEUL T VAW    bless => SYL # 46. eua (long)
  103.     |   VE C1 VEUL VAW C2   bless => SYL # 47. same with final consonant
  104.     |   VE C1 VEUL T VAW C2 bless => SYL # 47. same with final consonant
  105.                                          # 48. euay (long)
  106.                                          # 49. ua (short)
  107.                                          # 50. ua (long)
  108.     |   C1 VW C2            bless => SYL # 51. ua (long) with final consonant
  109.     |   C1 T VW C2          bless => SYL # 51. ua (long) with final consonant
  110.                                          # 52. uay (long)
  111.     |   VAI C1              bless => SYL # 53.54. ai (long/short)
  112.     |   VAI C1 T            bless => SYL # 53.54. ai (long/short)
  113.     |   VE C1 VOUM VAL      bless => SYL # 55. ao (short)
  114.     |   VE C1 VOUM T VAL    bless => SYL # 55. ao (short)
  115.     |   C1 VAM              bless => SYL # 56. am (long/short)
  116.     |   C1 T VAM            bless => SYL # 56. am (long/short)
  117.     |   C1 VOUC VAL         bless => SYL # 56. am (long/short)
  118.     |   C1 VOUC T VAL       bless => SYL # 56. am (long/short)
  119.  
  120.     # not listed at ancientscripts.com or thailao.net/laovowel.htm
  121.  
  122.     |   C1 CANC             bless => SYL
  123.     |   RPT                 bless => SYL
  124.  
  125.     # abbreviation!
  126.     #|   C1                  bless => SYL
  127.  
  128. ########
  129.  
  130. # initial consonant
  131. C1  ::= C1B                 bless => C1
  132.     |   CHH C1L             bless => C1
  133.     |   C1G CVW             bless => C1
  134.     |   CHH C1L CVW         bless => C1
  135.     |   CHH CVW             bless => C1
  136.     |   C1R CR              bless => C1 # old orthography cluster
  137.  
  138. # final consonant
  139. C2  ::= C2B                 bless => C2
  140.     |   C2X                 bless => C2
  141.    
  142. # special lao "syllables" that are not really syllables
  143. LAOABBR ::= CSH CP CP       bless => LAOABBR
  144.         |   CK CM           bless => LAOABBR
  145.  
  146. # vowels
  147. VAE ::= VAEB                bless => VAE
  148.     |   VE VE               bless => VAE # ແ made from two ເ
  149.  
  150. ############### lexical rules ###############
  151.  
  152. # basic initial consonants
  153. C1B ~ 'ຂ' | 'ສ' | 'ຖ' | 'ຜ' | 'ຝ' | 'ຫ'
  154.    | 'ກ' | 'ຈ' | 'ດ' | 'ຕ' | 'ບ' | 'ປ' | 'ຢ' | 'ອ'
  155.    | 'ຄ' | 'ຊ' | 'ທ' | 'ພ' | 'ຟ' | 'ຮ' | 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ວ' | 'ຣ'
  156.    | 'ໜ' | 'ໝ'
  157.  
  158. # low initial consonants which can be modified by a high h prefix
  159. C1L ~ 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ຼ' #| 'ວ'
  160.  
  161. # glide v/w consonant modifier suffix
  162. C1G ~ 'ຂ' | 'ສ' | 'ຖ' | 'ຜ' | 'ຝ' #| 'ຫ'
  163.    | 'ກ' | 'ຈ' | 'ດ' | 'ຕ' | 'ບ' | 'ປ' | 'ຢ' | 'ອ'
  164.    | 'ຄ' | 'ຊ' | 'ທ' | 'ພ' | 'ຟ' | 'ຮ' | 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ວ' | 'ຣ'
  165.    | 'ໜ' | 'ໝ'
  166.  
  167. # C1 consonant cluster with 'r'
  168. C1R ~ 'ຂ' | 'ສ' | 'ຖ' | 'ຜ' | 'ຝ' | 'ຫ'
  169.    | 'ກ' | 'ຈ' | 'ດ' | 'ຕ' | 'ບ' | 'ປ' | 'ຢ' | 'ອ'
  170.    | 'ຄ' | 'ຊ' | 'ທ' | 'ພ' | 'ຟ' | 'ຮ' | 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ວ' #| 'ຣ'
  171.    | 'ໜ' | 'ໝ'
  172.  
  173. # letters used in abbreviation "syllables"
  174. CHH ~ 'ຫ' # high consonant modifier prefix, high h
  175. CVW ~ 'ວ'
  176. CSH ~ 'ສ'
  177. CP ~ 'ປ'
  178. CK ~ 'ກ'
  179. CM ~ 'ມ'
  180. CR ~ 'ຣ'
  181.  
  182. ########
  183.  
  184. # tone mark
  185. T ~ '່' | '້' | '໊' | '໋'
  186. CANC ~ '໌'
  187.  
  188. ########
  189.  
  190. VASF ~ 'ະ'
  191. VASM ~ 'ັ'
  192. VAL ~ 'າ'
  193. VIS ~ 'ິ'
  194. VIL ~ 'ີ'
  195. VEUS ~ 'ຶ'
  196. VEUL ~ 'ື'
  197. VUS ~ 'ຸ'
  198. VUL ~ 'ູ'
  199. VE ~ 'ເ'
  200. VAEB ~ 'ແ'
  201. VOL ~ 'ໂ'
  202. VOUM ~ 'ົ'
  203. VOUC ~ 'ໍ'
  204. VIE ~ 'ຽ'
  205. VAW ~ 'ອ'
  206. VW ~ 'ວ'
  207. #VY ~ 'ຢ'
  208. VAI ~ [ໃໄ]
  209. VAM ~ 'ຳ'
  210.  
  211. ########
  212.  
  213. # final consonant
  214. C2B ~ 'ບ' | 'ດ' | 'ກ' | 'ມ' | 'ນ' | 'ງ' | 'ຍ' | 'ວ'
  215. C2X ~ 'ປ' | 'ຊ' | 'ຕ' | 'ຄ'
  216.  
  217. RPT ~ [ຯໆ]
  218.  
  219. ########
  220.  
  221. LAONUM ~ [\x{0ED0}-\x{0ED9}]+
  222. NONLAO ~ [^\x{0E80}-\x{0EFF}]+
  223.  
  224. END_OF_SOURCE
  225. });
  226.  
  227. my $pr = Marpa::R2::Scanless::R->new( { grammar => $pg } );
  228.  
  229. my $input = <<EOI;
  230. ດິນແດນຂອງສະຫະລັດອາເມຣິກາໃນໝູ່ເກາະປາຊີຟິກ:
  231. EOI
  232.  
  233. print "Trying to parse:\n$input\n\n";
  234. $pr->read(\$input);
  235.  
  236.  
  237.  
  238. my $asf = Marpa::R2::ASF->new( { slr=>$pr } );
  239. my $full_result = $asf->traverse( {}, \&full_traverser );
  240. my $pruned_result = $asf->traverse( {}, \&pruning_traverser );
  241.  
  242. print "Full output:\n".eDumper($full_result);
  243.  
  244. print "Pruned output:\n".eDumper($pruned_result);
  245.  
  246. print scalar(@{$full_result})." interpretations";
  247.  
  248. #########################################################################
  249.  
  250. sub penn_tag {
  251.    my ($symbol_name) = @_;
  252.    return q{.} if $symbol_name eq 'period';
  253.    return $symbol_name;
  254. }
  255.  
  256. sub pruning_traverser {
  257.  
  258.     # This routine converts the glade into a list of Penn-tagged elements.  It is called recursively.
  259.     my ($glade, $scratch)     = @_;
  260.     my $rule_id     = $glade->rule_id();
  261.     my $symbol_id   = $glade->symbol_id();
  262.     my $symbol_name = $pg->symbol_name($symbol_id);
  263.  
  264.     # A token is a single choice, and we know enough to fully Penn-tag it
  265.     if ( not defined $rule_id ) {
  266.         my $literal = $glade->literal();
  267.         my $penn_tag = penn_tag($symbol_name);
  268.         return "($penn_tag $literal)";
  269.     }
  270.  
  271.     my $length = $glade->rh_length();
  272.     my @return_value = map { $glade->rh_value($_) } 0 .. $length - 1;
  273.  
  274.     # Special case for the start rule
  275.     return (join q{ }, @return_value) . "\n" if  $symbol_name eq '[:start]' ;
  276.  
  277.     my $join_ws = q{ };
  278.     $join_ws = qq{\n   } if $symbol_name eq 'S';
  279.     my $penn_tag = penn_tag($symbol_name);
  280.     return "($penn_tag " . ( join $join_ws, @return_value ) . ')';
  281.  
  282. }
  283.  
  284. sub full_traverser {
  285.  
  286.     # This routine converts the glade into a list of Penn-tagged elements.  It is called recursively.
  287.     my ($glade, $scratch)     = @_;
  288.     my $rule_id     = $glade->rule_id();
  289.     my $symbol_id   = $glade->symbol_id();
  290.     my $symbol_name = $pg->symbol_name($symbol_id);
  291.  
  292.     # A token is a single choice, and we know enough to fully Penn-tag it
  293.     if ( not defined $rule_id ) {
  294.         my $literal = $glade->literal();
  295.         my $penn_tag = penn_tag($symbol_name);
  296.         return ["($penn_tag $literal)"];
  297.     } ## end if ( not defined $rule_id )
  298.  
  299.     # Our result will be a list of choices
  300.     my @return_value = ();
  301.  
  302.     CHOICE: while (1) {
  303.  
  304.         # The results at each position are a list of choices, so
  305.         # to produce a new result list, we need to take a Cartesian
  306.         # product of all the choices
  307.         my $length = $glade->rh_length();
  308.         my @results = ( [] );
  309.         for my $rh_ix ( 0 .. $length - 1 ) {
  310.             my @new_results = ();
  311.             for my $old_result (@results) {
  312.                 my $child_value = $glade->rh_value($rh_ix);
  313.                 for my $new_value ( @{ $child_value } ) {
  314.                     push @new_results, [ @{$old_result}, $new_value ];
  315.                 }
  316.             }
  317.             @results = @new_results;
  318.         } ## end for my $rh_ix ( 0 .. $length - 1 )
  319.  
  320.         # Special case for the start rule
  321.         if ( $symbol_name eq '[:start]' ) {
  322.             return [ map { join q{}, @{$_} } @results ];
  323.         }
  324.  
  325.         # Now we have a list of choices, as a list of lists.  Each sub list
  326.         # is a list of Penn-tagged elements, which we need to join into
  327.         # a single Penn-tagged element.  The result will be to collapse
  328.         # one level of lists, and leave us with a list of Penn-tagged
  329.         # elements
  330.         my $join_ws = q{ };
  331.         $join_ws = qq{\n   } if $symbol_name eq 'S';
  332.         push @return_value,
  333.             map { '(' . penn_tag($symbol_name) . q{ } . ( join $join_ws, @{$_} ) . ')' }
  334.             @results;
  335.  
  336.         # Look at the next alternative in this glade, or end the
  337.         # loop if there is none
  338.         last CHOICE if not defined $glade->next();
  339.  
  340.     } ## end CHOICE: while (1)
  341.  
  342.     # Return the list of Penn-tagged elements for this glade
  343.     return \@return_value;
  344. } ## end sub full_traverser
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement