Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- use utf8;
- use strict;
- use Marpa::R2;
- use Data::Dumper::AutoEncode;
- my $pg = Marpa::R2::Scanless::G->new({
- bless_package => 'PennTags',
- source => \(<<'END_OF_SOURCE'),
- :default ::= action => [ values ] bless => ::lhs
- lexeme default = action => [ value ] bless => ::name
- # string of items
- S ::= I* bless => S
- # each item is a Lao syllable or a string of non-Lao text
- I ::= SYL bless => I
- | LAONUM bless => I
- | LAOABBR bless => I
- | NONLAO bless => I
- # Lao syllable ordered by vowel number as on http://www.thailao.net/laovowel.htm
- SYL ::= C1 VASF bless => SYL # 1. ah (short)
- | C1 VASM C2 bless => SYL # 2. ah/uh (short)
- | C1 VASM T C2 bless => SYL # 2. ah/uh (short)
- | C1 VAL bless => SYL # 3. ah (long)
- | C1 T VAL bless => SYL # 3. ah (long)
- | C1 VAL C2 bless => SYL # 4. same with final consonant
- | C1 T VAL C2 bless => SYL # 4. same with final consonant
- # 5. ai (long)
- # 6. ao (long)
- | C1 VIS bless => SYL # 7. ee/i (short)
- | C1 VIS T bless => SYL # 7. ee/i (short)
- | C1 VIS C2 bless => SYL # 8. same with final consonant
- | C1 VIS T C2 bless => SYL # 8. same with final consonant
- | C1 VIL bless => SYL # 9. ee (long)
- | C1 VIL T bless => SYL # 9. ee (long)
- | C1 T VIL bless => SYL # tone and top vowel in wrong order
- | C1 VIL C2 bless => SYL # 10. same with final consonant
- | C1 VIL T C2 bless => SYL # 10. same with final consonant
- # 11. iu (short)
- | C1 VEUS bless => SYL # 12. eu (short)
- | C1 VEUS T bless => SYL # 12. eu (short)
- # | C1 VEUS C2 bless => SYL
- # | C1 VEUS T C2 bless => SYL
- | C1 VEUL bless => SYL # 13. eu (long)
- | C1 VEUL T bless => SYL # 13. eu (long)
- | C1 VEUL C2 bless => SYL # 14. same with final consonant
- | C1 VEUL T C2 bless => SYL # 14. same with final consonant
- | C1 VUS bless => SYL # 15. oo (short)
- | C1 VUS T bless => SYL # 15. oo (short)
- | C1 VUS C2 bless => SYL # 16. same with final consonant
- | C1 VUS T C2 bless => SYL # 16. same with final consonant
- # 17. ui (short)
- | C1 VUL bless => SYL # 18. oo (long)
- | C1 VUL T bless => SYL # 18. oo (long)
- | C1 VUL C2 bless => SYL # 19. same with final consonant
- | VE C1 VASM C2 bless => SYL # 20. eh/ay (short)
- | VE C1 VASM T C2 bless => SYL # 20. eh/ay (short)
- | VE C1 VASF bless => SYL # 21. ay (short)
- | VE C1 bless => SYL # 22. ay (long)
- | VE C1 T bless => SYL # 22. ay (long)
- | VE C1 C2 bless => SYL # 23. same with final consonant
- | VAE C1 VASF bless => SYL # 24. ae (short)
- | VAE C1 bless => SYL # 25. ae (long)
- | VAE C1 T bless => SYL # 25. ae (long)
- | VAE C1 C2 bless => SYL # 26. same with final consonant
- | VAE C1 T C2 bless => SYL # 26. same with final consonant
- # 27. aeo
- # | VAE C1 VASM C2 bless => SYL # ??? TODO
- # 27. aeo
- | C1 VOUM C2 bless => SYL # 28. oh (short) with final consonant
- | C1 VOUM T C2 bless => SYL # 28. oh (short) with final consonant
- # 29. oh (short) without final consonant
- | VOL C1 bless => SYL # 30. oh (long)
- | VOL C1 C2 bless => SYL # 30. oh (long)
- # 31. same with final consonant
- # 32. oy (oh + ee)
- | VE C1 VAL VASF bless => SYL # 33. aw (short)
- | C1 VOUC bless => SYL # 34. aw (long) no final consonant
- | C1 VOUC T bless => SYL # 34. aw (long) no final consonant
- | C1 VAW C2 bless => SYL # 35. aw (long) with final consonant
- | C1 T VAW C2 bless => SYL # 35. aw (long) with final consonant
- # 36. oi (aw + ee)
- # 37. euh (short)
- | VE C1 VIS C2 bless => SYL # 38. same with final consonant
- | VE C1 VIS T C2 bless => SYL # 38. same with final consonant
- | VE C1 VIL bless => SYL # 39. euh (long)
- | VE C1 VIL T bless => SYL # 39. euh (long)
- | VE C1 VIL C2 bless => SYL # 40. same with final consonant
- | VE C1 VIL T C2 bless => SYL # 40. same with final consonant
- # 41. euy (euh + ee)
- # 42. ia (short)
- # 43. ia (long)
- | C1 VIE C2 bless => SYL # 44. ia (long) with final consonant
- | C1 T VIE C2 bless => SYL # 44. ia (long) with final consonant
- # 45. io (long)
- | VE C1 VEUL VAW bless => SYL # 46. eua (long)
- | VE C1 VEUL T VAW bless => SYL # 46. eua (long)
- | VE C1 VEUL VAW C2 bless => SYL # 47. same with final consonant
- | VE C1 VEUL T VAW C2 bless => SYL # 47. same with final consonant
- # 48. euay (long)
- # 49. ua (short)
- # 50. ua (long)
- | C1 VW C2 bless => SYL # 51. ua (long) with final consonant
- | C1 T VW C2 bless => SYL # 51. ua (long) with final consonant
- # 52. uay (long)
- | VAI C1 bless => SYL # 53.54. ai (long/short)
- | VAI C1 T bless => SYL # 53.54. ai (long/short)
- | VE C1 VOUM VAL bless => SYL # 55. ao (short)
- | VE C1 VOUM T VAL bless => SYL # 55. ao (short)
- | C1 VAM bless => SYL # 56. am (long/short)
- | C1 T VAM bless => SYL # 56. am (long/short)
- | C1 VOUC VAL bless => SYL # 56. am (long/short)
- | C1 VOUC T VAL bless => SYL # 56. am (long/short)
- # not listed at ancientscripts.com or thailao.net/laovowel.htm
- | C1 CANC bless => SYL
- | RPT bless => SYL
- # abbreviation!
- #| C1 bless => SYL
- ########
- # initial consonant
- C1 ::= C1B bless => C1
- | CHH C1L bless => C1
- | C1G CVW bless => C1
- | CHH C1L CVW bless => C1
- | CHH CVW bless => C1
- | C1R CR bless => C1 # old orthography cluster
- # final consonant
- C2 ::= C2B bless => C2
- | C2X bless => C2
- # special lao "syllables" that are not really syllables
- LAOABBR ::= CSH CP CP bless => LAOABBR
- | CK CM bless => LAOABBR
- # vowels
- VAE ::= VAEB bless => VAE
- | VE VE bless => VAE # ແ made from two ເ
- ############### lexical rules ###############
- # basic initial consonants
- C1B ~ 'ຂ' | 'ສ' | 'ຖ' | 'ຜ' | 'ຝ' | 'ຫ'
- | 'ກ' | 'ຈ' | 'ດ' | 'ຕ' | 'ບ' | 'ປ' | 'ຢ' | 'ອ'
- | 'ຄ' | 'ຊ' | 'ທ' | 'ພ' | 'ຟ' | 'ຮ' | 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ວ' | 'ຣ'
- | 'ໜ' | 'ໝ'
- # low initial consonants which can be modified by a high h prefix
- C1L ~ 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ຼ' #| 'ວ'
- # glide v/w consonant modifier suffix
- C1G ~ 'ຂ' | 'ສ' | 'ຖ' | 'ຜ' | 'ຝ' #| 'ຫ'
- | 'ກ' | 'ຈ' | 'ດ' | 'ຕ' | 'ບ' | 'ປ' | 'ຢ' | 'ອ'
- | 'ຄ' | 'ຊ' | 'ທ' | 'ພ' | 'ຟ' | 'ຮ' | 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ວ' | 'ຣ'
- | 'ໜ' | 'ໝ'
- # C1 consonant cluster with 'r'
- C1R ~ 'ຂ' | 'ສ' | 'ຖ' | 'ຜ' | 'ຝ' | 'ຫ'
- | 'ກ' | 'ຈ' | 'ດ' | 'ຕ' | 'ບ' | 'ປ' | 'ຢ' | 'ອ'
- | 'ຄ' | 'ຊ' | 'ທ' | 'ພ' | 'ຟ' | 'ຮ' | 'ງ' | 'ຍ' | 'ນ' | 'ມ' | 'ລ' | 'ວ' #| 'ຣ'
- | 'ໜ' | 'ໝ'
- # letters used in abbreviation "syllables"
- CHH ~ 'ຫ' # high consonant modifier prefix, high h
- CVW ~ 'ວ'
- CSH ~ 'ສ'
- CP ~ 'ປ'
- CK ~ 'ກ'
- CM ~ 'ມ'
- CR ~ 'ຣ'
- ########
- # tone mark
- T ~ '່' | '້' | '໊' | '໋'
- CANC ~ '໌'
- ########
- VASF ~ 'ະ'
- VASM ~ 'ັ'
- VAL ~ 'າ'
- VIS ~ 'ິ'
- VIL ~ 'ີ'
- VEUS ~ 'ຶ'
- VEUL ~ 'ື'
- VUS ~ 'ຸ'
- VUL ~ 'ູ'
- VE ~ 'ເ'
- VAEB ~ 'ແ'
- VOL ~ 'ໂ'
- VOUM ~ 'ົ'
- VOUC ~ 'ໍ'
- VIE ~ 'ຽ'
- VAW ~ 'ອ'
- VW ~ 'ວ'
- #VY ~ 'ຢ'
- VAI ~ [ໃໄ]
- VAM ~ 'ຳ'
- ########
- # final consonant
- C2B ~ 'ບ' | 'ດ' | 'ກ' | 'ມ' | 'ນ' | 'ງ' | 'ຍ' | 'ວ'
- C2X ~ 'ປ' | 'ຊ' | 'ຕ' | 'ຄ'
- RPT ~ [ຯໆ]
- ########
- LAONUM ~ [\x{0ED0}-\x{0ED9}]+
- NONLAO ~ [^\x{0E80}-\x{0EFF}]+
- END_OF_SOURCE
- });
- my $pr = Marpa::R2::Scanless::R->new( { grammar => $pg } );
- my $input = <<EOI;
- ດິນແດນຂອງສະຫະລັດອາເມຣິກາໃນໝູ່ເກາະປາຊີຟິກ:
- EOI
- print "Trying to parse:\n$input\n\n";
- $pr->read(\$input);
- my $asf = Marpa::R2::ASF->new( { slr=>$pr } );
- my $full_result = $asf->traverse( {}, \&full_traverser );
- my $pruned_result = $asf->traverse( {}, \&pruning_traverser );
- print "Full output:\n".eDumper($full_result);
- print "Pruned output:\n".eDumper($pruned_result);
- print scalar(@{$full_result})." interpretations";
- #########################################################################
- sub penn_tag {
- my ($symbol_name) = @_;
- return q{.} if $symbol_name eq 'period';
- return $symbol_name;
- }
- sub pruning_traverser {
- # This routine converts the glade into a list of Penn-tagged elements. It is called recursively.
- my ($glade, $scratch) = @_;
- my $rule_id = $glade->rule_id();
- my $symbol_id = $glade->symbol_id();
- my $symbol_name = $pg->symbol_name($symbol_id);
- # A token is a single choice, and we know enough to fully Penn-tag it
- if ( not defined $rule_id ) {
- my $literal = $glade->literal();
- my $penn_tag = penn_tag($symbol_name);
- return "($penn_tag $literal)";
- }
- my $length = $glade->rh_length();
- my @return_value = map { $glade->rh_value($_) } 0 .. $length - 1;
- # Special case for the start rule
- return (join q{ }, @return_value) . "\n" if $symbol_name eq '[:start]' ;
- my $join_ws = q{ };
- $join_ws = qq{\n } if $symbol_name eq 'S';
- my $penn_tag = penn_tag($symbol_name);
- return "($penn_tag " . ( join $join_ws, @return_value ) . ')';
- }
- sub full_traverser {
- # This routine converts the glade into a list of Penn-tagged elements. It is called recursively.
- my ($glade, $scratch) = @_;
- my $rule_id = $glade->rule_id();
- my $symbol_id = $glade->symbol_id();
- my $symbol_name = $pg->symbol_name($symbol_id);
- # A token is a single choice, and we know enough to fully Penn-tag it
- if ( not defined $rule_id ) {
- my $literal = $glade->literal();
- my $penn_tag = penn_tag($symbol_name);
- return ["($penn_tag $literal)"];
- } ## end if ( not defined $rule_id )
- # Our result will be a list of choices
- my @return_value = ();
- CHOICE: while (1) {
- # The results at each position are a list of choices, so
- # to produce a new result list, we need to take a Cartesian
- # product of all the choices
- my $length = $glade->rh_length();
- my @results = ( [] );
- for my $rh_ix ( 0 .. $length - 1 ) {
- my @new_results = ();
- for my $old_result (@results) {
- my $child_value = $glade->rh_value($rh_ix);
- for my $new_value ( @{ $child_value } ) {
- push @new_results, [ @{$old_result}, $new_value ];
- }
- }
- @results = @new_results;
- } ## end for my $rh_ix ( 0 .. $length - 1 )
- # Special case for the start rule
- if ( $symbol_name eq '[:start]' ) {
- return [ map { join q{}, @{$_} } @results ];
- }
- # Now we have a list of choices, as a list of lists. Each sub list
- # is a list of Penn-tagged elements, which we need to join into
- # a single Penn-tagged element. The result will be to collapse
- # one level of lists, and leave us with a list of Penn-tagged
- # elements
- my $join_ws = q{ };
- $join_ws = qq{\n } if $symbol_name eq 'S';
- push @return_value,
- map { '(' . penn_tag($symbol_name) . q{ } . ( join $join_ws, @{$_} ) . ')' }
- @results;
- # Look at the next alternative in this glade, or end the
- # loop if there is none
- last CHOICE if not defined $glade->next();
- } ## end CHOICE: while (1)
- # Return the list of Penn-tagged elements for this glade
- return \@return_value;
- } ## end sub full_traverser
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement