Advertisement
Guest User

utf8_to_names.pl

a guest
Apr 10th, 2019
582
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 3.92 KB | None | 0 0
  1. #! /usr/local/bin/perl -w
  2.  
  3. use strict;
  4.  
  5. my $unidata = "/usr/share/unicode/UnicodeData.txt" // $ENV{"UNIDATA"};
  6. open F, "<", $unidata or die "Can't open $unidata";
  7. my @unidata;
  8. while (<F>) {
  9.     my @l = split /\;/;
  10.     my $n = hex($l[0]);
  11.     $unidata[$n] = $l[1] unless $l[1] =~ /^\<.*\>$/;
  12. }
  13. close F;
  14.  
  15. sub putunichar {
  16.     my $c = shift;
  17.     if ( defined($unidata[$c]) ) {
  18.         printf "U+%04X %s\n", $c, $unidata[$c];
  19.     } elsif ( $c>=0xAC00 && $c<=0xD7AF ) {
  20.         my $n = $c-0xAC00;
  21.         my $l = int($n/(21*28));
  22.         my $v = (int($n/28))%21;
  23.         my $t = $n%28;
  24.         my @partL = ( "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
  25.                       "SS", "", "J", "JJ", "C", "K", "T", "P", "H" );
  26.         my @partV = ( "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
  27.                       "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
  28.                       "I" );
  29.         my @partT = ( "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
  30.                       "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
  31.                       "SS", "NG", "J", "C", "K", "T", "P", "H" );
  32.         printf "U+%04X HANGUL SYLLABLE %s%s%s\n", $c, $partL[$l], $partV[$v], $partT[$t];
  33.     } elsif ( ( $c>=0x3400 && $c<=0x4DB5 )
  34.               || ( $c>=0x4E00 && $c<=0x9FCB )
  35.               || ( $c>=0x20000 && $c<=0x2A6D6 )
  36.               || ( $c>=0x2A700 && $c<=0x2B734 )
  37.               || ( $c>=0x2B740 && $c<=0x2B81D ) ) {
  38.         printf "U+%04X CJK UNIFIED IDEOGRAPH-%04X\n", $c, $c;
  39.     } else {
  40.         printf "U+%04X\n", $c;
  41.     }
  42. }
  43.  
  44. sub dosinglechar {
  45.     # Process a single (ASCII) character.
  46.     my $char = shift;
  47.     $char = ord($char);
  48.     putunichar $char;
  49. }
  50.  
  51. sub dodoublechar {
  52.     my $ch1 = shift;
  53.     my $ch2 = shift;
  54.     $ch1 = ord($ch1);
  55.     $ch2 = ord($ch2);
  56.     putunichar ((($ch1-0300)<<6)|($ch2-0200));
  57. }
  58.  
  59. sub dotriplechar {
  60.     my $ch1 = shift;
  61.     my $ch2 = shift;
  62.     my $ch3 = shift;
  63.     $ch1 = ord($ch1);
  64.     $ch2 = ord($ch2);
  65.     $ch3 = ord($ch3);
  66.     putunichar ((($ch1-0340)<<12)|(($ch2-0200)<<6)|($ch3-0200));
  67. }
  68.  
  69. sub doquadruplechar {
  70.     my $ch1 = shift;
  71.     my $ch2 = shift;
  72.     my $ch3 = shift;
  73.     my $ch4 = shift;
  74.     $ch1 = ord($ch1);
  75.     $ch2 = ord($ch2);
  76.     $ch3 = ord($ch3);
  77.     $ch4 = ord($ch4);
  78.     putunichar ((($ch1-0360)<<18)|(($ch2-0200)<<12)|(($ch3-0200)<<6)|($ch4-0200));
  79. }
  80.  
  81. sub doquintuplechar {
  82.     my $ch1 = shift;
  83.     my $ch2 = shift;
  84.     my $ch3 = shift;
  85.     my $ch4 = shift;
  86.     my $ch5 = shift;
  87.     $ch1 = ord($ch1);
  88.     $ch2 = ord($ch2);
  89.     $ch3 = ord($ch3);
  90.     $ch4 = ord($ch4);
  91.     $ch5 = ord($ch5);
  92.     putunichar ((($ch1-0370)<<24)|(($ch2-0200)<<18)|(($ch3-0200)<<12)|(($ch4-0200)<<6)|($ch5-0200));
  93. }
  94.  
  95. sub dosextuplechar {
  96.     my $ch1 = shift;
  97.     my $ch2 = shift;
  98.     my $ch3 = shift;
  99.     my $ch4 = shift;
  100.     my $ch5 = shift;
  101.     my $ch6 = shift;
  102.     $ch1 = ord($ch1);
  103.     $ch2 = ord($ch2);
  104.     $ch3 = ord($ch3);
  105.     $ch4 = ord($ch4);
  106.     $ch5 = ord($ch5);
  107.     $ch6 = ord($ch6);
  108.     putunichar ((($ch1-0374)<<30)|(($ch2-0200)<<24)|(($ch3-0200)<<18)|(($ch4-0200)<<12)|(($ch5-0200)<<6)|($ch6-0200));
  109. }
  110.  
  111. while ( <> ) {
  112.   GOBBLE:
  113.     while ( 1 ) {
  114.         if ( s/^([\300-\337])([\200-\277])//s ) {
  115.             dodoublechar $1, $2;
  116.         } elsif ( s/^([\340-\357])([\200-\277])([\200-\277])//s ) {
  117.             dotriplechar $1, $2, $3;
  118.         } elsif ( s/^([\360-\367])([\200-\277])([\200-\277])([\200-\277])//s ) {
  119.             doquadruplechar $1, $2, $3, $4;
  120.         } elsif ( s/^([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])//s ) {
  121.             doquintuplechar $1, $2, $3, $4, $5;
  122.         } elsif ( s/^([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])//s ) {
  123.             dosextuplechar $1, $2, $3, $4, $5, $6;
  124.         } elsif ( s/^(.)//s ) {
  125.             dosinglechar $1;
  126.         } else {
  127.             last GOBBLE;
  128.         }
  129.     }
  130. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement