Guest User

Untitled

a guest
Aug 19th, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.27 KB | None | 0 0
  1. #!/usr/bin/perl -w
  2.  
  3. use Inline C;
  4. #use Benchmark qw(:all);
  5.  
  # Main program: read the file named by the first command-line argument as
  # raw bytes and print whether its contents look like UTF-8.

  # Test defined-ness/length rather than truth so a file literally named "0"
  # is accepted; a missing or empty argument still dies with the same message.
  my $file = $ARGV[0];
  die("you must specify a file") unless defined $file && length $file;

  # ":raw" keeps PerlIO from translating anything: the checker must see the
  # bytes exactly as stored. A failed open is fatal, because otherwise an
  # unreadable file would be checked as empty input and reported as UTF-8.
  open(my $fh, "<:raw", $file) or die("cannot open '$file': $!");

  # Slurp the whole file; "local $/" limits slurp mode to this single read
  # instead of changing the input record separator for the whole program.
  my $text = do { local $/; <$fh> };
  close $fh;
  $text = "" unless defined $text;

  # Disabled benchmark comparing the pure-Perl and Inline C checkers
  # (needs the commented-out "use Benchmark" line near the top of the file).
  ##$stat=timethese(100, {
  ## 'Perl' => sub { _is_utf8($text); },
  ## 'InlineC' => sub { __is_utf8($text); },
  ##});
  ##cmpthese($stat) ;

  # __is_utf8 is the C implementation from the __C__ section of this file.
  if (__is_utf8($text)) {
      print "\nseems to be utf8\n";
  }
  else {
      print "\ndoesn't seems to be utf8\n";
  }
  21.  
  # _is_utf8($text)
  #
  # Pure-Perl check that the byte values of $text form well-formed UTF-8.
  #
  # Parameters:
  #   $text - string to inspect; it is examined via unpack("C*", ...).
  #           An undefined value is treated like the empty string.
  #
  # Returns 1 when every byte is either plain ASCII or part of a well-formed
  # multi-byte sequence, and 0 otherwise. Rejected input includes:
  #   - stray continuation bytes and invalid lead bytes (0x80..0xC1, 0xF5..0xFF)
  #   - sequences cut short by the end of the input
  #   - overlong encodings (e.g. C0 AF, E0 80 xx, F0 80 xx xx)
  #   - UTF-16 surrogates (ED A0..BF xx)
  #   - code points above U+10FFFF (F4 90.. and beyond)
  sub _is_utf8 {
      my ($text) = @_;
      return 1 unless defined $text;

      my @bytes = unpack("C*", $text);
      my $len   = scalar @bytes;
      my $i     = 0;

      while ($i < $len) {
          my $lead = $bytes[$i++];
          next if $lead < 0x80;    # plain ASCII

          # $need: number of continuation bytes that must follow the lead.
          # ($lo, $hi): allowed range for the FIRST continuation byte only;
          # some lead bytes narrow it to exclude overlong forms, surrogates
          # and values beyond U+10FFFF. Later bytes are always 0x80..0xBF.
          my ($need, $lo, $hi) = (0, 0x80, 0xBF);
          if    ($lead >= 0xC2 && $lead <= 0xDF) { $need = 1; }
          elsif ($lead == 0xE0)                  { $need = 2; $lo = 0xA0; }
          elsif ($lead == 0xED)                  { $need = 2; $hi = 0x9F; }
          elsif ($lead >= 0xE1 && $lead <= 0xEF) { $need = 2; }
          elsif ($lead == 0xF0)                  { $need = 3; $lo = 0x90; }
          elsif ($lead == 0xF4)                  { $need = 3; $hi = 0x8F; }
          elsif ($lead >= 0xF1 && $lead <= 0xF3) { $need = 3; }
          else                                   { return 0; }

          # Truncated sequence: bail out before indexing past the array,
          # which would otherwise yield undef and a warning under -w.
          return 0 if $i + $need > $len;

          for (1 .. $need) {
              my $cont = $bytes[$i++];
              return 0 if $cont < $lo || $cont > $hi;
              ($lo, $hi) = (0x80, 0xBF);
          }
      }

      return 1;
  }
  49.  
  50. __END__
  51.  
  52. __C__
  53.  
  /*
   * __is_utf8 - check that a NUL-terminated byte string is well-formed UTF-8.
   *
   * txt: pointer to the bytes to inspect. Scanning stops at the first NUL
   *      byte, so anything after an embedded NUL is not examined.
   *      A null pointer is treated like an empty string.
   *
   * Returns 1 when every byte up to the terminator is plain ASCII or part of
   * a well-formed multi-byte sequence, 0 otherwise. Rejected input includes
   * stray continuation bytes, invalid lead bytes (0x80..0xC1, 0xF5..0xFF),
   * sequences cut short by the terminator, overlong encodings, UTF-16
   * surrogates (ED A0..BF xx) and code points above U+10FFFF.
   */
  int __is_utf8(char *txt) {
      const unsigned char *p = (const unsigned char *)txt;
      unsigned char c;

      if (!txt) return 1;

      while ((c = *p++) != 0) {
          int need;                 /* continuation bytes still expected */
          unsigned char lo = 0x80;  /* allowed range of the FIRST        */
          unsigned char hi = 0xBF;  /* continuation byte only            */

          if (c < 0x80) continue;   /* plain ASCII */

          /* Some lead bytes narrow the range of the first continuation byte
           * to exclude overlong forms, surrogates and values > U+10FFFF. */
          if      (c >= 0xC2 && c <= 0xDF) { need = 1; }
          else if (c == 0xE0)              { need = 2; lo = 0xA0; }
          else if (c == 0xED)              { need = 2; hi = 0x9F; }
          else if (c >= 0xE1 && c <= 0xEF) { need = 2; }
          else if (c == 0xF0)              { need = 3; lo = 0x90; }
          else if (c == 0xF4)              { need = 3; hi = 0x8F; }
          else if (c >= 0xF1 && c <= 0xF3) { need = 3; }
          else                             { return 0; }

          while (need--) {
              c = *p++;
              /* The NUL terminator (0) is below lo, so a truncated sequence
               * fails here before the scan can run past the end. */
              if (c < lo || c > hi) return 0;
              lo = 0x80;
              hi = 0xBF;
          }
      }
      return 1;
  }
Add Comment
Please, Sign In to add comment