Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
# Reports whether the file named on the command line looks like UTF-8.
#
# Usage: perl SCRIPT FILE
#
# The whole file is read into memory as raw bytes and handed to one of the
# two validators defined in this file: __is_utf8 (Inline C) or _is_utf8
# (pure Perl).
use strict;
use warnings;
use Inline C;
#use Benchmark qw(:all);

# Test for definedness/length rather than truthiness so that a file that is
# literally named "0" is still accepted.
my $file = $ARGV[0];
die("you must specify a file") unless defined $file && length $file;

# Read raw bytes (no PerlIO decoding layer) and fail loudly if the file
# cannot be opened instead of silently validating an undefined value.
open my $fh, "<:raw", $file or die("cannot open '$file': $!");
# $/ is undefined only for the duration of the read (slurp mode).
my $text = do { local $/; <$fh> };
close $fh;
$text = '' unless defined $text;

##$stat=timethese(100, {
## 'Perl' => sub { _is_utf8($text); },
## 'InlineC' => sub { __is_utf8($text); },
##});
##cmpthese($stat) ;

# The C validator receives a NUL-terminated char* and therefore stops at the
# first NUL byte. When the data contains one, use the pure-Perl validator,
# which walks every byte of the string.
my $looks_utf8 = index($text, "\0") < 0
    ? __is_utf8($text)
    : _is_utf8($text);

if ($looks_utf8) { print "\nseems to be utf8\n"; }
else { print "\ndoesn't seems to be utf8\n"; }
# _is_utf8($text)
#
# Returns 1 when every byte of $text belongs to a well-formed UTF-8 sequence
# (plain ASCII included), 0 otherwise. An undefined or empty $text yields 1.
#
# Well-formedness follows the RFC 3629 byte ranges, so the following are
# rejected in addition to stray or missing continuation bytes:
#   * overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F),
#   * UTF-16 surrogates (ED A0..BF),
#   * code points above U+10FFFF (F4 90.. and lead bytes F5 and up),
#   * a multi-byte sequence cut off by the end of the string.
sub _is_utf8 {
    my $text = shift;
    return 1 unless defined $text;

    # Consume one well-formed chunk per iteration, anchored with \G at the
    # position where the previous chunk ended. An explicit loop is used
    # instead of a single (?:...)* so that very long inputs cannot hit the
    # regex engine's iteration limit for complex quantified groups.
    # With /c a failed match leaves pos() where the last good chunk ended.
    pos($text) = 0;
    while (
        $text =~ m{
            \G (?:
                [\x00-\x7F]+                          # run of ASCII bytes
              | [\xC2-\xDF] [\x80-\xBF]               # U+0080 .. U+07FF
              | \xE0 [\xA0-\xBF] [\x80-\xBF]          # U+0800 .. U+0FFF
              | [\xE1-\xEC\xEE\xEF] [\x80-\xBF]{2}    # U+1000 .. U+CFFF, U+E000 .. U+FFFF
              | \xED [\x80-\x9F] [\x80-\xBF]          # U+D000 .. U+D7FF
              | \xF0 [\x90-\xBF] [\x80-\xBF]{2}       # U+10000 .. U+3FFFF
              | [\xF1-\xF3] [\x80-\xBF]{3}            # U+40000 .. U+FFFFF
              | \xF4 [\x80-\x8F] [\x80-\xBF]{2}       # U+100000 .. U+10FFFF
            )
        }gcx
      )
    {
        # Nothing to do: the match itself advances pos($text).
    }

    # The text is valid only if the scan reached the very end.
    return pos($text) == length($text) ? 1 : 0;
}
- __END__
- __C__
/*
 * __is_utf8 - check whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * txt: pointer to a NUL-terminated buffer; a null pointer is treated like an
 *      empty string.
 *
 * Returns 1 if every byte up to the terminating NUL belongs to a well-formed
 * UTF-8 sequence (plain ASCII included), 0 otherwise.
 *
 * Validation follows the RFC 3629 byte ranges, so overlong encodings,
 * UTF-16 surrogates (ED A0..BF) and code points above U+10FFFF are rejected
 * along with stray or missing continuation bytes.
 *
 * Limitation: scanning stops at the first NUL byte, so data after an
 * embedded NUL is never inspected.
 */
int __is_utf8(char *txt) {
    const unsigned char *p = (const unsigned char *)txt;
    unsigned char lead, lo, hi;
    int extra;

    if (!p)
        return 1;

    while ((lead = *p++) != 0) {
        if (lead < 0x80)
            continue;                       /* plain ASCII */

        /* Allowed range of the FIRST continuation byte; only a few lead
         * bytes narrow it (to exclude overlongs, surrogates, > U+10FFFF). */
        lo = 0x80;
        hi = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF) {
            extra = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            extra = 2;
            if (lead == 0xE0)
                lo = 0xA0;                  /* no overlong 3-byte forms */
            else if (lead == 0xED)
                hi = 0x9F;                  /* no surrogates */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            extra = 3;
            if (lead == 0xF0)
                lo = 0x90;                  /* no overlong 4-byte forms */
            else if (lead == 0xF4)
                hi = 0x8F;                  /* nothing above U+10FFFF */
        } else {
            return 0;                       /* 80..C1 or F5..FF as lead byte */
        }

        /* A terminating NUL fails these range checks before the pointer is
         * advanced, so the scan never runs past the end of the string. */
        if (*p < lo || *p > hi)
            return 0;
        p++;

        while (--extra) {
            if ((*p & 0xC0) != 0x80)
                return 0;
            p++;
        }
    }
    return 1;
}
Add Comment
Please, Sign In to add comment