Untitled

#!/usr/bin/perl -w -CD
# Put the UTF-8 layer on both standard streams so that Urdu text is handled
# as characters rather than raw bytes, on the way in and on the way out.
for my $std_stream (\*STDOUT, \*STDIN) {
    binmode($std_stream, ":utf8");
}

# Every warning category is disabled from this point to the end of the file.
no warnings;


#-----Description------------------------------------------------------
#
# Program:urdu-segmenter.pl
# Written by: Danish Munir
# Purpose:breaks urdu text into sentences
#
# Syntax: urdu-segmenter.pl [-x] filename
#     or: program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid(optional)]
# This script takes a utf8 encoded file with Urdu text as input
# and outputs to STDOUT, the text after segmenting it into sentences.
#
# When the -x option is given, the output is wrapped in xml tags as follows
# <DOC docid = "Filename" lang = "URD">
# <SEG id="1">Urdu Sentence 1</SEG>
# <SEG id="2">Urdu Sentence 2</SEG>
# <SEG id="3">Urdu Sentence 3</SEG>
# </DOC>
# Without -x, each sentence is printed on a line of its own with no tags.
#
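# This script breaks Urdu sentences at the following delimiters:
# [full stop]   Unicode 06D4 (ARABIC FULL STOP, which looks like a dash)
# [question]    Unicode 061F
# [ellipsis]    Unicode 2026
# [bullet]      Unicode 2022
# [exclamation] "!" and [period] Unicode 002E
# newline characters (every newline is doubled first, so a single one is
# enough) and runs of two or more whitespace characters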
#-----------------------------------------------------------------------

# Help mode: when the first argument is -h, or "help" preceded by one or more
# dashes (-help, --help, ...), print the usage text to STDOUT and exit before
# any input is read.
#
# The text is a non-interpolating heredoc so that the quotes in the XML sample
# need no escaping. The blank lines at its start and end are intentional:
# they reproduce the padding the help output has always had.
if (defined $ARGV[0] && $ARGV[0] =~ m/^(?:-h|-+help)$/) {
    print <<'END_HELP';


urdu-segmenter.pl
-----------------
Syntax: urdu-segmenter.pl [filename]
 or urdu-segmenter.pl -x [filename]
 or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
 or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]
 eg: more sourcefile1.txt | urdu-segmenter.pl -s Title

The xml tags are output if, and only if, the -x option is used.

This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.

The xml format of the output is as follows
<DOC docid = "Filename" lang = "URD">
<SEG id="1">Urdu Sentence 1</SEG>
<SEG id="2">Urdu Sentence 2</SEG>
</DOC>

This script breaks urdu sentences based on the following punctuations:

 multiple newline characters
 [dash]Unicode 06D4
 [question]Unicode 061F
 [ellipsis]Unicode 2026
 [bullet]Unicode 2022



END_HELP
    exit;
}


#Code Starts here
# Parse the command line and slurp the whole input into $_.
#
# Accepted forms:
#   urdu-segmenter.pl [-x] filename          read the named file
#   ... | urdu-segmenter.pl -s [-x] [docid]  read STDIN
#   ... | urdu-segmenter.pl -x -s [docid]    same; the flags may come in
#                                            either order
#
# Globals set for the rest of the script:
#   $printxml  1 when -x was given (emit the DOC/SEG tags), otherwise 0
#   $filename  the document id: in STDIN mode the optional docid argument
#              taken verbatim; in file mode the file name with its
#              directory part and last extension removed
#   $_         the complete input text (empty string if nothing was read)
{
    my @args = @ARGV;
    my %flag_seen;

    # Consume leading -s / -x flags. Each flag is honoured once only, so a
    # repeated flag is left in place and treated as the docid or file name,
    # as it always has been.
    while (@args) {
        my ($flag) = $args[0] =~ m/^-([sx])$/ or last;
        last if $flag_seen{$flag};
        $flag_seen{$flag} = 1;
        shift @args;
    }

    $printxml = $flag_seen{x} ? 1 : 0;

    my $input_text;
    if ($flag_seen{s}) {
        # STDIN mode: the docid is optional and is used exactly as given.
        $filename = defined $args[0] ? $args[0] : '';
        $input_text = do { local $/; <STDIN> };   # undef $/ reads everything at once
    }
    else {
        my $path = $args[0];
        die "No input file given (run with -h for usage)\n"
            unless defined $path && length $path;

        open(my $in, "<:utf8", $path)   # Open the file passed, or exit upon error
            or die "Cannot open file $path: $!";
        $input_text = do { local $/; <$in> };
        close($in);

        # Derive the document id from the file name.
        $filename = $path;
        $filename =~ s/.*\///;        # drop the directory part
        $filename =~ s/\.[^\.]*$//;   # drop the last extension
    }

    $_ = defined $input_text ? $input_text : '';
}
# normalize_urdu_text($text)
#
# Prepares raw input text for sentence splitting and returns the result.
# An undefined argument is treated as the empty string.
#
# Steps, in order:
#   1. delete every carriage return;
#   2. double every newline, so that a single line break already counts as a
#      run of two newlines;
#   3. replace each bullet (U+2022), together with the whitespace around it,
#      by five newlines, turning it into a sentence break;
#   4. tidy whitespace at the end of the text and on lines holding only spaces;
#   5. delete every pipe character.
sub normalize_urdu_text {
    my ($text) = @_;
    $text = '' unless defined $text;

    $text =~ s/\r//g;
    $text =~ s/\n/\n\n/g;
    $text =~ s/\s*\x{2022}\s*/\n\n\n\n\n/g;   # Replace bullets with sentence breaks.

    # NOTE(review): the first and third patterns below have no /m modifier, so
    # their ^ and $ anchor to the whole text and not to each line. They are
    # kept as they were; adding /m may have been the intent - confirm before
    # changing.
    $text =~ s/\t* +\t*$/ /g;
    $text =~ s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/g;   # a spaces-only line becomes a plain break
    $text =~ s/^[\t\x{0020}]+$/\n\n/g;

    # Remove pipe characters. The pipe must be escaped: a bare | is an
    # alternation of two empty patterns, which matches the empty string
    # everywhere and removes nothing.
    $text =~ s/\|//g;

    return $text;
}

# Open the document element before any segment is printed (xml mode only).
if ($printxml) {
    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
}

$_ = normalize_urdu_text($_);


# split_sentences($text)
#
# Splits the text at every sentence delimiter and returns a flat list in which
# each piece of text (even index) is followed by the delimiter that ended it
# (odd index); the delimiters are kept because the pattern captures them.
# Delimiters: runs of two or more newlines, "!", U+061F, U+06D4, U+2022,
# carriage return, runs of two or more whitespace characters, U+2026 and
# U+002E. An undefined argument yields an empty list.
sub split_sentences {
    my ($text) = @_;
    return () unless defined $text;
    return split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/, $text);
}

# build_segments(@pieces)
#
# Takes the alternating (text, delimiter, text, delimiter, ...) list produced
# by split_sentences and returns the finished sentences, in order.
#   * Each text piece is trimmed of leading and trailing whitespace.
#   * Pieces of three characters or fewer after trimming are discarded,
#     together with their delimiter.
#     NOTE(review): this also drops real sentences that short - confirm the
#     threshold is intended.
#   * The delimiter is appended only when it is visible punctuation. Line
#     breaks, bullets and pure-whitespace runs are dropped, so a segment
#     never ends in whitespace.
sub build_segments {
    my @pieces = @_;
    my @segments;

    for (my $idx = 0; $idx < @pieces; $idx += 2) {
        my $sentence  = $pieces[$idx];
        my $delimiter = defined $pieces[$idx + 1] ? $pieces[$idx + 1] : '';

        $sentence =~ s/^\s+//;
        $sentence =~ s/\s+$//;
        next if length($sentence) <= 3;

        if ($delimiter =~ m/[\n\x{000d}\x{2022}]/ || $delimiter !~ m/\S/) {
            $delimiter = '';
        }
        push @segments, $sentence . $delimiter;
    }
    return @segments;
}

my @sentences = split_sentences($_);   # alternating text / delimiter pieces

# Print one sentence per line. In xml mode each one is wrapped in a SEG
# element numbered from 1.
# NOTE(review): the sentence text is written as-is, without XML escaping.
my $j = 1;   # segment counter
for my $segment (build_segments(@sentences)) {
    if ($printxml) {
        print "<SEG id=\"$j\">$segment</SEG>\n";
    }
    else {
        print "$segment\n";
    }
    $j++;
}

if ($printxml) {
    print "</DOC>\n";   # Close DOC tag.
}

# A bare close() acts on the currently selected handle, which is STDOUT here.
# It is named explicitly and checked so that a failed write is reported.
close(STDOUT) or die "Cannot finish writing output: $!";