Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
# The -CD switch used to be on the #! line. Since perl 5.10.1 a -C switch on
# the #! line is rejected ("Too late for -CD") unless it is also given on the
# command line, so "perl urdu-segmenter.pl file" would not start. It is not
# needed: STDIN/STDOUT get their UTF-8 layer from the binmode calls below and
# input files are opened with an explicit "<:utf8" layer.
binmode STDOUT, ":utf8";
binmode STDIN, ":utf8";
# Lexically disables all warnings for the rest of the file; this overrides
# the -w switch above.
no warnings;
#-----Description------------------------------------------------------
#
# Program:    urdu-segmenter.pl
# Written by: Danish Munir
# Purpose:    breaks Urdu text into sentences
#
# Syntax: urdu-segmenter.pl [-x] filename
#     or: program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid]
#
# The options are positional: -s (read the text from STDIN) must come
# first, -x (emit XML tags) second, the file name or docid last.
#
# This script takes UTF-8 encoded Urdu text as input and writes the text,
# segmented into sentences, to STDOUT. Without -x every sentence is printed
# on its own line. With -x the output has the following XML format:
#
#   <DOC docid = "Filename" lang = "URD">
#   <SEG id="1">Urdu Sentence 1</SEG>
#   <SEG id="2">Urdu Sentence 2</SEG>
#   <SEG id="3">Urdu Sentence 3</SEG>
#   </DOC>
#
# Sentences are broken at the following delimiters:
#   U+06D4  Arabic full stop ("dash")
#   U+061F  Arabic question mark
#   U+2026  ellipsis
#   U+2022  bullet
#   U+002E  period
#   !       exclamation mark
#   every newline (each newline is doubled before splitting)
#   any run of two or more whitespace characters
#
# A sentence that is three characters or shorter after trimming is dropped.
#-----------------------------------------------------------------------
# Print the usage text and stop when the first argument is -h, -help or
# --help. A missing first argument is treated as an empty string so that the
# pattern matches never operate on an undefined value.
my $first_arg = defined $ARGV[0] ? $ARGV[0] : '';
if ($first_arg =~ m/^-h$/ || $first_arg =~ m/^-+help$/) {
    # Non-interpolating heredoc: the text is printed exactly as written.
    print <<'END_HELP';


urdu-segmenter.pl
-----------------
Syntax: urdu-segmenter.pl [filename]
or urdu-segmenter.pl -x [filename]
or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]
eg: more sourcefile1.txt | urdu-segmenter.pl -s Title
The -x option is used to output xml tags, if and only if the -x option is used
This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.
The xml format of the output is as follows
<DOC docid = "Filename" lang = "URD">
<SEG id="1">Urdu Sentence 1</SEG>
<SEG id="2">Urdu Sentence 2</SEG>
</DOC>
This script breaks urdu sentences based on the following punctuations:
multiple newline characters
[dash]Unicode 06D4
[question]Unicode 061F
[ellipsis]Unicode 2026
[bullet]Unicode 2022


END_HELP
    exit;
}
#Code Starts here
# Command line layout: an optional leading -s (take the text from STDIN),
# then an optional -x (emit XML tags), then a name. With -s the name is an
# optional document id that is used verbatim; without -s it is the path of
# the input file.
#
# $printxml, $filename and $_ are deliberately left as globals because the
# code further down the file reads them.
my @args = @ARGV;                # work on a copy so @ARGV stays intact
my $from_stdin = 0;
if (@args && $args[0] =~ m/^-s$/) {
    $from_stdin = 1;
    shift @args;
}
$printxml = 0;
if (@args && $args[0] =~ m/^-x$/) {
    $printxml = 1;
    shift @args;
}
$filename = $args[0];            # may be undefined in -s mode (docid is optional)

if ($from_stdin) {
    local $/ = undef;            # slurp mode, limited to this block
    $_ = <STDIN>;
}
else {
    # Stop with a clear message instead of handing a missing name to open().
    die "No input file given; run with -h for usage.\n"
        unless defined $filename && length $filename;
    open(I, "<:utf8", $filename) # Open the file passed, or exit upon error
        or die "Cannot open file $filename: $!";
    {
        local $/ = undef;        # read the entire file at once
        $_ = <I>;
    }
    close(I);                    # the whole text is in $_ now
    $filename =~ s/.*\///;       # strip the directory part ...
    $filename =~ s/\.[^\.]*$//;  # ... and the extension; the rest is the docid
}
# segment_text($text)
#
# Splits $text into sentences and returns them as a list of strings.
# Each returned string is the trimmed sentence followed by the delimiter that
# ended it, unless that delimiter contains a newline, carriage return or
# bullet, in which case the delimiter is dropped. Sentences that are three
# characters or shorter after trimming are discarded together with their
# delimiter. An undefined $text yields an empty list.
sub segment_text {
    my ($text) = @_;
    return () unless defined $text;

    $text =~ s/\r//g;                          # drop carriage returns
    $text =~ s/\n/\n\n/g;                      # every line break becomes a sentence break
    $text =~ s/\s*\x{2022}\s*/\n\n\n\n\n/g;    # Replace bullets with sentence breaks.

    # The next three substitutions tidy up trailing blanks and blank-only
    # lines. The two anchored ones need /m: the text is one multi-line string,
    # and without /m "^" and "$" only anchor at the ends of the whole input.
    $text =~ s/\t* +\t*$/ /mg;
    $text =~ s/[\n\x{000D}][ ]+[\n\x{000D}]/\n\n/g;
    $text =~ s/^[\t\x{0020}]+$/\n\n/mg;

    # Remove pipe characters. The "|" must be escaped: a bare s/|//g is an
    # alternation of two empty patterns and removes nothing.
    $text =~ s/\|//g;

    # The capturing group makes split return the delimiters as well, so the
    # list alternates: sentence, delimiter, sentence, delimiter, ...
    my @parts = split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/, $text);

    my @segments;
    for (my $i = 0; $i < @parts; $i += 2) {
        my $sentence = $parts[$i];
        $sentence =~ s/^\s*(.*?)\s*$/$1/g;     # trim surrounding white space
        next if length($sentence) <= 3 || $sentence =~ m/^\s+$/;

        # The last sentence may have no delimiter after it.
        my $delimiter = $parts[$i + 1];
        if (!defined $delimiter || $delimiter =~ m/[\n\x{000d}\x{2022}]/) {
            $delimiter = '';                   # layout delimiters are not printed
        }
        push @segments, $sentence . $delimiter;
    }
    return @segments;
}

# Emit the sentences found in $_ (the slurped input). With $printxml set they
# are wrapped in <DOC>/<SEG> tags, otherwise printed one per line.
if ($printxml) {
    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
}
my $seg_id = 1;                                # segment counter
foreach my $segment (segment_text($_)) {
    if ($printxml) {
        print "<SEG id=\"$seg_id\">$segment</SEG>\n";
    }
    else {
        print "$segment\n";
    }
    $seg_id++;
}
if ($printxml) {
    print "</DOC>\n";                          # Close DOC tag.
}
# A bare "close;" closes the currently selected handle (STDOUT), not the
# input. Close the input handle explicitly, and only if it is actually open
# (it never is when the text came from STDIN).
close(I) if defined fileno(I);
Add Comment
Please, Sign In to add comment