Guest User

Untitled

a guest
Apr 23rd, 2018
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.19 KB | None | 0 0
  #!/usr/bin/perl
  #
  # The interpreter line deliberately carries no switches:
  #   * "-CD" on the #! line makes "perl urdu-segmenter.pl ..." abort with
  #     'Too late for "-CD" option' on perl 5.10.1 and later.  The UTF-8
  #     layers are applied explicitly instead (binmode below, "<:utf8" on open).
  #   * "-w" had no effect on the rest of this file because of "no warnings".
  binmode STDOUT, ":utf8";
  binmode STDIN, ":utf8";
  no warnings;    # the option checks below match against possibly-undefined @ARGV entries

  #-----Description------------------------------------------------------
  #
  # Program:    urdu-segmenter.pl
  # Written by: Danish Munir
  # Purpose:    breaks Urdu text into sentences
  #
  # Syntax: urdu-segmenter.pl [-x] filename
  #     or  program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid(optional)]
  #
  # -s must be the first argument; -x (if wanted) follows it in STDIN mode.
  #
  # This script takes UTF-8 encoded Urdu text as input (a file, or STDIN
  # with -s) and writes the text to STDOUT after segmenting it into
  # sentences.  Without -x every sentence is printed on its own line.
  #
  # With -x the output is wrapped in XML tags as follows
  # <DOC docid = "Filename" lang = "URD">
  # <SEG id="1">Urdu Sentence 1</SEG>
  # <SEG id="2">Urdu Sentence 2</SEG>
  # <SEG id="3">Urdu Sentence 3</SEG>
  # </DOC>
  #
  # Sentences are broken at the following delimiters:
  # [dash]      Unicode 06D4
  # [question]  Unicode 061F
  # [ellipsis]  Unicode 2026
  # [bullet]    Unicode 2022
  # [full stop] Unicode 002E
  # [exclamation mark] !
  # newlines (every newline is doubled before splitting, so each line
  #           break ends a sentence) and carriage returns
  # runs of two or more whitespace characters
  #
  # Candidate sentences of three characters or fewer are discarded.
  #-----------------------------------------------------------------------

  # usage_text() - returns the help message printed for -h / -help / --help.
  #
  # The text is assembled from a list of lines so that the indentation of
  # this source file can never leak into the message.
  sub usage_text {
      return join("\n",
          '',
          '',
          'urdu-segmenter.pl',
          '-----------------',
          'Syntax: urdu-segmenter.pl [filename]',
          'or urdu-segmenter.pl -x [filename]',
          'or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]',
          'or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]',
          'eg: more sourcefile1.txt | urdu-segmenter.pl -s Title',
          '',
          'XML tags are written if and only if the -x option is used; without it',
          'every sentence is printed on a line of its own.',
          '',
          'This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.',
          '',
          'The xml format of the output is as follows',
          '<DOC docid = "Filename" lang = "URD">',
          '<SEG id="1">Urdu Sentence 1</SEG>',
          '<SEG id="2">Urdu Sentence 2</SEG>',
          '</DOC>',
          '',
          'This script breaks urdu sentences based on the following punctuations:',
          '',
          'newline and carriage-return characters',
          'runs of two or more whitespace characters',
          '[dash]Unicode 06D4',
          '[question]Unicode 061F',
          '[ellipsis]Unicode 2026',
          '[bullet]Unicode 2022',
          '[full stop]Unicode 002E',
          '[exclamation mark]!',
          '',
          '',
          '',
      ) . "\n";
  }

  # Print the help text and stop when the first argument asks for it.
  # The defined() guard keeps the check quiet when no argument was given.
  if (defined $ARGV[0] && ($ARGV[0] =~ m/^-h$/ || $ARGV[0] =~ m/^-+help$/)) {
      print usage_text();
      exit 0;
  }


  #Code Starts here
  #
  # Command-line handling and input slurping.
  #
  # Accepted forms (option order is significant, exactly as before):
  #   urdu-segmenter.pl filename          read the file, plain output
  #   urdu-segmenter.pl -x filename       read the file, XML output
  #   ... | urdu-segmenter.pl -s [docid]      read STDIN, plain output
  #   ... | urdu-segmenter.pl -s -x [docid]   read STDIN, XML output
  #
  # After this section:
  #   $printxml  is 1 when XML tags were requested, 0 otherwise
  #   $filename  holds the document id (the docid argument as given, or the
  #              file name stripped of its directory and last extension)
  #   $_         holds the complete input text
  my @args = @ARGV;

  my $from_stdin = 0;
  if (defined $args[0] && $args[0] =~ m/^-s$/) {
      $from_stdin = 1;
      shift @args;
  }

  my $printxml = 0;
  if (defined $args[0] && $args[0] =~ m/^-x$/) {
      $printxml = 1;
      shift @args;
  }

  my $filename = $args[0];

  if ($from_stdin) {
      # The docid is optional in STDIN mode; fall back to an empty id.
      $filename = '' unless defined $filename;

      # "local $/" switches to slurp mode for this one read only.
      $_ = do { local $/; <STDIN> };
  }
  else {
      die "No input file given. Run urdu-segmenter.pl -h for help.\n"
          unless defined $filename && length $filename;

      open(my $in, "<:utf8", $filename)    #Open the file passed, or exit upon error
          or die "Cannot open file $filename: $!";
      $_ = do { local $/; <$in> };         #Read the entire file at once.
      close($in);

      $filename =~ s/.*\///;               #Clean up the filename by removing the path
      $filename =~ s/\.[^\.]*$//;          #and the extension.
  }

  # An empty input yields undef from the read; normalise it to an empty string.
  $_ = '' unless defined $_;

  # normalize_text($text) - prepares raw input text for sentence splitting.
  #
  # Returns a copy of $text in which
  #   * carriage returns are removed,
  #   * every newline is doubled (so each line break becomes a sentence break),
  #   * bullets (U+2022), with any surrounding whitespace, become a run of newlines,
  #   * trailing blanks on a line are collapsed and blank-only lines are emptied,
  #   * pipe characters are removed.
  # An undefined argument yields the empty string.
  sub normalize_text {
      my ($text) = @_;
      return '' unless defined $text;

      $text =~ s/\r//g;
      $text =~ s/\n/\n\n/g;

      $text =~ s/\s*\x{2022}\s*/\n\n\n\n\n/g;    #Replace bullets with sentence breaks.

      # /m lets ^ and $ match at every line, not only at the ends of the
      # whole text; without it these clean-ups never applied to multi-line input.
      $text =~ s/\t* +\t*$/ /mg;
      $text =~ s/[\n\x{000D}][ ]+[\n\x{000D}]/\n\n/g;    #Remove lines with
      $text =~ s/^[\t\x{0020}]+$/\n\n/mg;                #spaces only.

      # An unescaped "|" in a pattern is an alternation of two empty patterns
      # and deletes nothing; tr/// treats it as a literal character.
      $text =~ tr/|//d;                                  #Remove pipe character from files.

      return $text;
  }

  $_ = normalize_text($_);

  if ($printxml) {
      print "<DOC docid = \"$filename\" lang = \"URD\">\n";
  }


  # segment_text($text) - splits normalised text into sentences.
  #
  # The text is split at: two or more newlines, "!", U+061F (question mark),
  # U+06D4 (full stop/dash), U+2022 (bullet), carriage return, two or more
  # whitespace characters, U+2026 (ellipsis) and "." (U+002E).
  #
  # Returns the list of sentences in input order.  Each sentence is trimmed
  # and followed by the punctuation mark that ended it; whitespace and
  # bullet delimiters are dropped rather than appended.  Candidates of three
  # characters or fewer after trimming are discarded together with their
  # delimiter.  An undefined argument yields an empty list.
  sub segment_text {
      my ($text) = @_;
      return () unless defined $text;

      # Because the delimiter pattern is captured, split returns the pieces
      # alternately: sentence at even indexes, its delimiter right after it.
      my @pieces = split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/, $text);

      my @segments;
      for (my $k = 0; $k < @pieces; $k += 2) {
          my $sentence = $pieces[$k];
          $sentence =~ s/^\s+//;    #Trim white space at the start
          $sentence =~ s/\s+$//;    #and at the end (also across embedded newlines).

          next if length($sentence) <= 3;    #Too short to be a sentence: discard it.

          # The last sentence may have no delimiter at all.  Delimiters made of
          # white space (newlines, carriage returns, runs of blanks) or bullets
          # only mark the break and are not part of the sentence.
          my $delimiter = $pieces[$k + 1];
          $delimiter = '' if !defined $delimiter || $delimiter =~ m/[\s\x{2022}]/;

          push @segments, $sentence . $delimiter;
      }
      return @segments;
  }

  my $segment_id = 1;    #Segment counter used for the SEG id attribute.
  for my $segment (segment_text($_)) {
      if ($printxml) {
          print "<SEG id=\"$segment_id\">$segment</SEG>\n";
      } else {
          print "$segment\n";
      }
      $segment_id++;
  }

  if ($printxml) {
      print "</DOC>\n";    #Close DOC tag.
  }

  # No explicit close here: a bare "close" acts on the currently selected
  # output handle (STDOUT), not on the input file, and all handles are
  # flushed and closed when the program exits.
Add Comment
Please, Sign In to add comment