Advertisement
Guest User

Untitled

a guest
May 30th, 2017
213
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.13 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
  4. # letters (a-z, converted from A-Z), and spaces (never consecutive).  
  5. # All other characters are converted to spaces.  Only text which normally appears
  6. # in the web browser is displayed.  Tables are removed.  Image captions are
  7. # preserved.  Links are converted to normal text.  Digits are spelled out.
  8.  
  9. # Written by Matt Mahoney, June 10, 2006.  This program is released to the public domain.
  10.  
  11. use utf8;
  12.  
  13. $/=">";                     # input record separator
  14. while (<>) {
  15.   if (/<text /) {$text=1;}  # remove all but between <text> ... </text>
  16.   if (/#redirect/i) {$text=0;}  # remove #REDIRECT
  17.   if ($text) {
  18.  
  19.     # Remove any text not normally visible
  20.     if (/<\/text>/) {$text=0;}
  21.     s/<.*>//;               # remove xml tags
  22.     s/&amp;/&/g;            # decode URL encoded chars
  23.     s/&lt;/</g;
  24.     s/&gt;/>/g;
  25.     s/<ref[^<]*<\/ref>//g;  # remove references <ref...> ... </ref>
  26.     s/<[^>]*>//g;           # remove xhtml tags
  27.     s/\[http:[^] ]*/[/g;    # remove normal url, preserve visible text
  28.     s/\|thumb//ig;          # remove images links, preserve caption
  29.     s/\|left//ig;
  30.     s/\|right//ig;
  31.     s/\|\d+px//ig;
  32.     s/\[\[image:[^\[\]]*\|//ig;
  33.     s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig;  # show categories without markup
  34.     s/\[\[[a-z\-]*:[^\]]*\]\]//g;  # remove links to other languages
  35.     s/\[\[[^\|\]]*\|/[[/g;  # remove wiki url, preserve visible text
  36.     s/{{[^}]*}}//g;         # remove {{icons}} and {tables}
  37.     s/{[^}]*}//g;
  38.     s/\[//g;                # remove [ and ]
  39.     s/\]//g;
  40.     s/&[^;]*;/ /g;          # remove URL encoded chars
  41.  
  42.     # convert to lowercase letters and spaces, spell digits
  43.     $_=" $_ ";
  44.     tr/QWERTYUIOPASDFGHJKLZXCVBNMĘÓĄŚŁŻŹĆŃ/qwertyuiopasdfghjklzxcvbnmęóąśłżźćń/;
  45.     s/0/ zero /g;
  46.     s/1/ one /g;
  47.     s/2/ two /g;
  48.     s/3/ three /g;
  49.     s/4/ four /g;
  50.     s/5/ five /g;
  51.     s/6/ six /g;
  52.     s/7/ seven /g;
  53.     s/8/ eight /g;
  54.     s/9/ nine /g;
  55.     tr/qwertyuiopasdfghjklzxcvbnmęóąśłżźćń/ /cs;
  56.     chop;
  57.     print $_;
  58.   }
  59. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement