Pastebin launched a little side project called VERYVIRAL.com, check it out ;-) Want more features on Pastebin? Sign Up, it's FREE!
Guest

gnu

By: a guest on Sep 5th, 2010  |  syntax: None  |  size: 3.90 KB  |  views: 55  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #!/usr/bin/perl
  2. # adapted from Archdocumentalist
  3. # Copyright (C) 2010  Francois Boulogne <fboulogne at april dot org>
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License along
  16. # with this program; if not, write to the Free Software Foundation, Inc.,
  17. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18.  
  19.  
  20.  
  21. use warnings;
  22. use strict;
  23.  
  24. my $LANGUAGE="fr";
  25. my $VERSION="mdv";
  26.  
  27. sub usage
  28. {
  29.         print "Usage: archdocumentalist.pl PATH\nwhere\n\t" ;
  30.         print "\tPATH is the output path\n";
  31. }
  32.  
  33. if ($#ARGV!=0) # 1 = 2 args
  34. {
  35.         usage();
  36.         exit(0);
  37. }
  38.  
  39.  
  40. #Path
  41. my $PATH=$ARGV[0]; #Declare before use LWP::Simple to avoid errors
  42. unless ($PATH=~m/.*\/$/) {$PATH.='/';} #Complete the path with a / if needed
  43.  
  44. use Encode;
  45. use JSON::XS;
  46. use LWP::Simple;
  47.  
  48.  
  49.  
  50. my $DATADIR=$PATH."mdv-wiki-".$LANGUAGE."/"; #Directory for data
  51. mkdir $DATADIR;
  52. mkdir $DATADIR."pictures/";
  53. my $indexfile=$DATADIR."index.html"; #index file
  54.  
  55. #Start the index page
  56. open (INDEX,">:utf8",$indexfile) or die "cannot open index.html";
  57. print INDEX "<HTML><HEAD> Mandriva wiki ".$LANGUAGE." </HEAD><BODY>\n";
  58. close(INDEX);
  59.  
  60. my $from = "";
  61. my $count = 0;
  62. use constant TITLE => $from;
  63.  
  64.  
  65. print "Download pages... it might take a while.\n";
  66.  
  67. #loop on different pages. Stop when $count==1.
  68. while()
  69. {
  70.         $count=0;
  71.  
  72.         my $text= get("http://wiki.mandriva.com/". $LANGUAGE ."/api.php?action=query&list=allpages&aplimit=500&format=json&apfilterredir=nonredirects&apfrom=$from");
  73.         my $ret = JSON::XS->new->utf8->decode($text);
  74.         my $elements = $ret->{query}->{allpages};
  75.  
  76.         #loop on all elements of the current page($from)
  77.         foreach (@$elements)
  78.         {
  79.                 my $title=encode("utf8","$_->{title}");
  80.                 $from=$title; #Do not modify this variable. No perl module for constant in extra/community...
  81.                 print $title."\n";
  82.                 #Detect the language of the current page
  83.                 my $page_lang=$title;
  84.                 my $index_entry = $title;
  85.                
  86.                         #Save the page if language is OK.              
  87.                                 #Download the wiki page
  88.                                 my $link="http://wiki.mandriva.com/". $LANGUAGE ."/index.php?title=".$title ."&printable=yes";
  89.                                 my $doc = get($link); #Download the page
  90.                                
  91.                                 if (defined $doc)
  92.                                 {
  93.                                         #download pictures
  94.                                         my @docarray = split( '\n', $doc);     
  95.                                         my @lines = grep (/$LANGUAGE\/uploads\//, @docarray);
  96.                                         foreach (@lines)
  97.                                         {
  98.                                                 $_=~s/.*($LANGUAGE\/uploads.*(png|jpeg|jpg)).*/$1/;
  99.                                                 my $picname = $_;
  100.                                                 $picname=~s/.+\/(.+)$/$1/;
  101.                                                 print $picname."\n";
  102.                                                 getstore("http://wiki.mandriva.com/".$_,$DATADIR."pictures/".$picname)  ;
  103.                                         }
  104.  
  105.  
  106.                                         #modify html file
  107.                                         $doc=~s/href\=\"\/fr\/Fi.+\:(.*)\"(.*)src\=\".*\"/href\=\"pictures\/$1\"$2src\=\"pictures\/$1\"/g;
  108.  
  109.                                         #Save the page
  110.                                         my $fname=$DATADIR.$_->{pageid}.'.html';
  111.                                         open (FILE,">:utf8",$fname) or die "cannot open file $fname";
  112.                                         print FILE $doc;
  113.                                         close(FILE);
  114.  
  115.                                         open (INDEX,">>:utf8",$indexfile) or die "cannot open index.html";
  116.                                         print INDEX "<P><A HREF=\'".$_->{pageid}.".html\'>".$index_entry."</A>\n";
  117.                                         close(INDEX);
  118.  
  119.                                 }
  120.                 $count++;
  121.         }
  122.         last if($count == 1) #end of while loop
  123. }
  124.  
  125. #Finish the index page
  126. open (INDEX,">>:utf8",$indexfile) or die "cannot open index.html";
  127. print INDEX "</BODY></HTML>";
  128. close(INDEX);
  129.  
  130. #move bad named pictures
  131. opendir(MY_DIR,$DATADIR."pictures/" ) or die "error reading the directory : $!";
  132. my @all_files = grep /^\d+px-.*$/, readdir MY_DIR;
  133. closedir MY_DIR;
  134.  
  135. foreach (@all_files)
  136. {
  137.         my $new=$_;
  138.                 $new=~s/^\d+px-(.*)/$1/;
  139.                 rename $_,$new;
  140. }
  141.  
  142.  
  143.  
  144.  
  145.  
  146. print "Done.\nDocumentation generated in ".$DATADIR."\n";
  147. exit(0);
clone this paste RAW Paste Data