Advertisement
Guest User

gnu

a guest
Sep 5th, 2010
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.90 KB | None | 0 0
  1. #!/usr/bin/perl
  2. # adapted from Archdocumentalist
  3. # Copyright (C) 2010 Francois Boulogne <fboulogne at april dot org>
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License along
  16. # with this program; if not, write to the Free Software Foundation, Inc.,
  17. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18.  
  19.  
  20.  
  21. use warnings;
  22. use strict;
  23.  
  24. my $LANGUAGE="fr";
  25. my $VERSION="mdv";
  26.  
  27. sub usage
  28. {
  29. print "Usage: archdocumentalist.pl PATH\nwhere\n\t" ;
  30. print "\tPATH is the output path\n";
  31. }
  32.  
  33. if ($#ARGV!=0) # 1 = 2 args
  34. {
  35. usage();
  36. exit(0);
  37. }
  38.  
  39.  
  40. #Path
  41. my $PATH=$ARGV[0]; #Declare before use LWP::Simple to avoid errors
  42. unless ($PATH=~m/.*\/$/) {$PATH.='/';} #Complete the path with a / if needed
  43.  
  44. use Encode;
  45. use JSON::XS;
  46. use LWP::Simple;
  47.  
  48.  
  49.  
  50. my $DATADIR=$PATH."mdv-wiki-".$LANGUAGE."/"; #Directory for data
  51. mkdir $DATADIR;
  52. mkdir $DATADIR."pictures/";
  53. my $indexfile=$DATADIR."index.html"; #index file
  54.  
  55. #Start the index page
  56. open (INDEX,">:utf8",$indexfile) or die "cannot open index.html";
  57. print INDEX "<HTML><HEAD> Mandriva wiki ".$LANGUAGE." </HEAD><BODY>\n";
  58. close(INDEX);
  59.  
  60. my $from = "";
  61. my $count = 0;
  62. use constant TITLE => $from;
  63.  
  64.  
  65. print "Download pages... it might take a while.\n";
  66.  
  67. #loop on different pages. Stop when $count==1.
  68. while()
  69. {
  70. $count=0;
  71.  
  72. my $text= get("http://wiki.mandriva.com/". $LANGUAGE ."/api.php?action=query&list=allpages&aplimit=500&format=json&apfilterredir=nonredirects&apfrom=$from");
  73. my $ret = JSON::XS->new->utf8->decode($text);
  74. my $elements = $ret->{query}->{allpages};
  75.  
  76. #loop on all elements of the current page($from)
  77. foreach (@$elements)
  78. {
  79. my $title=encode("utf8","$_->{title}");
  80. $from=$title; #Do not modify this variable. No perl module for constant in extra/community...
  81. print $title."\n";
  82. #Detect the language of the current page
  83. my $page_lang=$title;
  84. my $index_entry = $title;
  85.  
  86. #Save the page if language is OK.
  87. #Download the wiki page
  88. my $link="http://wiki.mandriva.com/". $LANGUAGE ."/index.php?title=".$title ."&printable=yes";
  89. my $doc = get($link); #Download the page
  90.  
  91. if (defined $doc)
  92. {
  93. #download pictures
  94. my @docarray = split( '\n', $doc);
  95. my @lines = grep (/$LANGUAGE\/uploads\//, @docarray);
  96. foreach (@lines)
  97. {
  98. $_=~s/.*($LANGUAGE\/uploads.*(png|jpeg|jpg)).*/$1/;
  99. my $picname = $_;
  100. $picname=~s/.+\/(.+)$/$1/;
  101. print $picname."\n";
  102. getstore("http://wiki.mandriva.com/".$_,$DATADIR."pictures/".$picname) ;
  103. }
  104.  
  105.  
  106. #modify html file
  107. $doc=~s/href\=\"\/fr\/Fi.+\:(.*)\"(.*)src\=\".*\"/href\=\"pictures\/$1\"$2src\=\"pictures\/$1\"/g;
  108.  
  109. #Save the page
  110. my $fname=$DATADIR.$_->{pageid}.'.html';
  111. open (FILE,">:utf8",$fname) or die "cannot open file $fname";
  112. print FILE $doc;
  113. close(FILE);
  114.  
  115. open (INDEX,">>:utf8",$indexfile) or die "cannot open index.html";
  116. print INDEX "<P><A HREF=\'".$_->{pageid}.".html\'>".$index_entry."</A>\n";
  117. close(INDEX);
  118.  
  119. }
  120. $count++;
  121. }
  122. last if($count == 1) #end of while loop
  123. }
  124.  
  125. #Finish the index page
  126. open (INDEX,">>:utf8",$indexfile) or die "cannot open index.html";
  127. print INDEX "</BODY></HTML>";
  128. close(INDEX);
  129.  
  130. #move bad named pictures
  131. opendir(MY_DIR,$DATADIR."pictures/" ) or die "error reading the directory : $!";
  132. my @all_files = grep /^\d+px-.*$/, readdir MY_DIR;
  133. closedir MY_DIR;
  134.  
  135. foreach (@all_files)
  136. {
  137. my $new=$_;
  138. $new=~s/^\d+px-(.*)/$1/;
  139. rename $_,$new;
  140. }
  141.  
  142.  
  143.  
  144.  
  145.  
  146. print "Done.\nDocumentation generated in ".$DATADIR."\n";
  147. exit(0);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement