Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- # adapted from Archdocumentalist
- # Copyright (C) 2010 Francois Boulogne <fboulogne at april dot org>
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License along
- # with this program; if not, write to the Free Software Foundation, Inc.,
- # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- use warnings;
- use strict;
# Wiki language code: selects the wiki's URL prefix and names the output
# directory.
my $LANGUAGE = "fr";

# Distribution tag; declared here but not referenced by the rest of the script.
my $VERSION = "mdv";
# Print a short description of the expected command line to standard output.
# Takes no arguments.
sub usage {
    my @chunks = (
        "Usage: archdocumentalist.pl PATH\nwhere\n\t",
        "\tPATH is the output path\n",
    );
    print @chunks;
}
# Exactly one argument, the output path, is mandatory.
if (@ARGV != 1) {
    usage();
    exit(1);    # a wrong invocation is an error, so report a failing status
}

# Output path; a trailing slash is appended when missing so that file names
# can be concatenated to it directly.
my $PATH = $ARGV[0];
$PATH .= '/' unless $PATH =~ m{/\z};

use Encode;
use JSON::XS;
use LWP::Simple;

# Directory receiving the generated pages, with a sub-directory for pictures.
my $DATADIR = $PATH . "mdv-wiki-" . $LANGUAGE . "/";
foreach my $dir ($DATADIR, $DATADIR . "pictures/") {
    next if -d $dir;    # writing into an already existing tree is fine
    mkdir($dir) or die "cannot create directory $dir: $!";
}

my $indexfile = $DATADIR . "index.html";

# Start the index page.  The file is written through a UTF-8 layer, so the
# header declares that charset and carries a proper title element.
{
    open(my $index_fh, '>:utf8', $indexfile)
        or die "cannot open $indexfile: $!";
    print {$index_fh} "<HTML><HEAD><META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">"
        . "<TITLE>Mandriva wiki " . $LANGUAGE . "</TITLE></HEAD><BODY>\n";
    close($index_fh) or die "cannot close $indexfile: $!";
}

# Pagination state used by the download loop:
#   $from  - title at which the next page-list request starts
#   $count - number of entries found in the most recent batch
my $from  = "";
my $count = 0;

print "Download pages... it might take a while.\n";
# Mirror every non-redirect page of the wiki.
#
# The page list is fetched from the MediaWiki API in batches of up to 500
# titles.  Each request starts at $from, the last title seen so far, so every
# batch after the first begins with a page that was already handled; a batch
# holding at most one entry therefore means the end of the list was reached.
# For each new page the printable HTML is downloaded, the pictures it
# references are stored under pictures/, picture links are pointed at those
# local copies, and the page is saved as <pageid>.html and added to the index.
{
    my $site = "http://wiki.mandriva.com/";

    # Percent-encode a byte string for use as a URL query value, so titles
    # containing spaces, '&', '#', '+' or non-ASCII bytes reach the server
    # intact.
    my $url_escape = sub {
        my ($bytes) = @_;
        $bytes =~ s/([^A-Za-z0-9\-._~])/sprintf("%%%02X", ord($1))/ge;
        return $bytes;
    };

    # Escape the characters that are special in HTML text.
    my $html_escape = sub {
        my ($text) = @_;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    # One uploaded picture URL (relative to the site root).  The match runs up
    # to the end of the surrounding attribute value so that thumbnail URLs of
    # the form ".../Name.png/300px-Name.png" are captured whole.
    my $picture_re =
        qr{(\Q$LANGUAGE\E/uploads/[^"'\s<>]*\.(?:png|jpeg|jpg))(?=["'\s<>]|\z)}i;

    my %seen_page;       # page ids already mirrored
    my %seen_picture;    # picture URLs already fetched during this run

    while (1) {
        $count = 0;

        my $text = get($site . $LANGUAGE
            . "/api.php?action=query&list=allpages&aplimit=500&format=json&apfilterredir=nonredirects&apfrom="
            . $url_escape->($from));
        defined $text
            or die "cannot download the page list starting at '$from'\n";

        my $ret      = JSON::XS->new->utf8->decode($text);
        my $elements = $ret->{query}->{allpages} || [];

        foreach my $page (@$elements) {
            $count++;

            my $pageid     = $page->{pageid};
            my $title_text = $page->{title};                 # character string
            my $title      = encode("utf8", $title_text);    # UTF-8 bytes

            # The next page-list request starts from the last title seen.
            $from = $title;

            # The first entry of a follow-up batch repeats the last entry of
            # the previous one; do not download or index it twice.
            next if $seen_page{$pageid}++;

            print $title . "\n";

            # Download the printable version of the wiki page.
            my $link = $site . $LANGUAGE . "/index.php?title="
                . $url_escape->($title) . "&printable=yes";
            my $doc = get($link);
            if (!defined $doc) {
                warn "cannot download $link\n";
                next;
            }

            # Download every picture referenced by the page.
            foreach my $picture ($doc =~ /$picture_re/g) {
                next if $seen_picture{$picture}++;

                (my $picname = $picture) =~ s{.*/}{};
                print $picname . "\n";

                my $status = getstore($site . $picture,
                                      $DATADIR . "pictures/" . $picname);
                warn "cannot download picture $picture (HTTP status $status)\n"
                    unless is_success($status);
            }

            # Point picture links and their image sources at the local copies.
            # Negated classes and a non-greedy gap keep each substitution
            # confined to a single link.
            $doc =~ s{href="/\Q$LANGUAGE\E/Fi[^":]+:([^"]*)"(.*?)src="[^"]*"}{href="pictures/$1"$2src="pictures/$1"}g;

            # Save the page.
            my $fname = $DATADIR . $pageid . '.html';
            open(my $page_fh, '>:utf8', $fname)
                or die "cannot open file $fname: $!";
            print {$page_fh} $doc;
            close($page_fh) or die "cannot close file $fname: $!";

            # Add the page to the index.  The handle has a UTF-8 layer, so the
            # character string is printed, not the already encoded bytes.
            open(my $index_fh, '>>:utf8', $indexfile)
                or die "cannot open $indexfile: $!";
            print {$index_fh} "<P><A HREF='" . $pageid . ".html'>"
                . $html_escape->($title_text) . "</A>\n";
            close($index_fh) or die "cannot close $indexfile: $!";
        }

        # An empty batch, or one holding only the repeated last title, means
        # there is nothing left to fetch.
        last if $count <= 1;
    }
}
# Finish the index page by closing the HTML document.
{
    open(my $index_fh, '>>:utf8', $indexfile)
        or die "cannot open $indexfile: $!";
    print {$index_fh} "</BODY></HTML>";
    # Buffered write errors only surface when the handle is closed.
    close($index_fh) or die "cannot close $indexfile: $!";
}
# Thumbnails are stored under names of the form "<width>px-<name>".  Strip
# that prefix so the files carry the plain picture name.
my $picture_dir = $DATADIR . "pictures/";

opendir(my $dir_handle, $picture_dir)
    or die "error reading the directory : $!";
my @all_files = grep { /^\d+px-.+/ } readdir $dir_handle;
closedir $dir_handle;

foreach my $old_name (@all_files) {
    (my $new_name = $old_name) =~ s/^\d+px-//;

    # readdir returns bare names, so both paths need the directory prefix;
    # without it the rename would be attempted in the current directory.
    rename($picture_dir . $old_name, $picture_dir . $new_name)
        or warn "cannot rename $old_name to $new_name: $!\n";
}

print "Done.\nDocumentation generated in ".$DATADIR."\n";
exit(0);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement