#!/usr/bin/perl
# adapted from Archdocumentalist
# Copyright (C) 2010 Francois Boulogne <fboulogne at april dot org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
use warnings;
use strict;
# Script-wide settings.
#   $LANGUAGE - wiki language code; used in every wiki URL and in the name of
#               the output directory.
#   $VERSION  - short tag; not referenced anywhere else in this script.
my ($LANGUAGE, $VERSION) = ("fr", "mdv");
# Print a short command-line synopsis on standard output.
# Takes no arguments.
sub usage
{
    my @synopsis = (
        "Usage: archdocumentalist.pl PATH\nwhere\n\t",
        "\tPATH is the output path\n",
    );
    print @synopsis;
}
# Exactly one argument (the output path) is expected. $#ARGV is the index of
# the last argument, so 0 means "one argument".
if ($#ARGV != 0)
{
    usage();
    exit(1); # a wrong invocation is an error: report failure to the caller
}
# Output path given on the command line. It is declared at file level because
# the rest of the script builds the data directory name from it.
my $PATH = $ARGV[0];
# Complete the path with a trailing slash so names can be appended directly.
$PATH .= '/' unless $PATH =~ m/\/$/;
use Encode;
use JSON::XS;
use LWP::Simple;
# Directory that receives the saved pages; pictures go into a subdirectory.
my $DATADIR = $PATH."mdv-wiki-".$LANGUAGE."/";
foreach my $dir ($DATADIR, $DATADIR."pictures/")
{
    next if -d $dir; # reuse a directory left by a previous run
    # Fail here, with the system error, instead of later at the first open().
    mkdir $dir or die "cannot create directory $dir: $!";
}
my $indexfile = $DATADIR."index.html"; # index file listing every saved page
# Start the index page. Opening with ">" truncates an index from an earlier
# run; the entries and the closing tags are appended later.
open (my $index_fh, ">:utf8", $indexfile) or die "cannot open $indexfile: $!";
print $index_fh "<HTML><HEAD> Mandriva wiki ".$LANGUAGE." </HEAD><BODY>\n";
close($index_fh) or die "cannot close $indexfile: $!";
# Percent-encode a page title for use as a query-string value.
# The title (a character string) is converted to UTF-8 bytes, then every byte
# that is not an RFC 3986 "unreserved" character is written as %XX, so that
# characters such as '&', '+', '#', '%' or a space cannot alter the request.
# Returns the encoded byte string.
sub _escape_query_value
{
    my ($value) = @_;
    my $bytes = encode("utf8", $value);
    $bytes =~ s/([^A-Za-z0-9\-._~])/sprintf("%%%02X", ord($1))/ge;
    return $bytes;
}

my $from = "";  # title (character string) the next listing request starts at
my $count = 0;  # number of entries returned by the most recent listing
my %seen;       # page ids that have already been handled
print "Download pages... it might take a while.\n";
# Walk the page listing in batches. Each request starts at the last title of
# the previous batch, so that page comes back again as the first entry; %seen
# keeps it from being downloaded and indexed twice. The listing is exhausted
# when a batch holds at most one entry.
while (1)
{
    my $list_url = "http://wiki.mandriva.com/". $LANGUAGE ."/api.php?action=query&list=allpages&aplimit=500&format=json&apfilterredir=nonredirects&apfrom="._escape_query_value($from);
    my $text = get($list_url);
    die "cannot download the page list from $list_url" unless defined $text;
    my $ret = JSON::XS->new->utf8->decode($text);
    my $elements = $ret->{query}->{allpages} || [];
    $count = scalar @$elements;
    my $fresh = 0; # entries of this batch that were not seen before

    # Loop on all entries of the current batch.
    foreach my $page (@$elements)
    {
        my $title = $page->{title}; # character string decoded from the JSON
        $from = $title;             # the next request resumes from here
        next if $seen{$page->{pageid}}++;
        $fresh++;
        print encode("utf8", $title)."\n";

        # Download the printable version of the wiki page.
        my $link = "http://wiki.mandriva.com/". $LANGUAGE ."/index.php?title="._escape_query_value($title)."&printable=yes";
        my $doc = get($link);
        next unless defined $doc; # page could not be fetched: skip it

        # Download the pictures referenced by the page (one per HTML line).
        foreach my $line (split(/\n/, $doc))
        {
            next unless $line =~ m/$LANGUAGE\/uploads\//;
            # Lines without a png/jpeg/jpg path are skipped instead of being
            # used verbatim as a URL and a file name.
            next unless $line =~ m/.*($LANGUAGE\/uploads.*(?:png|jpeg|jpg))/;
            my $pic_path = $1;
            (my $picname = $pic_path) =~ s/.+\/(.+)$/$1/; # keep the last path segment
            print $picname."\n";
            getstore("http://wiki.mandriva.com/".$pic_path, $DATADIR."pictures/".$picname);
        }

        # Point picture links and image sources at the local pictures/ directory.
        $doc=~s/href\=\"\/fr\/Fi.+\:(.*)\"(.*)src\=\".*\"/href\=\"pictures\/$1\"$2src\=\"pictures\/$1\"/g;

        # Save the page under its numeric page id.
        my $fname = $DATADIR.$page->{pageid}.'.html';
        open (my $page_fh, ">:utf8", $fname) or die "cannot open file $fname: $!";
        print $page_fh $doc;
        close($page_fh) or die "cannot close file $fname: $!";

        # Append the page to the index. The handle encodes to UTF-8 itself, so
        # the entry must be the character string, not pre-encoded bytes.
        my $index_entry = $title;
        $index_entry =~ s/&/&amp;/g;
        $index_entry =~ s/</&lt;/g;
        $index_entry =~ s/>/&gt;/g;
        open (my $index_fh, ">>:utf8", $indexfile) or die "cannot open $indexfile: $!";
        print $index_fh "<P><A HREF=\'".$page->{pageid}.".html\'>".$index_entry."</A>\n";
        close($index_fh) or die "cannot close $indexfile: $!";
    }

    # Stop at the end of the listing (empty batch, or only the already-seen
    # starting page), and also when a batch brought nothing new, which would
    # otherwise repeat the same request forever.
    last if $count <= 1 or $fresh == 0;
}
# Complete the index page: append the closing tags to the file that the
# earlier steps have been filling.
{
    open (my $index_out, ">>:utf8", $indexfile) or die "cannot open index.html";
    print {$index_out} "</BODY></HTML>";
    close($index_out);
}
# Rename badly named pictures: a file stored as "<digits>px-<name>" is renamed
# to "<name>".
my $picdir = $DATADIR."pictures/";
opendir(my $pic_dh, $picdir) or die "error reading the directory : $!";
my @all_files = grep { /^\d+px-.*$/ } readdir $pic_dh;
closedir $pic_dh;
foreach my $old (@all_files)
{
    (my $new = $old) =~ s/^\d+px-(.*)/$1/;
    next if $new eq ''; # nothing left after the prefix: keep the file as it is
    # readdir returns bare file names, so both names need the directory
    # prefix; without it rename would look in the current working directory.
    rename $picdir.$old, $picdir.$new
        or warn "cannot rename $picdir$old to $picdir$new: $!\n";
}
print "Done.\nDocumentation generated in ".$DATADIR."\n";
exit(0);