# Untitled

#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use JSON qw( decode_json );
use HTML::Strip;
use WWW::Mechanize;
use experimental 'smartmatch';

# Command-line interface:
#   <index_name> <start_url> <exclude_file> [max_depth [dir]]
# The first three arguments are compulsory. max_depth defaults to 0 and
# dir stays undefined when omitted.

# get the length of command line arguments
my $param_length = scalar(@ARGV);
# print "length: $param_length\n";

# check number of compulsory arguments
if ($param_length < 3){
	print STDERR "Insufficient arguments!\n";
	# non-zero status so that a calling shell/script can detect the failure
	exit 1;
}

# get command line arguments
my $index_name = $ARGV[0];
my $start_url = $ARGV[1];
my $exclude_file = $ARGV[2];

# get optional arguments
my $max_depth = 0;
# NOTE(review): $dir is only assigned in the visible code and never read;
# presumably it was meant to be an output directory -- confirm.
my $dir;

# Counts are compared numerically, and with >= rather than equality so that
# extra trailing arguments do not cause max_depth/dir to be silently ignored.
if ($param_length >= 4){
	$max_depth = $ARGV[3];
}
if ($param_length >= 5){
	$dir = $ARGV[4];
}

# max_depth is later used in numeric comparisons and as a loop bound, so
# reject anything that is not a plain non-negative integer up front.
if ($max_depth !~ /\A[0-9]+\z/){
	print STDERR "Invalid max depth: $max_depth\n";
	exit 1;
}


# get list of excluded words (one per line) from the file named by the
# third command line argument; a hard-coded "EXCLUDE.txt" would ignore it
open(my $exclude_fh, '<:encoding(UTF-8)', $exclude_file)
	or die("Cannot open exclude file '$exclude_file': $!\n");
my @exclude = <$exclude_fh>;
close($exclude_fh);

# Strip the line endings: with the newline left on, no word split out of an
# article could ever compare equal to an entry of @exclude.
s/\r?\n\z// for @exclude;

# Blank lines carry no word to exclude.
@exclude = grep { length($_) } @exclude;


# $keyword holds the page title part of the URL (everything after the
# "/wiki/" prefix); it is the value the API requests are built from.
my $keyword;
{
	my $prefix = "https://en.wikipedia.org/wiki/";

	# check if the prefix is correct: it has to sit at offset 0, because the
	# title is cut out by position. Testing only for "found somewhere"
	# (index != -1) would accept URLs whose title then comes out wrong.
	if (index($start_url, $prefix) != 0){
		die ("Invalid Wikipedia url !\n");
	}

	$keyword = substr($start_url, length($prefix));

	# A URL consisting of the prefix alone names no page.
	if ($keyword eq ""){
		die ("Invalid Wikipedia url !\n");
	}
}


# File-level state written by get_extract (and $page_ID by get_links too).
my $extract;            # raw API response body
my $extract_decoded;    # decoded JSON response
my $page_ID;            # key of the first entry under query/pages
my $extract_str;        # article extract, cleaned in place
my $Striper;            # HTML::Strip instance used for the current page
my @extract_array;      # result: the words of the current page

# get_extract()
#
# Fetches the extract of the page named by the file-level $keyword from the
# Wikipedia API and leaves its words in the file-level @extract_array.
# Takes no arguments and returns nothing. On any failure (request failed,
# response is not the expected JSON, page has no extract) a warning is
# issued where appropriate and @extract_array is left empty.
sub get_extract{
	print("$keyword\n");

	# Always start from an empty word list, otherwise a page without an
	# extract would leave the previous page's words behind and the caller
	# would index them under the wrong link.
	@extract_array = ();

	$extract = get ( "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&titles=".$keyword);
	if(!defined($extract)){
		warn("Could not fetch extract for '$keyword'\n");
		return;
	}

	# decode_json dies on malformed input; one bad response should not
	# abort the whole run.
	$extract_decoded = eval { decode_json($extract) };
	if(ref($extract_decoded) ne 'HASH'){
		warn("Unexpected API response for '$keyword'\n");
		return;
	}

	my $pages = $extract_decoded->{'query'}{'pages'};
	if(ref($pages) ne 'HASH' || !%{$pages}){
		return;
	}

	# Explicit dereference: calling keys directly on a hash reference is no
	# longer accepted by current perls.
	$page_ID = (keys %{$pages})[0];
	$extract_str = $pages->{$page_ID}{'extract'};

	if(defined($extract_str)){

		# remove html tags -- this has to happen before the punctuation is
		# removed, because "<", ">" and "/" are punctuation themselves and
		# without them the tags can no longer be recognised
		$Striper = HTML::Strip -> new();
		$extract_str = $Striper->parse($extract_str);
		$Striper->eof;

		# remove punctuation
		$extract_str =~ s/[[:punct:]]//g;

		# split by space
		@extract_array = split(" ", $extract_str);
	}

	return;
}

# File-level state around link collection.
my @links_to_visit;     # queue of full article URLs still to be crawled
my $links_json;         # raw API response body
my $links_decoded;      # decoded JSON response
my @number_of_links_array;
my $number_of_links;
my $j;
my $url_to_insert;
my @urls;               # titles (with underscores) of all links seen so far

my @tmp;

# get_links()
#
# Asks the Wikipedia API for the links of the page named by the file-level
# $keyword and appends every link in namespace 0 to @links_to_visit as a
# full "https://en.wikipedia.org/wiki/<Title>" URL, which is the form the
# crawl loop expects (it cuts the title out again by position). The bare
# title is also still appended to @urls.
# Takes no arguments and returns nothing. A failed request, an unexpected
# response or a page without links adds nothing.
sub get_links{
	$links_json = get("https://en.wikipedia.org/w/api.php?format=json&action=query&prop=links&pllimit=3&titles=".$keyword);
	if(!defined($links_json)){
		warn("Could not fetch links for '$keyword'\n");
		return;
	}

	# decode_json dies on malformed input; one bad response should not
	# abort the whole run.
	$links_decoded = eval { decode_json($links_json) };
	if(ref($links_decoded) ne 'HASH'){
		warn("Unexpected API response for '$keyword'\n");
		return;
	}

	my $pages = $links_decoded->{'query'}{'pages'};
	if(ref($pages) ne 'HASH' || !%{$pages}){
		return;
	}

	# Explicit dereference: calling keys directly on a hash reference is no
	# longer accepted by current perls.
	$page_ID = (keys %{$pages})[0];

	# The "links" entry may be absent; dereferencing undef would be fatal.
	my $links = $pages->{$page_ID}{'links'};
	if(ref($links) ne 'ARRAY'){
		return;
	}

	foreach my $link (@{$links}) {
		# get all relevant wiki urls not common ones like /Wikipedia:Stub
		next unless defined($link->{"ns"}) && $link->{"ns"} == 0;
		next unless defined($link->{"title"});

		$link->{"title"} =~ s/ /_/g; #add underscore to space for wiki url
		push(@urls, $link->{"title"});
		push(@links_to_visit, "https://en.wikipedia.org/wiki/".$link->{"title"});
	}

	return;
}

# Fetch the words of the start page into @extract_array.
get_extract();

#get_links;

# hash to store word=>links
my %hash;

# Lookup table of the excluded words. Any line ending left on an entry of
# @exclude is stripped, since the words taken from an article never carry one.
my %is_excluded;
foreach my $entry (@exclude){
	(my $word = $entry) =~ s/\r?\n\z//;
	$is_excluded{$word} = 1;
}

# Remembers which (word, link) pairs are already in %hash so that a link is
# listed at most once per word.
my %is_indexed;

# $index_page->($url)
# Adds $url to the link list of every non-excluded word currently held in
# @extract_array.
my $index_page = sub {
	my ($url) = @_;
	foreach my $item (@extract_array){
		next if $is_excluded{$item};
		next if $is_indexed{$item}{$url}++;
		push(@{$hash{$item}}, $url);
		#print("@{$hash{$item}}\n");
	}
	return;
};

# put the word found in the start page and that link to hash
$index_page->($start_url);

if($max_depth > 0){
	get_links();
}

my $current_link;

# Links already fetched; the start page counts as fetched. A page is reached
# first at its shallowest level, so skipping later occurrences saves
# requests without changing which pages end up in the index.
my %visited = ($start_url => 1);

# Number of characters in front of the title in a queued link.
my $title_offset = length("https://en.wikipedia.org/wiki/");

for(my $i=0;$i<$max_depth;$i++){
	# Only the links queued right now belong to the current level; whatever
	# get_links appends below belongs to the next one.
	my $count = scalar(@links_to_visit);
	print("count: $count\n");
	while($count>0){
		# Take from the front: get_links appends to the end, so taking from
		# the end would pick up next-level links instead of the current ones.
		$current_link = shift(@links_to_visit);
		$count--;
		next if $visited{$current_link}++;

		$keyword = substr($current_link, $title_offset);
		get_extract();
		# e.g. max_depth =2, we want to perform get links once for the first-level links (not the start link!)
		# Iteration 1: i = 0, max_depth - i = 2 > 1, perform get_links
		# Iteration 2: i = 1, max depth - i = 1 = 1, not perform get_links
		if(($max_depth - $i) > 1){
			get_links();
		}

		# add the link containing the word to hash
		$index_page->($current_link);
	}
}


# Write the index to "<index_name>.txt": one line per word, consisting of
# the word followed by its links, each item terminated by ", ".
# The file name is passed separately from the mode so that characters in
# $index_name cannot change how the file is opened, and the words (decoded
# character data) are written as UTF-8.
open(my $index_fh, '>:encoding(UTF-8)', "$index_name.txt")
	or die("Cannot open '$index_name.txt' for writing: $!\n");

# Sorted so that the same index always produces the same file.
foreach my $k (sort keys %hash){
	print {$index_fh} join("", map { "$_, " } $k, @{$hash{$k}}), "\n";
}

# Buffered write errors only surface when the handle is closed.
close($index_fh)
	or die("Cannot write '$index_name.txt': $!\n");