Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
use strict;
use warnings;
use LWP::Simple;
use JSON qw( decode_json );
use HTML::Strip;
use WWW::Mechanize;
use experimental 'smartmatch';

# ---------------------------------------------------------------------------
# Command line:
#   INDEX_NAME START_URL EXCLUDE_FILE [MAX_DEPTH [DIR]]
#
#   INDEX_NAME    base name of the index file that is written at the end
#   START_URL     https://en.wikipedia.org/wiki/<Title> page to start from
#   EXCLUDE_FILE  file with one word per line; these words are not indexed
#   MAX_DEPTH     how many link levels to follow (default 0: start page only)
#   DIR           accepted and stored in $dir
# ---------------------------------------------------------------------------

# get the length of command line arguments
my $param_length = scalar(@ARGV);

# check number of compulsory arguments
if ($param_length < 3) {
    print("Insufficient arguments!\n");
    exit;
}

# get command line arguments
my $index_name   = $ARGV[0];
my $start_url    = $ARGV[1];
my $exclude_file = $ARGV[2];

# get optional arguments; ">=" (not "== 4" / "== 5") so that surplus
# arguments do not silently discard the depth that was asked for
my $max_depth = 0;
my $dir;
if ($param_length >= 4) {
    $max_depth = $ARGV[3];
}
if ($param_length >= 5) {
    $dir = $ARGV[4];
}

# the depth drives numeric comparisons later on, so insist on a plain
# non-negative integer up front
if ($max_depth !~ /\A\d+\z/) {
    die("Invalid max depth !\n");
}

# get list of excluded words from the file named on the command line
open(my $exclude_fh, '<', $exclude_file)
    or die("Cannot open exclude file '$exclude_file': $!\n");
my @exclude = <$exclude_fh>;
close($exclude_fh);

# strip line endings (LF or CRLF) and trailing blanks; without this every
# entry keeps its newline and can never equal a word taken from a page
s/\s+\z// for @exclude;
@exclude = grep { length($_) } @exclude;

# check that the url STARTS with the expected prefix; everything after the
# prefix is the page title used for the API queries
my $wiki_prefix = "https://en.wikipedia.org/wiki/";
my $keyword;
if (index($start_url, $wiki_prefix) != 0) {
    die("Invalid Wikipedia url !\n");
}
else {
    $keyword = substr($start_url, length($wiki_prefix));
}

# file-level variables shared with get_extract; @extract_array holds the
# words of the most recently fetched page
my $extract;
my $extract_decoded;
my $page_ID;
my $extract_str;
my $Striper;
my @extract_array;
# get_extract
#
# Fetches the article body of the page named by the file-level $keyword
# (MediaWiki API, action=query, prop=extracts) and stores its words in the
# file-level @extract_array.
#
# Takes no arguments and returns nothing.  @extract_array is emptied first,
# so it is left empty whenever the request fails, the response cannot be
# decoded, or the page carries no extract.
sub get_extract {
    # never let the words of a previously fetched page leak into this one
    @extract_array = ();

    print("$keyword\n");

    my $response = get("https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&titles=" . $keyword);
    return unless defined($response);    # LWP::Simple::get returns undef on failure

    # decode_json dies on malformed input; treat that like a failed fetch
    my $decoded = eval { decode_json($response) };
    return unless ref($decoded) eq 'HASH';

    my $query = $decoded->{'query'};
    my $pages = ref($query) eq 'HASH' ? $query->{'pages'} : undef;
    return unless ref($pages) eq 'HASH';

    # the single requested page is stored under its page id; "keys" needs a
    # real hash (calling it on a reference is a syntax error since Perl 5.24)
    my ($page_id) = keys(%{$pages});
    return unless defined($page_id) && ref($pages->{$page_id}) eq 'HASH';

    my $text = $pages->{$page_id}{'extract'};
    return unless defined($text);

    # remove html tags FIRST: deleting punctuation beforehand would also
    # delete the '<', '>' and '/' that delimit the tags, leaving tag names
    # behind as ordinary words
    my $stripper = HTML::Strip->new();
    $text = $stripper->parse($text);
    $stripper->eof;

    # remove punctuation
    $text =~ s/[[:punct:]]//g;

    # split on runs of whitespace
    @extract_array = split(" ", $text);
    return;
}
# queue of full article urls waiting to be crawled; filled by get_links and
# consumed by the crawl loop
my @links_to_visit;
# titles (spaces replaced by underscores) of every link found so far
my @urls;
# declared by the original script; nothing in this file reads them
my $links_json;
my $links_decoded;
my @number_of_links_array;
my $number_of_links;
my $j;
my $url_to_insert;
my @tmp;

# get_links
#
# Asks the MediaWiki API (action=query, prop=links, pllimit=3) for the links
# on the page named by the file-level $keyword.  For every link in
# namespace 0 it
#   - pushes the title (spaces => underscores) onto @urls, and
#   - pushes the full "https://en.wikipedia.org/wiki/<title>" url onto
#     @links_to_visit, the queue the crawl loop reads.
#
# Takes no arguments and returns nothing.  A failed request, an undecodable
# response or a page without links leaves both arrays untouched.
sub get_links {
    my $url_prefix = "https://en.wikipedia.org/wiki/";

    my $response = get("https://en.wikipedia.org/w/api.php?format=json&action=query&prop=links&pllimit=3&titles=" . $keyword);
    return unless defined($response);    # LWP::Simple::get returns undef on failure

    # decode_json dies on malformed input; treat that like a failed fetch
    my $decoded = eval { decode_json($response) };
    return unless ref($decoded) eq 'HASH';

    my $query = $decoded->{'query'};
    my $pages = ref($query) eq 'HASH' ? $query->{'pages'} : undef;
    return unless ref($pages) eq 'HASH';

    # "keys" needs a real hash (calling it on a reference is a syntax error
    # since Perl 5.24)
    my ($page_id) = keys(%{$pages});
    return unless defined($page_id) && ref($pages->{$page_id}) eq 'HASH';

    # a page without links has no 'links' entry at all
    my $links = $pages->{$page_id}{'links'};
    return unless ref($links) eq 'ARRAY';

    foreach my $link (@{$links}) {
        next unless defined($link->{"ns"}) && defined($link->{"title"});
        # get all relevant wiki urls not common ones like /Wikipedia:Stub
        next unless $link->{"ns"} eq "0";

        # add underscore to space for wiki url
        $link->{"title"} =~ s/ /_/g;
        push(@urls, $link->{"title"});

        # Percent-encode the title so that characters such as '&', '#', '+'
        # or non-ASCII letters cannot corrupt the url or the API query
        # string the title is later appended to.
        my $escaped = $link->{"title"};
        utf8::encode($escaped);
        $escaped =~ s{([^A-Za-z0-9_.~()!*,:;/-])}{sprintf("%%%02X", ord($1))}ge;

        push(@links_to_visit, $url_prefix . $escaped);
    }
    return;
}
# ---------------------------------------------------------------------------
# Crawl the pages breadth-first and build the index  word => urls
# ---------------------------------------------------------------------------

# hash to store word => [links], links kept in the order they were crawled
my %hash;
# word => { link => 1 }: lets a link be recorded at most once per word
# without scanning the list every time
my %indexed;
# lookup table of the excluded words; trailing whitespace is ignored so that
# entries read straight from a file (newline still attached) match as well
my %is_excluded;
foreach my $entry (@exclude) {
    my $word = $entry;
    $word =~ s/\s+\z//;
    $is_excluded{$word} = 1;
}

# index_page($url)
#
# Records $url under every non-excluded word currently held in
# @extract_array.  Returns nothing.
sub index_page {
    my ($url) = @_;
    foreach my $item (@extract_array) {
        next if exists($is_excluded{$item});
        next if $indexed{$item}{$url}++;    # already listed for this word
        push(@{ $hash{$item} }, $url);
    }
    return;
}

# put the words found in the start page and that link to hash
@extract_array = ();    # a page without an extract must contribute no words
get_extract();
index_page($start_url);

# seed the queue with the links of the start page
if ($max_depth > 0) {
    get_links();
}

for (my $i = 0; $i < $max_depth; $i++) {
    # everything queued right now belongs to the current level
    my $count = scalar(@links_to_visit);
    print("count: $count\n");
    while ($count > 0) {
        # Take from the FRONT of the queue: get_links appends the next
        # level to the back, so taking from the back would consume those
        # deeper links as if they belonged to the current level.
        my $current_link = shift(@links_to_visit);
        $count--;

        # queue entries are full urls; 30 is the length of
        # "https://en.wikipedia.org/wiki/"
        $keyword = substr($current_link, 30);

        @extract_array = ();
        get_extract();

        # e.g. max_depth = 2, we want to perform get_links once for the
        # first-level links (not the start link!)
        # Iteration 1: i = 0, max_depth - i = 2 > 1, perform get_links
        # Iteration 2: i = 1, max_depth - i = 1 = 1, not perform get_links
        if (($max_depth - $i) > 1) {
            get_links();
        }

        # add the link containing the word to hash
        index_page($current_link);
    }
}

# NOTE(review): the optional DIR argument ($dir) is parsed but not used when
# choosing where the index is written - confirm whether it should be.
# Write one line per word: "word, link1, link2, " (keys sorted so that the
# output is reproducible between runs).
my $index_file = "$index_name.txt";
open(my $index_fh, '>', $index_file)
    or die("Cannot write index file '$index_file': $!\n");
foreach my $k (sort(keys(%hash))) {
    print {$index_fh} "$k, ";
    foreach my $subk (@{ $hash{$k} }) {
        print {$index_fh} "$subk, ";
    }
    print {$index_fh} "\n";
}
# buffered write errors only surface at close
close($index_fh)
    or die("Cannot finish writing index file '$index_file': $!\n");
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement