#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use JSON qw( decode_json );
use HTML::Strip;
use WWW::Mechanize;
use experimental 'smartmatch';

# get the number of command line arguments
my $param_length = $#ARGV + 1;
# print "length: $param_length\n";

# check that the three compulsory arguments are present
if ($param_length < 3) {
    print("Insufficient arguments!\n");
    print("usage: $0 index_name start_url exclude_file [max_depth [dir]]\n");
    exit;
}

# get command line arguments
my $index_name   = $ARGV[0];
my $start_url    = $ARGV[1];
my $exclude_file = $ARGV[2];

# get optional arguments
my $max_depth = 0;
my $dir;

if ($param_length == 4) {
    $max_depth = $ARGV[3];
} elsif ($param_length == 5) {
    $max_depth = $ARGV[3];
    $dir = $ARGV[4];
}
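
# Example invocation (a sketch; "indexer.pl" and the argument values are
# assumed names, since the paste does not give them):
#   perl indexer.pl myindex https://en.wikipedia.org/wiki/Perl exclude.txt 2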

# get the list of excluded words from the file given on the command line
open(my $exclude_fh, '<', $exclude_file) or die("Cannot open $exclude_file: $!\n");
my @exclude = <$exclude_fh>;
close($exclude_fh);
chomp(@exclude);    # strip trailing newlines so the words compare cleanly

my $keyword;
# check that the start URL has the expected Wikipedia prefix
if (index($start_url, "https://en.wikipedia.org/wiki/") == -1) {
    die("Invalid Wikipedia URL!\n");
} else {
    # the prefix is 30 characters long; everything after it is the page title
    $keyword = substr($start_url, 30);
}
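
# e.g. a $start_url of https://en.wikipedia.org/wiki/Perl yields the
# keyword "Perl", which is what the API calls below expect as a title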

my $extract;
my $extract_decoded;
my $page_ID;
my $extract_str;
my $Striper;
my @extract_array;

# fetch the main body of the article for $keyword, then reduce it to an
# array of plain words
sub get_extract {
    @extract_array = ();    # reset so a failed fetch does not reuse old words
    print("$keyword\n");
    $extract = get("https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&titles=" . $keyword);
    return unless defined $extract;    # network or API failure
    $extract_decoded = decode_json($extract);

    # the page ID is the only key under {query}{pages}
    $page_ID = (keys %{$extract_decoded->{'query'}{'pages'}})[0];
    $extract_str = $extract_decoded->{'query'}{'pages'}{$page_ID}{'extract'};

    if (defined($extract_str)) {
        # remove HTML tags first; stripping punctuation first would mangle
        # the tags before HTML::Strip gets to see them
        $Striper = HTML::Strip->new();
        $extract_str = $Striper->parse($extract_str);
        $Striper->eof;

        # remove punctuation
        $extract_str =~ s/[[:punct:]]//g;

        # split on whitespace
        @extract_array = split(" ", $extract_str);
    }
}
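
# For reference, the extracts response walked above is shaped roughly like
# this (a sketch, not a verbatim API response):
#   { "query" : { "pages" : { "<pageid>" : { "extract" : "<p>...</p>", ... } } } }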

my @links_to_visit;
my $links_json;
my $links_decoded;

# get the links referenced by the current page and push them onto
# @links_to_visit as full URLs (the crawl loop below pulls from this queue)
sub get_links {
    $links_json = get("https://en.wikipedia.org/w/api.php?format=json&action=query&prop=links&pllimit=3&titles=" . $keyword);
    return unless defined $links_json;    # network or API failure
    $links_decoded = decode_json($links_json);

    $page_ID = (keys %{$links_decoded->{'query'}{'pages'}})[0];
    my @links = @{$links_decoded->{'query'}{'pages'}{$page_ID}{'links'} // []};
    foreach my $loop_variable (@links) {
        # ns 0 keeps real article links, not pages like /Wikipedia:Stub
        if ($loop_variable->{"ns"} eq "0") {
            $loop_variable->{"title"} =~ s/ /_/g;    # wiki URLs use underscores for spaces
            push(@links_to_visit, "https://en.wikipedia.org/wiki/" . $loop_variable->{"title"});
        }
    }
}
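
# Note: pllimit=3 caps each request at three links; a fuller crawler would
# follow the API's "plcontinue" continuation token to fetch the rest.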

get_extract;

# hash to store word => list of links
my %hash;

# record each word found on the start page against the start URL
foreach my $item (@extract_array) {
    if (!($item ~~ @exclude)) {
        if (!exists $hash{$item}) {
            push(@{$hash{$item}}, $start_url);
        } elsif (!($start_url ~~ @{$hash{$item}})) {
            push(@{$hash{$item}}, $start_url);
        }
        # print("@{$hash{$item}}\n");
    }
}

if ($max_depth != 0) {
    get_links;
}
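
# At this point %hash maps each word to the pages seen so far, e.g.
# (illustrative values only):
#   "language" => [ "https://en.wikipedia.org/wiki/Perl" ]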

my $current_link;

for (my $i = 0; $i < $max_depth; $i++) {
    # snapshot the queue length: only links already queued belong to this
    # depth level; links discovered during the pass wait for the next level
    my $count = $#links_to_visit + 1;
    print("count: $count\n");
    while ($count > 0) {
        # take from the front of the queue, since get_links appends newly
        # discovered links to the back
        $current_link = shift(@links_to_visit);
        $keyword = substr($current_link, 30);
        get_extract;
        # e.g. max_depth = 2: get_links should run once, for the first-level
        # links only (not the start link!)
        # Iteration 1: i = 0, max_depth - i = 2 > 1, perform get_links
        # Iteration 2: i = 1, max_depth - i = 1 = 1, do not perform get_links
        if (($max_depth - $i) > 1) {
            get_links;
        }
        $count--;

        # record each word on this page against the page's link
        foreach my $item (@extract_array) {
            if (!($item ~~ @exclude)) {
                if (!exists $hash{$item}) {
                    push(@{$hash{$item}}, $current_link);
                } elsif (!($current_link ~~ @{$hash{$item}})) {
                    push(@{$hash{$item}}, $current_link);
                }
            }
            # print("@{$hash{$item}}\n");
        }
    }
}


# write the index: one line per word, followed by every page containing it
open(my $write_fh, '>', "$index_name.txt") or die("Cannot open $index_name.txt: $!\n");
foreach my $k (keys %hash) {
    print $write_fh "$k, ";
    foreach my $subk (@{$hash{$k}}) {
        print $write_fh "$subk, ";
    }
    print $write_fh "\n";
}
close($write_fh);
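
# The resulting "<index_name>.txt" holds one comma-separated line per word,
# e.g. (illustrative):
#   language, https://en.wikipedia.org/wiki/Perl, https://en.wikipedia.org/wiki/Linguistics,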