rutera

site crawler

Jan 8th, 2015
#!/usr/bin/perl
# Site Crawler

use strict;
use warnings;
use LW2;
use Getopt::Std;

my (%opts, @mainarray);
my ($target, $request, $depth, $proto, $crawler,
    $tmp, $key, $value, $track_hash);
getopts('h:p:d:s:', \%opts);
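# (in the Getopt::Std spec above, the ':' after each letter means
# that option takes a value, e.g. -h www.example.com)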

# Usage Statement
sub usage {
   print "\nUsage :\tperl $0 -h target [-p -d -s]\n" .
      "\t -p port_num\n" .
      "\t -d depth\n" .
      "\t -s [0 | 1] - 0 = HTTP, 1 = HTTPS \n";
   exit;
}
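
# For reference, a typical invocation might look like this
# (script name and host are hypothetical):
#   perl crawler.pl -h www.example.com -p 443 -s 1 -d 3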

# ensure a target host is provided
if (!(defined($opts{h}))) {
   print "You must specify a host\n";
   usage();
} else {
   $target = $opts{h};
}

# use the LW2 new request function
$request = LW2::http_new_request(
   host    => $target,
   method  => 'GET',
   timeout => 10
   );

# set the port number
if (!(defined($opts{p}))) {
   print "You did not specify a port, defaulting to 80\n";
   $request->{whisker}->{port} = 80;
} else {
   $request->{whisker}->{port} = $opts{p};
}

# change the 'User-Agent' header for identification
$request->{'User-Agent'} = 'Xasulrevs Crawler/1.0';

# ensure everything is protocol-compliant
LW2::http_fixup_request($request);
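# (http_fixup_request should fill in any headers the protocol
# requires, such as a Host header, before the request goes out)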

# get or set the depth level
if (!(defined($opts{d}))) {
   print "You did not specify a depth, defaulting to 2\n";
   $depth = 2;
} else {
   $depth = $opts{d};
}

# set protocol
if (!(defined($opts{s}))) {
   print "You did not specify a protocol, HTTP = 0, HTTPS = 1, defaulting to HTTP\n";
   $proto = "http://";
} elsif ($opts{s} eq '0') {
   $proto = "http://";
} elsif ($opts{s} eq '1') {
   $proto = "https://";
} else {
   print "Invalid -s value, must be 0 or 1\n";
   usage();
}

# make a new crawler
$crawler = LW2::crawl_new(
   "$proto$target/",   # start URL
   $depth,             # depth
   $request            # premade LW request
   );

# tell the crawler that we want it to save all cookies
$crawler->{config}->{save_cookies} = 1;
# tell the crawler to follow redirects
$crawler->{config}->{follow_moves} = 1;
# tell the crawler to save all the skipped URLs
$crawler->{config}->{save_skipped} = 1;

my $result = $crawler->{crawl}->();

# the crawler returns once it has crawled all available URLs;
# bail out here so we don't print empty results after a failure
if (!defined $result) {
   print "There was an error:\n";
   print $crawler->{errors}->[0], "\n";
   exit 1;
}

$track_hash = $crawler->{track};

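# NOTE: the keys of the track hash are the URLs the crawler found;
# the values should be the HTTP response codes (and, since
# save_skipped is set above, skipped URLs appear here as well);
# this script only uses the keys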
# populate array with only the discovered resources
while ( ($key, $value) = each (%$track_hash) ) {
   push(@mainarray, $key) unless $key eq '';
}

# copy into a hash to de-duplicate the list,
# then sort the unique keys
my %hash1 = map { $_ => 1 } @mainarray;
my @myarray = sort keys %hash1;

# print resources discovered
foreach $tmp (@myarray) {
   print "$tmp\n";
}

# print out any cookies (requires save_cookies=1)
my $cookie_hash = $crawler->{cookies};
print "\n\nCookie name & value:\n";
while ( ($key, $value) = each (%$cookie_hash) ) {
   print "$key:\t$value\n";
}
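
# A run against a small site should print something along these
# lines (output is illustrative; the URLs and cookie are made up):
#
#   $ perl crawler.pl -h www.example.com -d 2
#   You did not specify a port, defaulting to 80
#   You did not specify a protocol, HTTP = 0, HTTPS = 1, defaulting to HTTP
#   http://www.example.com/
#   http://www.example.com/about.html
#
#   Cookie name & value:
#   session_id:     abc123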