#!C:\Perl\bin\perl
use strict;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor;
use URI;
use Time::HiRes qw(usleep);
use Getopt::Std;
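# Example invocation (hypothetical file name "crawler.pl" -- use whatever name you saved this paste as):
#   perl crawler.pl -u http://example.com -d
# -u sets the starting URL; -d restricts crawling to links within that URL's domain.
# With no flags the script prompts for these settings interactively.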
########## VARIABLE DECLARATION ############
our ($opt_u, $opt_d);
getopts('u:d');
my %visited;
my %oldemail;
my @urls;
my $linkfile;
my $emailfile;
my $oldemailcount = 0;
my $curtime = localtime(time);
my $firstmail = 0;
my $domainonly = $opt_d ? 1 : 0;
my $startingURL = $opt_u;
my $domain;
my $sleep;
########## ------------------- #############
# Loads already found e-mails from a database file IN THE SAME DIRECTORY AS THIS PROGRAM
loadEmails();
# If the user did not use the flags, ask for the configuration settings.
# If they DID use the flags, make sure that the URL is correctly formatted.
if (!$opt_u) {
    getInput();
} else {
    $startingURL = sanitizeURL($startingURL);
}
# Add the starting URL selected by the user to the array of URLs to crawl
push @urls, $startingURL;
# Initialize the LWP browser
my $browser = LWP::UserAgent->new();
$browser->timeout(10);
$browser->agent("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; Win64; x64; Trident/4.0)");
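# NOTE (assumption): fetching https:// pages with this UserAgent also requires
# LWP::Protocol::https (and IO::Socket::SSL) to be installed from CPAN -- see
# the note at the bottom of this paste.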
########### -- STARTING THE MAIN LOOP -- ##########
MAIN: while (@urls) {
    # Set the number of microseconds to sleep between each page crawled.
    # This is randomized to a value between 12 and 17 seconds by default.
    $sleep = rand(5_000_000) + 12_000_000;
    # Take the next URL and remove it from the array
    my $url = shift @urls;
    select(STDOUT);
    next if $visited{$url};
    print "\n\n------------ CHECKING -> $url <- ----------\n\n";
    # Send the request to the web server via the browser
    my $request  = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);
    # If the server responds with an error...
    if ($response->is_error()) {
        select(STDOUT);
        # ...print the error message from the server, handle it, and move on
        print $response->status_line, "\n";
        handleErrors();
        next MAIN;
    }
    # Extract the HTML from the HTTP response
    my $contents = $response->content();
    print "\nLinks Found:\n";
    # Add the URL just crawled to the hash of visited links
    $visited{$url} = 1;
    # Extract all links from the HTML
    my $page_parser = HTML::LinkExtor->new(undef, $url);
    $page_parser->parse($contents)->eof;
    my @links = $page_parser->links;
    # If there are no links on this page, call handleErrors() and move on
    if (!@links) {
        handleErrors();
        next MAIN;
    }
    # For every link in the links array...
    foreach my $link (@links) {
        # Skip unwanted file types and pseudo-links
        if (    $$link[2] !~ m/\.(?:png|css|ico|jpg|js|xml|gif|ashx)/i
            and $$link[2] !~ m/javascript:/i
            and $$link[2] !~ m/feeds\./i
            and $$link[2] !~ m/rss\./i
            and $$link[2] !~ m/mailto:/i
            and $$link[2] !~ m/about:/i) {
            # If the option is set, only follow links in the same domain as the starting URL.
            # Otherwise, follow every link found.
            if ($domainonly == 1) {
                if ($$link[2] =~ m/\Q$domain\E/i) {
                    select(STDOUT);
                    unless ($visited{$$link[2]}) {
                        print "$$link[2]\n";
                        # Push links found on the page onto the array of URLs
                        push @urls, $$link[2];
                    }
                }
            } else {
                select(STDOUT);
                unless ($visited{$$link[2]}) {
                    print "$$link[2]\n";
                    push @urls, $$link[2];
                }
            }
        }
    }
    # Print all e-mail addresses found on the page...
    print "\nEmails Found:\n";
    # ...that match this regex (plain addresses and "[AT]"-obfuscated ones).
    # A single alternation is used so the /g match position advances cleanly.
    while ($contents =~ m/\b[A-Z0-9._%+-]+(?:@|\[AT\])[A-Z0-9.-]+\.[A-Z]{2,4}\b/ig) {
        # Save the match before running any further regexes, which would clobber $&
        my $email = $&;
        # Only record an e-mail if it has not already been seen (no duplicates)...
        next if $oldemail{$email};
        # ...and skip obviously unwanted addresses.
        next if $email =~ m/example\.com/i or $email =~ m/spam/i or $email =~ m/xxx/i;
        # Print the address to both the e-mail database file and the console
        select(STDOUT);
        print "$email\n";
        open $emailfile, ">>", "emails.txt" or die "Cannot open emails.txt: $!\n";
        select($emailfile);
        if ($firstmail == 0) {
            print "\n-------------------- $curtime --------------------\n";
            $firstmail = 1;
        }
        print "$email\n";
        close $emailfile;
        $oldemail{$email} = 1;
    }
    # The loop is almost done; after successfully crawling a page it sleeps for
    # the amount of time set at the start of the loop.
    select(STDOUT);
    print "\nProgram waits for " . $sleep / 1000000 . " seconds before the next request.\nThis is to prevent blacklisting.\n";
    usleep($sleep);
}
########## -- END OF MAIN LOOP -- ##########
# Loads already-found e-mails from 'emails.txt' so they are not reported twice
sub loadEmails {
    print "Loading E-mails from file 'emails.txt'...\n\n";
    if (open $emailfile, "<", "emails.txt") {
        while (<$emailfile>) {
            chomp;
            $oldemail{$_} = 1;
            if ($_ =~ m/\b[A-Z0-9._%+-]+(?:@|\[AT\])[A-Z0-9.-]+\.[A-Z]{2,4}\b/i) {
                $oldemailcount++;
            }
            print "$_\n";
        }
        close($emailfile);
    } else {
        print "E-mail file does not exist.\nIt will be created when you start crawling.\n\n";
    }
    print "\n$oldemailcount E-Mails Loaded!\n";
}
# Called when the user does not use the command-line flags or there is a mistake in the URL
sub getInput {
    # Get the URL to start crawling
    print "\nPlease enter a URL to start crawling. \n(Example: 'http://google.com' or 'yahoo.com')\n\n";
    print "http://";
    $startingURL = <>;
    chomp $startingURL;
    # Format the URL and extract its domain
    $startingURL = sanitizeURL($startingURL);
    print "\n\nDo you want to follow only links in the same domain as the URL you typed in?\n"
        . "This is useful to avoid following links to advertising sites, as these usually do not contain e-mail addresses.\n\n"
        . "Domain = $domain \n\n"
        . "1) Yes\n"
        . "2) No\n"
        . "3) Exit Program\n\n";
    # Ask the user whether to follow only links in the same domain or not.
    my $domainchoice = 0;
    while ($domainchoice != 1 && $domainchoice != 2 && $domainchoice != 3) {
        $domainchoice = <>;
        chomp $domainchoice;
        if ($domainchoice == 1) {
            $domainonly = 1;
        } elsif ($domainchoice == 3) {
            exit;
        }
    }
}
############## ------------------------------------------- ################
############## -- Correctly formats a URL string and extracts its domain -- #############
sub sanitizeURL {
    my ($url) = @_;
    my $http = 'http://';
    # If there is no "http://" or "https://" in front of the URL, add "http://".
    if ($url !~ m{^https?://}i) {
        $url = $http . $url;
    }
    # Extract the domain of the URL string using the URI class.
    $domain = URI->new($url, "http");
    $domain = $domain->host;
    # Return the sanitized URL
    return $url;
}
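# Example (hypothetical input): sanitizeURL("yahoo.com") returns "http://yahoo.com"
# and sets $domain to "yahoo.com"; a URL that already starts with http:// or
# https:// is returned unchanged.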
############# -- Handles errors by asking for a new starting URL if necessary -- #############
sub handleErrors {
    # If there are no URLs left, ask for a new starting URL;
    # otherwise just sleep. The caller then moves on to the next URL.
    if (scalar @urls < 1) {
        print "No more URLs to crawl.\n";
        getInput();
        push @urls, $startingURL;
    } else {
        print "\nProgram waits for " . $sleep / 1000000 . " seconds before the next request.\nThis is to prevent blacklisting.\n";
        usleep($sleep);
    }
}
Problem: I can't crawl a secured (HTTPS) site. Is there anything I can do to support https?
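One likely cause, though I can't verify it from the paste alone: LWP::UserAgent only speaks HTTPS when LWP::Protocol::https (which pulls in IO::Socket::SSL) is installed from CPAN. A minimal sketch of the browser setup with SSL enabled, assuming skipping certificate verification is acceptable for a crawler:

use LWP::UserAgent;
use LWP::Protocol::https;   # install from CPAN; pulls in IO::Socket::SSL

my $browser = LWP::UserAgent->new(
    timeout  => 10,
    # Assumption: certificate verification is not needed for this crawler.
    ssl_opts => { verify_hostname => 0 },
);
my $response = $browser->get('https://example.com/');
print $response->status_line, "\n";

Once those modules are installed, the existing $browser object in the script should fetch https:// URLs as well; just enter the full https:// address, since sanitizeURL() only prepends http:// when no scheme is given.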