#!C:\Perl\bin\perl
use strict;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor;
use URI;
use Time::HiRes qw(usleep);
use Getopt::Std;
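# Example invocation (hypothetical file name "crawler.pl" -- use whatever name you saved this paste as):
#   perl crawler.pl -u http://example.com -d
# -u sets the starting URL; -d restricts crawling to links within that URL's domain.
# With no flags the script prompts for these settings interactively.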
########## VARIABLE DECLARATION ############
our ($opt_u, $opt_d);
getopts('u:d');
my %visited;
my %oldemail;
my @urls;
my $linkfile;
my $emailfile;
my $oldemailcount = 0;
my $curtime = localtime(time);
my $firstmail = 0;
my $domainonly = $opt_d ? 1 : 0;
my $startingURL = $opt_u;
my $domain;
my $sleep;
########## ------------------- #############
# Loads already found e-mails from a database file IN THE SAME DIRECTORY AS THIS PROGRAM
loadEmails();
# If the user did not use the flags, ask for the configuration settings.
# If they DID use the flags, make sure that the URL is correctly formatted.
if (!$opt_u) {
    getInput();
} else {
    $startingURL = sanitizeURL($startingURL);
}
# Add the starting URL selected by the user to the array of URLs to crawl
push @urls, $startingURL;
# Initialize the LWP browser
my $browser = LWP::UserAgent->new();
$browser->timeout(10);
$browser->agent("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; Win64; x64; Trident/4.0)");
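# NOTE (assumption): fetching https:// pages with this UserAgent also requires
# LWP::Protocol::https (and IO::Socket::SSL) to be installed from CPAN -- see
# the note at the bottom of this paste.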
########### -- STARTING THE MAIN LOOP -- ##########
MAIN: while (@urls) {
    # Set the number of microseconds to sleep between each page crawled.
    # This is randomized to a value between 12 and 17 seconds by default.
    $sleep = rand(5_000_000) + 12_000_000;
    # Take the next URL and remove it from the array
    my $url = shift @urls;
    select(STDOUT);
    next if $visited{$url};
    print "\n\n------------ CHECKING -> $url <- ----------\n\n";
    # Send the request to the web server via the browser
    my $request  = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);
    # If the server responds with an error...
    if ($response->is_error()) {
        select(STDOUT);
        # ...print the error message from the server, handle it, and move on
        print $response->status_line, "\n";
        handleErrors();
        next MAIN;
    }
    # Extract the HTML from the HTTP response
    my $contents = $response->content();
    print "\nLinks Found:\n";
    # Add the URL just crawled to the hash of visited links
    $visited{$url} = 1;
    # Extract all links from the HTML
    my $page_parser = HTML::LinkExtor->new(undef, $url);
    $page_parser->parse($contents)->eof;
    my @links = $page_parser->links;
    # If there are no links on this page, call handleErrors() and move on
    if (!@links) {
        handleErrors();
        next MAIN;
    }
    # For every link in the links array...
    foreach my $link (@links) {
        # Skip unwanted file types and pseudo-links
        if (    $$link[2] !~ m/\.(?:png|css|ico|jpg|js|xml|gif|ashx)/i
            and $$link[2] !~ m/javascript:/i
            and $$link[2] !~ m/feeds\./i
            and $$link[2] !~ m/rss\./i
            and $$link[2] !~ m/mailto:/i
            and $$link[2] !~ m/about:/i) {
            # If the option is set, only follow links in the same domain as the starting URL.
            # Otherwise, follow every link found.
            if ($domainonly == 1) {
                if ($$link[2] =~ m/\Q$domain\E/i) {
                    select(STDOUT);
                    unless ($visited{$$link[2]}) {
                        print "$$link[2]\n";
                        # Push links found on the page onto the array of URLs
                        push @urls, $$link[2];
                    }
                }
            } else {
                select(STDOUT);
                unless ($visited{$$link[2]}) {
                    print "$$link[2]\n";
                    push @urls, $$link[2];
                }
            }
        }
    }
    # Print all e-mail addresses found on the page...
    print "\nEmails Found:\n";
    # ...that match this regex (plain addresses and "[AT]"-obfuscated ones).
    # A single alternation is used so the /g match position advances cleanly.
    while ($contents =~ m/\b[A-Z0-9._%+-]+(?:@|\[AT\])[A-Z0-9.-]+\.[A-Z]{2,4}\b/ig) {
        # Save the match before running any further regexes, which would clobber $&
        my $email = $&;
        # Only record an e-mail if it has not already been seen (no duplicates)...
        next if $oldemail{$email};
        # ...and skip obviously unwanted addresses.
        next if $email =~ m/example\.com/i or $email =~ m/spam/i or $email =~ m/xxx/i;
        # Print the address to both the e-mail database file and the console
        select(STDOUT);
        print "$email\n";
        open $emailfile, ">>", "emails.txt" or die "Cannot open emails.txt: $!\n";
        select($emailfile);
        if ($firstmail == 0) {
            print "\n-------------------- $curtime --------------------\n";
            $firstmail = 1;
        }
        print "$email\n";
        close $emailfile;
        $oldemail{$email} = 1;
    }
    # The loop is almost done; after successfully crawling a page it sleeps for
    # the amount of time set at the start of the loop.
    select(STDOUT);
    print "\nProgram waits for " . $sleep / 1000000 . " seconds before the next request.\nThis is to prevent blacklisting.\n";
    usleep($sleep);
}
########## -- END OF MAIN LOOP -- ##########
# Loads already-found e-mails from 'emails.txt' so they are not reported twice
sub loadEmails {
    print "Loading E-mails from file 'emails.txt'...\n\n";
    if (open $emailfile, "<", "emails.txt") {
        while (<$emailfile>) {
            chomp;
            $oldemail{$_} = 1;
            if ($_ =~ m/\b[A-Z0-9._%+-]+(?:@|\[AT\])[A-Z0-9.-]+\.[A-Z]{2,4}\b/i) {
                $oldemailcount++;
            }
            print "$_\n";
        }
        close($emailfile);
    } else {
        print "E-mail file does not exist.\nIt will be created when you start crawling.\n\n";
    }
    print "\n$oldemailcount E-Mails Loaded!\n";
}
# Called when the user does not use the command-line flags or there is a mistake in the URL
sub getInput {
    # Get the URL to start crawling
    print "\nPlease enter a URL to start crawling. \n(Example: 'http://google.com' or 'yahoo.com')\n\n";
    print "http://";
    $startingURL = <>;
    chomp $startingURL;
    # Format the URL and extract its domain
    $startingURL = sanitizeURL($startingURL);
    print "\n\nDo you want to follow only links in the same domain as the URL you typed in?\n"
        . "This is useful to avoid following links to advertising sites, as these usually do not contain e-mail addresses.\n\n"
        . "Domain = $domain \n\n"
        . "1) Yes\n"
        . "2) No\n"
        . "3) Exit Program\n\n";
    # Ask the user whether to follow only links in the same domain or not.
    my $domainchoice = 0;
    while ($domainchoice != 1 && $domainchoice != 2 && $domainchoice != 3) {
        $domainchoice = <>;
        chomp $domainchoice;
        if ($domainchoice == 1) {
            $domainonly = 1;
        } elsif ($domainchoice == 3) {
            exit;
        }
    }
}
############## ------------------------------------------- ################
############## -- Correctly formats a URL string and extracts its domain -- #############
sub sanitizeURL {
    my ($url) = @_;
    my $http = 'http://';
    # If there is no "http://" or "https://" in front of the URL, add "http://".
    if ($url !~ m{^https?://}i) {
        $url = $http . $url;
    }
    # Extract the domain of the URL string using the URI class.
    $domain = URI->new($url, "http");
    $domain = $domain->host;
    # Return the sanitized URL
    return $url;
}
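# Example (hypothetical input): sanitizeURL("yahoo.com") returns "http://yahoo.com"
# and sets $domain to "yahoo.com"; a URL that already starts with http:// or
# https:// is returned unchanged.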
############# -- Handles errors by asking for a new starting URL if necessary -- #############
sub handleErrors {
    # If there are no URLs left, ask for a new starting URL;
    # otherwise just sleep. The caller then moves on to the next URL.
    if (scalar @urls < 1) {
        print "No more URLs to crawl.\n";
        getInput();
        push @urls, $startingURL;
    } else {
        print "\nProgram waits for " . $sleep / 1000000 . " seconds before the next request.\nThis is to prevent blacklisting.\n";
        usleep($sleep);
    }
}
Problem: I can't crawl a secured (HTTPS) site. Is there anything I can do to support https?
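One likely cause, though I can't verify it from the paste alone: LWP::UserAgent only speaks HTTPS when LWP::Protocol::https (which pulls in IO::Socket::SSL) is installed from CPAN. A minimal sketch of the browser setup with SSL enabled, assuming skipping certificate verification is acceptable for a crawler:

use LWP::UserAgent;
use LWP::Protocol::https;   # install from CPAN; pulls in IO::Socket::SSL

my $browser = LWP::UserAgent->new(
    timeout  => 10,
    # Assumption: certificate verification is not needed for this crawler.
    ssl_opts => { verify_hostname => 0 },
);
my $response = $browser->get('https://example.com/');
print $response->status_line, "\n";

Once those modules are installed, the existing $browser object in the script should fetch https:// URLs as well; just enter the full https:// address, since sanitizeURL() only prepends http:// when no scheme is given.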