#!C:\Perl\bin\perl
# Command-line flags (parsed with getopts below):
#   -u <url>  URL to start crawling from (you are prompted for one if omitted)
#   -d        only follow links within the same domain as the starting URL

use strict;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor;
use URI;                       # URI->new() is used below to extract the domain
use Time::HiRes qw(usleep);
use Getopt::Std;

########## VARIABLE DECLARATION ############
our ($opt_u, $opt_d);
getopts('u:d');
my %visited;                   # URLs already crawled
my %oldemail;                  # e-mail addresses already found
my @urls;                      # queue of URLs left to crawl
my $linkfile;
my $emailfile;
my $oldemailcount = 0;
my $curtime = localtime(time);
my $firstmail = 0;
my $domainonly = $opt_d;
my $startingURL = $opt_u;
my $domain;
my $sleep;
########## ------------------- #############

# Load already-found e-mails from a database file IN THE SAME DIRECTORY AS THIS PROGRAM
loadEmails();

# If the user did not use the -u flag, show them the configuration prompts.
# If they DID use the flags, make sure the URL is correctly formatted.
if (!$opt_u) {
    getInput();
} else {
    $startingURL = sanitizeURL($startingURL);
}

# Add the starting URL selected by the user to the queue of URLs to crawl
push @urls, $startingURL;

# Initialize the LWP browser
my $browser = LWP::UserAgent->new();
$browser->timeout(10);
$browser->agent("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; Win64; x64; Trident/4.0)");

########### -- STARTING THE MAIN LOOP -- ##########
MAIN: while (@urls) {
    # Number of microseconds to sleep between pages crawled.
    # Randomized to a value between 12 and 17 seconds by default.
    $sleep = rand(5_000_000) + 12_000_000;

    # Draw a URL and remove it from the queue
    my $url = shift @urls;

    select(STDOUT);
    next if $visited{$url};
    print "\n\n------------ CHECKING -> $url <- ----------\n\n";

    # Send the request to the web server via the browser
    my $request  = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);

    # If the server responds with an error, print its status line
    if ($response->is_error()) {
        select(STDOUT);
        print $response->status_line, "\n";
        handleErrors();
    }

    # Extract the HTML from the HTTP response
    my $contents = $response->content();
    print "\nLinks Found:\n";

    # Add the crawled link to the hash of visited links
    $visited{$url} = 1;

    # Extract all links from the HTML
    my ($page_parser) = HTML::LinkExtor->new(undef, $url);
    $page_parser->parse($contents)->eof;
    my @links = $page_parser->links;

    # If there are no links on the page, call handleErrors()
    if (!@links) {
        handleErrors();
    }

    # For every link in the links array...
    foreach my $link (@links) {
        # skip unwanted file extensions and pseudo-links
        if (    $$link[2] !~ m/\.png/i   and $$link[2] !~ m/\.css/i
            and $$link[2] !~ m/\.ico/i   and $$link[2] !~ m/\.jpg/i
            and $$link[2] !~ m/\.js/i    and $$link[2] !~ m/\.xml/i
            and $$link[2] !~ m/\.gif/i   and $$link[2] !~ m/javascript:/i
            and $$link[2] !~ m/feeds\./i and $$link[2] !~ m/rss\./i
            and $$link[2] !~ m/mailto:/i and $$link[2] !~ m/about:/i
            and $$link[2] !~ m/\.ashx/i) {

            # If -d is set, only queue links from the same domain as the
            # starting URL; otherwise queue every link found.
            if ($domainonly == 1) {
                if ($$link[2] =~ m/$domain/ig) {
                    select(STDOUT);
                    unless ($visited{$$link[2]}) {
                        print "$$link[2]\n";
                        # push links found on this page onto the URL queue
                        push @urls, $$link[2];
                    }
                }
            } else {
                select(STDOUT);
                unless ($visited{$$link[2]}) {
                    print "$$link[2]\n";
                    push @urls, $$link[2];
                }
            }
        }
    }

    # Print all e-mail addresses found...
    print "\nEmails Found:\n";
    # ...that match these patterns (plain "@" or "[AT]" obfuscation).
    while ($contents =~ m/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/ig
        or $contents =~ m/\b[A-Z0-9._%+-]+\[+AT+\][A-Z0-9.-]+\.[A-Z]{2,4}\b/ig) {
        # only record an address if it has not already been seen (no duplicates)
        next if $oldemail{$&};
        # skip obviously unwanted addresses
        next if $& =~ m/example\.com/i || $& =~ m/spam/i || $& =~ m/xxx/i;

        # print the address to both the e-mail database file and the console
        select(STDOUT);
        print $&."\n";
        open $emailfile, ">>", "emails.txt" or die "Cannot open emails.txt: $!\n";
        select($emailfile);
        if ($firstmail == 0) {
            print "\n-------------------- $curtime --------------------\n";
            $firstmail = 1;
        }
        print $&."\n";
        close $emailfile;
        $oldemail{$&} = 1;
    }

    # The loop is almost done; after a successful crawl of a link it sleeps
    # for the amount of time chosen at the start of the loop.
    select(STDOUT);
    print "\nProgram waits for ". $sleep/1000000 ." seconds before next request.\nThis is to prevent blacklisting.\n";
    usleep($sleep);
}
########## -- END OF MAIN LOOP -- ##########

sub loadEmails {
    print "Loading E-mails from file 'emails.txt'...\n\n";
    open $emailfile, "+<", "emails.txt"
        or print "E-mail file does not exist.\nIt will be created when you start crawling.\n\n";
    while (<$emailfile>) {
        chomp($_);
        $oldemail{$_} = 1;
        if ($_ =~ m/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
         or $_ =~ m/\b[A-Z0-9._%+-]+\[+AT+\][A-Z0-9.-]+\.[A-Z]{2,4}\b/i) {
            $oldemailcount++;
        }
        print "$_\n";
    }
    close($emailfile);
    print "\n$oldemailcount E-Mails Loaded!\n";
}

# Called when the user does not use the command-line flags
# or when there is a mistake in the URL.
sub getInput {
    # Get the URL to start crawling from
    print "\nPlease enter a URL to start crawling.\n(Example: 'http://google.com' or 'yahoo.com')\n\n";
    print "http://";
    $startingURL = <STDIN>;
    chomp $startingURL;

    # Format the URL and extract its domain
    $startingURL = sanitizeURL($startingURL);

    print "\n\nDo you want to keep only the links that are in the same domain as the URL you typed in?\n".
          "This is useful to avoid following links to advertising sites, as these usually do not contain e-mail addresses.\n\n".
          "Domain = $domain \n\n".
          "1) Yes\n".
          "2) No\n".
          "3) Exit Program\n\n";

    # Ask the user whether to follow only links in the same domain or not.
    my $domainchoice = 0;
    while ($domainchoice != 1 && $domainchoice != 2 && $domainchoice != 3) {
        $domainchoice = <STDIN>;
        chomp $domainchoice;
        if ($domainchoice == 1) {
            $domainonly = 1;
        }
        elsif ($domainchoice == 3) {
            exit;
        }
    }
}
############## ------------------------------------------- ################

############## -- Will correctly format a URL string and extract its domain -- #############
sub sanitizeURL {
    my ($url) = @_;
    my $http = 'http://';

    # If the URL does not start with "http://" or "https://", add "http://".
    if ($url !~ m{^https?://}i) {
        $url = $http.$url;
    }

    # Extract the domain of the URL string using the URI class.
    $domain = URI->new($url, "http");
    $domain = $domain->host;

    # Return the sanitized URL
    return $url;
}

############# -- Will handle errors by asking for a new URL to start over with if necessary -- #############
sub handleErrors {
    # If there are no URLs left, ask for a new starting URL, then continue.
    # Otherwise, sleep and continue.
    if (scalar @urls < 1) {
        print "No more URLs to crawl.\n";
        getInput();
        push @urls, $startingURL;
    } else {
        print "\nProgram waits for ". $sleep/1000000 ." seconds before next request.\nThis is to prevent blacklisting.\n";
        usleep($sleep);
    }
    # Jump back to the top of the labelled MAIN loop.
    redo MAIN;
}

Problem: I can't crawl a secured (HTTPS) site. Is there anything I can do to support https?
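
A likely fix (my suggestion, not part of the original paste): LWP::UserAgent can fetch https:// URLs once LWP::Protocol::https is installed (it pulls in IO::Socket::SSL and Mozilla::CA for the TLS layer), so the crawler itself barely needs to change. A minimal standalone sketch to verify HTTPS support, using https://example.com purely as a placeholder target:

#!C:\Perl\bin\perl
use strict;
use LWP::UserAgent;
# Requires LWP::Protocol::https to be installed (e.g. "cpan LWP::Protocol::https");
# it provides the https:// scheme handler for LWP.

my $browser = LWP::UserAgent->new(
    timeout  => 10,
    # verify_hostname => 1 is the default in recent LWP versions; set it to 0
    # only if you must accept sites with broken certificates (not recommended).
    ssl_opts => { verify_hostname => 1 },
);
$browser->agent("Mozilla/5.0");

# Placeholder URL; any https:// address is fetched the same way as http://.
my $response = $browser->get('https://example.com/');
if ($response->is_success) {
    print $response->decoded_content;
} else {
    print $response->status_line, "\n";
}

If this standalone request succeeds, the crawler above should handle https:// starting URLs as well: sanitizeURL leaves an existing https:// scheme untouched, and nothing else in the script filters links by scheme.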