Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- #!/usr/bin/perl --
- use warnings;
- use strict;
- use WWW::SimpleRobot;
- use HTML::Entities;
- require HTML::Parser;
- use Lingua::EN::Summarize;
- use HTML::TreeBuilder;
- use Lingua::EN::Keywords;
- use HTML::Tree;
- use LWP::Simple;
# Shared crawl state, visible to the subs below (they close over these
# lexicals).  Declared BEFORE the entry-point call runs: the original
# called Main() and exit()ed first, so the `my` statements never executed
# at runtime and the script only worked by accident of compile-time
# lexical allocation.
my $current_url;    # URL currently being crawled (diagnostics only)
my $safe_domain;    # host of the first URL; crawl never leaves it

Main( @ARGV );
exit( 0 );
# Main(@urls) - crawl each start URL to depth 2 with WWW::SimpleRobot,
# rewriting every visited page via Botulism() and reporting dead links
# via Snicklefritz().
sub Main {
    my @urls = @_;    # start URLs, or hardcode them here

    no warnings 'uninitialized';

    # Follow anything that looks like an http(s)/ftp URL, a bare www. host,
    # or a dotted-quad IP.  NOTE: the original wrote `my $FOLLOW = m/.../`,
    # which MATCHES $_ against the pattern and stores the boolean result --
    # qr// is needed to store the compiled pattern itself.
    my $FOLLOW = qr/(((ht|f)tp(s?):\/\/)|(www\.[^ \[\]\(\)\n\r\t]+)|(([012]?[0-9]{1,2}\.){3}[012]?[0-9]{1,2})\/)([^ \[\]\(\),;"'<>\n\r\t]+)([^\. \[\]\(\),;"'<>\n\r\t])|(([012]?[0-9]{1,2}\.){3}[012]?[0-9]{1,2})/;

    # Remember the host of the FIRST url so Botulism() can refuse to leave
    # the start domain.  substr(..., 7) strips a leading "http://".
    # Use $urls[0] (our own argument), not $ARGV[0], so hardcoded URL
    # lists work too.
    $safe_domain = substr( $urls[0], 7 );
    if ( index( $safe_domain, "/" ) > -1 ) {
        $safe_domain = substr( $safe_domain, 0, index( $safe_domain, "/" ) );
    }

    foreach my $start_url (@urls) {
        $current_url = $start_url;
        print "current url: $current_url\n\n";

        my $robot = WWW::SimpleRobot->new(
            URLS                 => [$start_url],
            FOLLOW_REGEX         => $FOLLOW,
            DEPTH                => 2,
            TRAVERSAL            => 'depth',
            VISIT_CALLBACK       => \&Botulism,
            BROKEN_LINK_CALLBACK => \&Snicklefritz,
        );

        # A crash on one start URL should not abort the remaining ones.
        eval { $robot->traverse; 1 } or warn "robot died, but we caught it: $@ ";
    }
    return;
}
# MAlt($img_element) - build replacement text for a missing <img> alt/title
# attribute from the image's file name: strip the path and any known image
# extension, map '-' and '_' to spaces, and append ' *' to mark the text as
# auto-generated.  Returns the generated string.
sub MAlt {
    my ($img) = @_;

    use File::Basename;
    my @suffixlist = qw(.gif .jpg .jpeg .png .bmp .php .ico .GIF .JPG .JPEG .PNG .BMP .PHP .ICO);

    # fileparse() drops the directory part and the matched extension.
    my $name = fileparse( $img->attr('src'), @suffixlist );
    $name =~ s/[-_]/ /g;

    # Explicit return (the original relied on the last-expression value).
    return $name . ' *';
}
# Botulism($url, $depth, $html, $links) - VISIT_CALLBACK for the robot.
# Skips pages outside the start domain; otherwise parses the page, ensures
# it has a <title>, injects <meta name="description"> and
# <meta name="keywords"> built from an automatic summary, fills in missing
# img alt/title attributes from the file name, and writes the rewritten
# page to <domain>/<path-derived-filename> under the current directory.
sub Botulism {
    my ( $url, $depth, $html, $links ) = @_;

    # Bare host of this page: strip "http://" (7 chars) and the path.
    my $this_domain = substr( $url, 7 );
    if ( index( $this_domain, "/" ) > -1 ) {
        $this_domain = substr( $this_domain, 0, index( $this_domain, "/" ) );
    }
    if ( $this_domain ne $safe_domain ) {
        print "\n$url \noutside domain - not crawling";
        return;
    }
    print "\nURL: $url - depth $depth\n";

    # Normalise the markup before parsing: expand entities, then drop
    # inline document.write() calls and leftover numeric entities.
    $html = decode_entities($html);
    $html =~ s/document\.write\(.+?\)\;//g;
    $html =~ s/\&\#.+?\;//g;

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($html);

    no warnings 'uninitialized';

    # Report the existing <title>; if there is none, fall back to the first
    # h1/h2/h3/h4/p that has text and write THAT into the document head.
    # (The original called push_content() on the plain $html string, which
    # always died inside the eval, so the fallback never took effect.)
    eval {
        my $title = substr $tree->look_down( '_tag', 'title' )->as_text, 0, 65;
        print "Title exists and is: $title.\n";
    } or do {
        my $title;
        for my $tag (qw' h1 h2 h3 h4 p ') {
            last if eval {
                $title = substr $tree->look_down( '_tag', $tag )->as_text, 0, 65;
                if ( length $title ) {
                    my $head_el  = $tree->look_down( '_tag', 'head' );
                    my $title_el = HTML::Element->new('title');
                    $title_el->push_content($title);
                    $head_el->push_content($title_el) if $head_el;
                    print "No title was found so the first $tag tag contents \n",
                          "were written to the title field in the header.\n";
                }
            }
        }
        unless ($title) {
            print "No title exists and no suitable \ntext ",
                  "was found by this bot to use as one.\n";
        }
    };

    use HTML::Summary;
    # Direct method call instead of indirect-object `new HTML::Summary`.
    my $summarizer = HTML::Summary->new(
        LENGTH   => 155,
        USE_META => 0,
    );
    my $summary      = $summarizer->generate($tree);
    my $filteredhtml = summarize( $html, filter => 'html' );
    my $summary2     = summarize( $filteredhtml, maxlength => 500 );
    $summary2 =~ s/\s+/ /gs;
    print "Using Lingua::EN::Summarize Summary: $summary\n\n";

    local $\ = $/;    # every print below gets a trailing newline

    # Inject <meta name="description" content="...summary...">.
    my $newdescription = HTML::Element->new( 'meta', 'content' => "$summary", 'name' => 'description' );
    my $head = $tree->look_down( '_tag', 'head' );
    $head->splice_content( 0, 0, $newdescription );

    # Inject <meta name="keywords" content="..."> from keyword extraction.
    my @keywords = keywords($summary);
    print "Keywords: " . join( ", ", @keywords ) . "\n\n";
    my $newkeywords = HTML::Element->new( 'meta', 'content' => join( ", ", @keywords ), 'name' => 'keywords' );
    $head->splice_content( 0, 0, $newkeywords );

    # Show images with a missing, then an empty, alt/title attribute.
    # (The original passed the tag name 'img ' with a trailing space in the
    # first two look_down calls, which can never match; fixed to 'img'.)
    print $_->as_HTML
        for $tree->look_down( '_tag', 'img', sub { not defined $_[0]->attr('alt') } );
    print $_->as_HTML
        for $tree->look_down( '_tag', 'img', sub { not defined $_[0]->attr('title') } );
    print '---';
    print $_->as_HTML
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } );
    print $_->as_HTML
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('title') } );
    print '---';

    # Fill in missing/empty alt and title attributes from the file name.
    $_->attr( alt => MAlt($_) )
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } );
    print $_->as_HTML for $tree->look_down(qw' _tag img ');
    $_->attr( title => MAlt($_) )
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('title') } );
    print $_->as_HTML for $tree->look_down(qw' _tag img ');

    # Save output as a file named after the spidered site's domain.
    # Capture host and path in one anchored match instead of reading $2
    # after a substitution that may not have matched (stale capture for
    # any URL that is not plain http://).
    my $base_domain_name = $url;
    my $filename         = '';
    if ( $url =~ m{^http://([^/?]+)(.*)$} ) {
        $base_domain_name = $1;
        $filename         = $2;
    }
    $filename =~ s/\//_/g;
    # If we are just an index, call it index.html
    if ( $filename =~ /^.*?_$/ ) {
        $filename .= "index.html";
    }
    $filename = 'index.html' if $filename eq '';
    # Remove the superfluous first _ from filenames.
    $filename =~ s/^_(.+)$/$1/;

    # Create the per-domain directory if it does not exist yet.
    if ( !( -d "$base_domain_name" ) ) {
        mkdir $base_domain_name or warn "mkdir $base_domain_name: $!";
    }

    # Move to the domain directory and write the rewritten page.
    print "Moving to $base_domain_name folder\n";
    print "Using filename $filename\n";
    chdir $base_domain_name or warn "chdir $base_domain_name: $!";

    # 3-arg open with a lexical handle, checked (the original used an
    # unchecked bareword 2-arg-style open).
    if ( open my $outfh, '>', $filename ) {
        print {$outfh} $tree->as_HTML;
        close $outfh or warn "close $filename: $!";
    }
    else {
        warn "Cannot write $filename: $!";
    }
    $tree = $tree->delete;

    # Move back out to the top directory.
    chdir "..";
}
# Snicklefritz($url, $linked_from, $depth) - BROKEN_LINK_CALLBACK for
# WWW::SimpleRobot: report a dead link so a human can repair it by hand.
sub Snicklefritz {
    no warnings 'uninitialized';
    my ( $bad_url, $referrer, $level ) = @_;
    print "The link $bad_url from the page $referrer at depth $level\n
appears to be broken. please repair the link manually\n";
}
# Ebola($element, $clip, $text) - append $text, clipped to at most $clip
# characters, to the given HTML::Element.  Does nothing when $text is
# undef or empty.
sub Ebola {
    my ( $element, $max_len, $snippet ) = @_;
    if ( defined $snippet and length $snippet ) {
        $element->push_content( substr $snippet, 0, $max_len );
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement