Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- #!/usr/bin/perl --
- use warnings;
- use strict;
- use WWW::SimpleRobot;
- use HTML::Entities;
- require HTML::Parser;
- use Lingua::EN::Summarize;
- use HTML::TreeBuilder;
- use Lingua::EN::Keywords;
- use HTML::Tree;
- use LWP::Simple;
# Shared crawl state, visible to the subs below (they close over these
# lexicals).  Declared BEFORE the entry-point call runs: the original
# called Main() and exit()ed first, so the `my` statements never executed
# at runtime and the script only worked by accident of compile-time
# lexical allocation.
my $current_url;    # URL currently being crawled (diagnostics only)
my $safe_domain;    # host of the first URL; crawl never leaves it

Main( @ARGV );
exit( 0 );
# Main(@urls) - crawl each start URL to depth 2 with WWW::SimpleRobot,
# rewriting every visited page via Botulism() and reporting dead links
# via Snicklefritz().
sub Main {
    my @urls = @_;    # start URLs, or hardcode them here

    no warnings 'uninitialized';

    # Follow anything that looks like an http(s)/ftp URL, a bare www. host,
    # or a dotted-quad IP.  NOTE: the original wrote `my $FOLLOW = m/.../`,
    # which MATCHES $_ against the pattern and stores the boolean result --
    # qr// is needed to store the compiled pattern itself.
    my $FOLLOW = qr/(((ht|f)tp(s?):\/\/)|(www\.[^ \[\]\(\)\n\r\t]+)|(([012]?[0-9]{1,2}\.){3}[012]?[0-9]{1,2})\/)([^ \[\]\(\),;"'<>\n\r\t]+)([^\. \[\]\(\),;"'<>\n\r\t])|(([012]?[0-9]{1,2}\.){3}[012]?[0-9]{1,2})/;

    # Remember the host of the FIRST url so Botulism() can refuse to leave
    # the start domain.  substr(..., 7) strips a leading "http://".
    # Use $urls[0] (our own argument), not $ARGV[0], so hardcoded URL
    # lists work too.
    $safe_domain = substr( $urls[0], 7 );
    if ( index( $safe_domain, "/" ) > -1 ) {
        $safe_domain = substr( $safe_domain, 0, index( $safe_domain, "/" ) );
    }

    foreach my $start_url (@urls) {
        $current_url = $start_url;
        print "current url: $current_url\n\n";

        my $robot = WWW::SimpleRobot->new(
            URLS                 => [$start_url],
            FOLLOW_REGEX         => $FOLLOW,
            DEPTH                => 2,
            TRAVERSAL            => 'depth',
            VISIT_CALLBACK       => \&Botulism,
            BROKEN_LINK_CALLBACK => \&Snicklefritz,
        );

        # A crash on one start URL should not abort the remaining ones.
        eval { $robot->traverse; 1 } or warn "robot died, but we caught it: $@ ";
    }
    return;
}
# MAlt($img_element) - build replacement text for a missing <img> alt/title
# attribute from the image's file name: strip the path and any known image
# extension, map '-' and '_' to spaces, and append ' *' to mark the text as
# auto-generated.  Returns the generated string.
sub MAlt {
    my ($img) = @_;

    use File::Basename;
    my @suffixlist = qw(.gif .jpg .jpeg .png .bmp .php .ico .GIF .JPG .JPEG .PNG .BMP .PHP .ICO);

    # fileparse() drops the directory part and the matched extension.
    my $name = fileparse( $img->attr('src'), @suffixlist );
    $name =~ s/[-_]/ /g;

    # Explicit return (the original relied on the last-expression value).
    return $name . ' *';
}
# Botulism($url, $depth, $html, $links) - VISIT_CALLBACK for the robot.
# Skips pages outside the start domain; otherwise parses the page, ensures
# it has a <title>, injects <meta name="description"> and
# <meta name="keywords"> built from an automatic summary, fills in missing
# img alt/title attributes from the file name, and writes the rewritten
# page to <domain>/<path-derived-filename> under the current directory.
sub Botulism {
    my ( $url, $depth, $html, $links ) = @_;

    # Bare host of this page: strip "http://" (7 chars) and the path.
    my $this_domain = substr( $url, 7 );
    if ( index( $this_domain, "/" ) > -1 ) {
        $this_domain = substr( $this_domain, 0, index( $this_domain, "/" ) );
    }
    if ( $this_domain ne $safe_domain ) {
        print "\n$url \noutside domain - not crawling";
        return;
    }
    print "\nURL: $url - depth $depth\n";

    # Normalise the markup before parsing: expand entities, then drop
    # inline document.write() calls and leftover numeric entities.
    $html = decode_entities($html);
    $html =~ s/document\.write\(.+?\)\;//g;
    $html =~ s/\&\#.+?\;//g;

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($html);

    no warnings 'uninitialized';

    # Report the existing <title>; if there is none, fall back to the first
    # h1/h2/h3/h4/p that has text and write THAT into the document head.
    # (The original called push_content() on the plain $html string, which
    # always died inside the eval, so the fallback never took effect.)
    eval {
        my $title = substr $tree->look_down( '_tag', 'title' )->as_text, 0, 65;
        print "Title exists and is: $title.\n";
    } or do {
        my $title;
        for my $tag (qw' h1 h2 h3 h4 p ') {
            last if eval {
                $title = substr $tree->look_down( '_tag', $tag )->as_text, 0, 65;
                if ( length $title ) {
                    my $head_el  = $tree->look_down( '_tag', 'head' );
                    my $title_el = HTML::Element->new('title');
                    $title_el->push_content($title);
                    $head_el->push_content($title_el) if $head_el;
                    print "No title was found so the first $tag tag contents \n",
                          "were written to the title field in the header.\n";
                }
            }
        }
        unless ($title) {
            print "No title exists and no suitable \ntext ",
                  "was found by this bot to use as one.\n";
        }
    };

    use HTML::Summary;
    # Direct method call instead of indirect-object `new HTML::Summary`.
    my $summarizer = HTML::Summary->new(
        LENGTH   => 155,
        USE_META => 0,
    );
    my $summary      = $summarizer->generate($tree);
    my $filteredhtml = summarize( $html, filter => 'html' );
    my $summary2     = summarize( $filteredhtml, maxlength => 500 );
    $summary2 =~ s/\s+/ /gs;
    print "Using Lingua::EN::Summarize Summary: $summary\n\n";

    local $\ = $/;    # every print below gets a trailing newline

    # Inject <meta name="description" content="...summary...">.
    my $newdescription = HTML::Element->new( 'meta', 'content' => "$summary", 'name' => 'description' );
    my $head = $tree->look_down( '_tag', 'head' );
    $head->splice_content( 0, 0, $newdescription );

    # Inject <meta name="keywords" content="..."> from keyword extraction.
    my @keywords = keywords($summary);
    print "Keywords: " . join( ", ", @keywords ) . "\n\n";
    my $newkeywords = HTML::Element->new( 'meta', 'content' => join( ", ", @keywords ), 'name' => 'keywords' );
    $head->splice_content( 0, 0, $newkeywords );

    # Show images with a missing, then an empty, alt/title attribute.
    # (The original passed the tag name 'img ' with a trailing space in the
    # first two look_down calls, which can never match; fixed to 'img'.)
    print $_->as_HTML
        for $tree->look_down( '_tag', 'img', sub { not defined $_[0]->attr('alt') } );
    print $_->as_HTML
        for $tree->look_down( '_tag', 'img', sub { not defined $_[0]->attr('title') } );
    print '---';
    print $_->as_HTML
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } );
    print $_->as_HTML
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('title') } );
    print '---';

    # Fill in missing/empty alt and title attributes from the file name.
    $_->attr( alt => MAlt($_) )
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } );
    print $_->as_HTML for $tree->look_down(qw' _tag img ');
    $_->attr( title => MAlt($_) )
        for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('title') } );
    print $_->as_HTML for $tree->look_down(qw' _tag img ');

    # Save output as a file named after the spidered site's domain.
    # Capture host and path in one anchored match instead of reading $2
    # after a substitution that may not have matched (stale capture for
    # any URL that is not plain http://).
    my $base_domain_name = $url;
    my $filename         = '';
    if ( $url =~ m{^http://([^/?]+)(.*)$} ) {
        $base_domain_name = $1;
        $filename         = $2;
    }
    $filename =~ s/\//_/g;
    # If we are just an index, call it index.html
    if ( $filename =~ /^.*?_$/ ) {
        $filename .= "index.html";
    }
    $filename = 'index.html' if $filename eq '';
    # Remove the superfluous first _ from filenames.
    $filename =~ s/^_(.+)$/$1/;

    # Create the per-domain directory if it does not exist yet.
    if ( !( -d "$base_domain_name" ) ) {
        mkdir $base_domain_name or warn "mkdir $base_domain_name: $!";
    }

    # Move to the domain directory and write the rewritten page.
    print "Moving to $base_domain_name folder\n";
    print "Using filename $filename\n";
    chdir $base_domain_name or warn "chdir $base_domain_name: $!";

    # 3-arg open with a lexical handle, checked (the original used an
    # unchecked bareword 2-arg-style open).
    if ( open my $outfh, '>', $filename ) {
        print {$outfh} $tree->as_HTML;
        close $outfh or warn "close $filename: $!";
    }
    else {
        warn "Cannot write $filename: $!";
    }
    $tree = $tree->delete;

    # Move back out to the top directory.
    chdir "..";
}
# Snicklefritz($url, $linked_from, $depth) - BROKEN_LINK_CALLBACK for
# WWW::SimpleRobot: report a dead link so a human can repair it by hand.
sub Snicklefritz {
    no warnings 'uninitialized';
    my ( $bad_url, $referrer, $level ) = @_;
    print "The link $bad_url from the page $referrer at depth $level\n
appears to be broken. please repair the link manually\n";
}
# Ebola($element, $clip, $text) - append $text, clipped to at most $clip
# characters, to the given HTML::Element.  Does nothing when $text is
# undef or empty.
sub Ebola {
    my ( $element, $max_len, $snippet ) = @_;
    if ( defined $snippet and length $snippet ) {
        $element->push_content( substr $snippet, 0, $max_len );
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement