seofixer.pl
socrtwo | Nov 12th, 2011
#!/usr/bin/perl --
use warnings;
use strict;
use WWW::SimpleRobot;
use HTML::Entities;
require HTML::Parser;
use Lingua::EN::Summarize;
use HTML::TreeBuilder;
use Lingua::EN::Keywords;
use HTML::Tree;
use LWP::Simple;

# File-scoped state shared by Main() and the crawler callbacks.
my $current_url;
my $safe_domain;

Main( @ARGV );
exit( 0 );

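# Usage sketch (assumed from Main(@ARGV); the script expects full
# "http://..." URLs because it strips the first seven characters to get
# the domain):
#   perl seofixer.pl http://www.example.com/
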
sub Main {
    no warnings 'uninitialized';
    my @urls = @_;    # or hardcode them here

    # Follow only links that look like URLs (http/https/ftp, www.*,
    # or a dotted-quad IP).  Compiled with qr// so it can be passed to
    # WWW::SimpleRobot as FOLLOW_REGEX.
    my $FOLLOW = qr/(((ht|f)tp(s?):\/\/)|(www\.[^ \[\]\(\)\n\r\t]+)|(([012]?[0-9]{1,2}\.){3}[012]?[0-9]{1,2})\/)([^ \[\]\(\),;"'<>\n\r\t]+)([^\. \[\]\(\),;"'<>\n\r\t])|(([012]?[0-9]{1,2}\.){3}[012]?[0-9]{1,2})/;
    my $tmp;

    # Remember the domain of the first URL (minus the leading "http://")
    # so the visit callback can refuse to crawl off-site pages.
    $safe_domain = substr($urls[0], 7);
    if(index($safe_domain, "/") > -1) {
        $safe_domain = substr($safe_domain, 0, index($safe_domain, "/"));
    }

    foreach $tmp (@urls)
    {
        my @cur_url;
        $cur_url[0] = $tmp;
        $current_url = $tmp;
        print "current url: $current_url\n\n";

        my $robot = WWW::SimpleRobot->new(
            URLS                 => \@cur_url,
            FOLLOW_REGEX         => $FOLLOW,
            DEPTH                => 2,
            TRAVERSAL            => 'depth',
            VISIT_CALLBACK       => \&Botulism,
            BROKEN_LINK_CALLBACK => \&Snicklefritz,
        );

        eval { $robot->traverse; 1 } or warn "robot died, but we caught it: $@ ";
    }
}

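# MAlt: build fallback alt/title text for an <img> from its src filename.
# The path and extension are stripped, dashes and underscores become
# spaces, and " *" is appended to the result.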
sub MAlt {
    my $imgscalar = $_[0];
    my $imgsrc = $imgscalar->attr('src');
    use File::Basename;
    my @suffixlist = qw(.gif .jpg .jpeg .png .bmp .php .ico .GIF .JPG .JPEG .PNG .BMP .PHP .ICO);
    my $imgfilenopathnoext = fileparse($imgsrc, @suffixlist);
    $imgfilenopathnoext =~ s/[-_]/ /g;
    return $imgfilenopathnoext . ' *';
}

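# Botulism: WWW::SimpleRobot VISIT_CALLBACK.  For every on-site page it
# fills in a missing <title>, injects meta description/keywords built from
# an automatic summary, adds alt/title text to images that lack it, and
# writes the rewritten HTML to a file under a folder named for the domain.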
sub Botulism {
    my ( $url, $depth, $html, $links ) = @_;

    # Skip pages that are not on the same domain as the start URL.
    my $this_domain = substr($url, 7);
    if(index($this_domain, "/") > -1) {
        $this_domain = substr($this_domain, 0, index($this_domain, "/"));
    }

    if ($this_domain ne $safe_domain)
    {
        print "\n$url \noutside domain - not crawling";
        return;
    }

    print "\nURL: $url - depth $depth\n";
    $html = decode_entities($html);
    $html =~ s/document\.write\(.+?\)\;//g;
    $html =~ s/\&\#.+?\;//g;
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($html);
    $tree->eof();

    no warnings 'uninitialized';

    # Report the existing <title>, or synthesise one from the first
    # heading or paragraph that contains any text.
    eval {
        my $title = substr $tree->look_down( '_tag', 'title' )->as_text, 0, 65;
        print "Title exists and is: $title.\n";
    } or do {
        my $title;
        for my $tag ( qw' h1 h2 h3 h4 p ' ){
            last if eval {
                $title = substr $tree->look_down( '_tag', $tag )->as_text, 0, 65;
                if( length $title ){
                    my $title_elem = HTML::Element->new('title');
                    $title_elem->push_content($title);
                    $tree->look_down( '_tag', 'head' )->push_content($title_elem);
                    print "No title was found so the first $tag tag contents were written to the title field in the header.\n";
                }
            };
        }
        unless($title){
            print "No title exists and no suitable text was found by this bot to use as one.\n";
        }
    };

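    # Generate a short (about 155 character) summary of the page, splice it
    # into <head> as a meta description, and derive meta keywords from it.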
    use HTML::Summary;
    my $summarizer = HTML::Summary->new(
        LENGTH   => 155,
        USE_META => 0,
    );

    my $summary = $summarizer->generate( $tree );
    my $filteredhtml = summarize( $html, filter => 'html' );
    my $summary2 = summarize( $filteredhtml, maxlength => 500 );
    $summary2 =~ s/\s+/ /gs;
    # my $var = substr($summary, 0, 155);
    print "Using HTML::Summary summary: $summary\n\n";

    local $\ = $/;
    my $newdescription = HTML::Element->new('meta', 'content' => $summary, 'name' => 'description');
    my $head = $tree->look_down( '_tag', 'head' );
    $head->splice_content(0, 0, $newdescription);
    # $newdescription = $newdescription -> delete;

    # my $title = substr $tree->look_down( '_tag', 'title' )->as_text , 0, 65;

    my @keywords = keywords($summary);
    print "Keywords: " . join(", ", @keywords) . "\n\n";

    local $\ = $/;
    my $newkeywords = HTML::Element->new('meta', 'content' => join(", ", @keywords), 'name' => 'keywords');
    $head->splice_content(0, 0, $newkeywords);
    # $newkeywords = $newkeywords -> delete;

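    # List <img> tags whose alt/title attributes are missing or empty,
    # then fill them in with text derived from the image filename (MAlt).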
    local $\ = $/;
    print $_->as_HTML
      for $tree->look_down( '_tag', 'img',
        sub { not defined $_[0]->attr('alt') } );

    local $\ = $/;
    print $_->as_HTML
      for $tree->look_down( '_tag', 'img',
        sub { not defined $_[0]->attr('title') } );

    print '---';

    print $_->as_HTML
      for $tree->look_down( qw' _tag img ',
        sub { not length $_[0]->attr('alt') } );

    print $_->as_HTML
      for $tree->look_down( qw' _tag img ',
        sub { not length $_[0]->attr('title') } );

    print '---';

    $_->attr( alt => MAlt($_) )
      for $tree->look_down( qw' _tag img ',
        sub { not length $_[0]->attr('alt') } );
    print $_->as_HTML for $tree->look_down(qw' _tag img ');

    $_->attr( title => MAlt($_) )
      for $tree->look_down( qw' _tag img ',
        sub { not length $_[0]->attr('title') } );
    print $_->as_HTML for $tree->look_down(qw' _tag img ');

    # Save output as a file named after the domain and path of the spidered page
    my ( $base_domain_name, $filename ) = $url =~ m{^http://([^/?]+)(.*)$};
    $filename = '/' unless defined $filename and length $filename;
    $filename =~ s/\//_/g;

    # If we are just an index, call it index.html
    if ($filename =~ /^.*?_$/)
    {
        $filename .= "index.html";
    }

    # remove superfluous first _'s from filenames
    $filename =~ s/^_(.+)$/$1/;

    # Check if this base directory exists
    if (!(-d $base_domain_name))
    {
        mkdir $base_domain_name;
    }

    # Move to base directory and write contents to file
    print "Moving to $base_domain_name folder\n";
    print "Using filename $filename\n";

    chdir $base_domain_name or warn "Could not chdir to $base_domain_name: $!";
    if ( open( my $outfile, '>', $filename ) ) {
        print {$outfile} $tree->as_HTML;
        close($outfile);
    }
    else {
        warn "Could not write $filename: $!";
    }
    $tree = $tree->delete;

    # Move back out to top directory
    chdir "..";
}

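# Snicklefritz: WWW::SimpleRobot BROKEN_LINK_CALLBACK; it only reports the
# broken link so it can be repaired by hand.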
sub Snicklefritz {
    no warnings 'uninitialized';
    my ( $url, $linked_from, $depth ) = @_;
    print "The link $url from the page $linked_from at depth $depth appears to be broken. Please repair the link manually.\n";
}

# Ebola: helper that clips $text to $clip characters and appends it to the
# given HTML::Element.  It is not called anywhere else in this script.
sub Ebola {
    my ( $html, $clip, $text ) = @_;
    if ( defined $text and length $text ) {
        $text = substr $text, 0, $clip;
        $html->push_content( $text );
    }
}