Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- use strict;
- use warnings;
- use LWP::UserAgent;
- use HTTP::Request::Common qw(GET);
- use List::Util;
#
# Globals
#
##############################################
# Per-run configuration: change these two values for the user to analyse
my $g_targetUser = "Quiv";       # Forum user name; matched case-sensitively
my $g_id         = "1098787";    # searchid taken from the user's recent posts page (the original author notes vBulletin changes it every two hours)
##############################################
my $g_start = time;              # Script start time in epoch seconds (not read elsewhere in the visible code)
# Recent posts search URL of the target; the page number is appended per request
my $g_baseIndex = "http://www.postcount.net/forum/search.php?searchid=${g_id}&pp=&page=";
my %g_wordCount;                 # word => number of occurrences
my %g_threads;                   # thread URL => number of times it was listed
my $g_debug = 0;                 # Non-zero prints every unique thread URL
my $g_ua    = LWP::UserAgent->new;    # Shared HTTP user agent object
#
# Forward declarations of the subroutines defined at the end of the file
#
sub cleanup;
sub remove_grammar;
sub remove_common_words;
sub parse_thread;
#
# Get threads
#
# Walks the target's search-result pages (page numbers 0 through 20) and
# records every thread URL found in %g_threads. The scan stops early at the
# first page that cannot be fetched or that lists no threads.
PAGE:
for my $pageNumber (0 .. 20)
{
    my $response = $g_ua->request(GET($g_baseIndex . $pageNumber));    # HTTP::Response object
    if (!$response->is_success)
    {
        # Say why the scan ends instead of stopping silently
        warn "Could not fetch $g_baseIndex$pageNumber: " . $response->status_line . "\n";
        last PAGE;
    }
    # Number of thread links seen on this page
    my $count = 0;
    foreach my $line (split /\n/, $response->content)
    {
        # Capture the thread URL up to and including its numeric id
        if ($line =~ /<h2>Thread:\s<a\shref=\"(http.*\d{2,7})-.*\"\stitle=/)
        {
            ++$g_threads{$1};
            ++$count;
        }
    }
    # A page without thread links means there is nothing further to read
    last PAGE if !$count;
}
print "\n\n";
my @aThreads = keys(%g_threads);
if ($g_debug)
{
    print "Unique thread: $_\n" foreach (@aThreads);
}
#
# Begin parsing threads
#
parse_thread($_) foreach @aThreads;
#
# Write the reports: "<user>_words.txt" holds one "WORD => count" line per
# word, "<user>_cloud.txt" holds each word repeated count times on its own line.
# A file that cannot be opened is reported once and skipped; the other file is
# still written.
#
my $counts = "$g_targetUser"."_words.txt";
my $cloud = "$g_targetUser"."_cloud.txt";
my $countsOpened = open(my $fh, '>', $counts);
warn "Could not open file '$counts' $!" if !$countsOpened;
my $cloudOpened = open(my $ph, '>', $cloud);
warn "Could not open file '$cloud' $!" if !$cloudOpened;
# Most frequent words first; equal counts are ordered alphabetically so the
# output is the same from run to run
foreach my $word (sort { $g_wordCount{$b} <=> $g_wordCount{$a} or $a cmp $b } keys %g_wordCount)
{
    my $occurrences = $g_wordCount{$word};
    print {$fh} "$word => $occurrences\n" if $countsOpened;
    print {$ph} "$word " x $occurrences, "\n" if $cloudOpened;
}
# Buffered write errors only surface at close, so check it
if ($countsOpened)
{
    close($fh) or warn "Could not close file '$counts' $!";
}
if ($cloudOpened)
{
    close($ph) or warn "Could not close file '$cloud' $!";
}
#
#
#
######################################################################################
#Subroutines
######################################################################################
#
# Parses threads and collects posts
#
# Fetches a thread page by page and adds the words of the target user's posts
# to %g_wordCount.
#
#   $url - base URL of the thread; "/page<N>" is appended for each request.
#          When called without arguments the current value of $_ is used.
#
# Further pages are discovered through the "Next Page" link of each fetched
# page. A page that cannot be fetched is reported with warn and skipped.
# Returns nothing.
#
sub parse_thread
{
    my $url = @_ ? $_[0] : $_;
    my $page = 1;    # Highest page number known to exist so far
    # \Q..\E keeps regex metacharacters in the user name literal
    my $targetRe = qr/alt=\"\Q$g_targetUser\E/;
    for (my $current = 1; $current <= $page; ++$current)
    {
        my $threadId = "$url/page$current";
        my $inPost = 0;         # Inside a post body of the target user
        my $foundTarget = 0;    # Target's marker seen since the last post ended
        my $quoteDepth = 0;     # Greater than zero while inside quoted text
        my $response = $g_ua->request(GET($threadId));    # Make the request
        print "Beginning parse for $threadId\n";
        if (!$response->is_success)
        {
            warn "Could not fetch $threadId: " . $response->status_line . "\n";
            next;
        }
        foreach my $line (split /\n/, $response->content)
        {
            # A "Next Page" link carries the number of the following page
            if ($line =~ /<span\sclass=\"prev_next\"><a\srel=\"next\"\shref=\"(http.*\/page(\d{1,3}))(&|\s).*\stitle=\"Next\sPage/)
            {
                print "\n\nFound next page: $1\n";
                $page = $2;
            }
            # The closing blockquote ends the current post
            if ($inPost and $line =~ /<\/blockquote/)
            {
                $inPost = 0;
                $foundTarget = 0;
            }
            if ($inPost)
            {
                ++$quoteDepth if $line =~ /<div\sclass=\"message">/;    # Track quotes
                # Count words if they're not part of a quote
                if ($quoteDepth < 1)
                {
                    # The filter subs read $_, so alias it to a private copy of the line
                    for (my $text = $line)
                    {
                        $_ = cleanup();                  # Language filter and HTML cleanup
                        $_ = remove_grammar();           # Remove grammar, special characters, and numbers
                        $_ = uc($_);                     # Make all characters uppercase
                        $_ = remove_common_words($_);    # Pull out common words
                        print "$_";
                        # Count words
                        foreach my $word (split ' ', $_)
                        {
                            $g_wordCount{$word}++;
                        }
                    }
                }
                --$quoteDepth if $quoteDepth > 0 and $line =~ /<\/div>/;    # Track quotes
            }
            $foundTarget = 1 if $line =~ $targetRe;
            if (!$inPost and $foundTarget and $line =~ /<blockquote\sclass="postcontent\srestore\s">/)
            {
                $inPost = 1;
                # Start every post with a clean quote state so an unbalanced
                # quote in an earlier post cannot hide this one
                $quoteDepth = 0;
            }
        }
    }
    return;
}
#
# HTML and language filter cleanup
#
# Undoes the forum's language filter, decodes the &quot; &gt; &lt; &amp;
# entities, turns <br /> into a newline and strips template comments,
# div/img/a markup (through the end of the line) and bold tags.
#
#   cleanup($text) - returns the cleaned copy of $text
#   cleanup()      - cleans $_ in place and returns it
#
sub cleanup
{
    my $text = @_ ? $_[0] : $_;
    for ($text)
    {
        # Language filter
        s/\$\#\@\!ing/fucking/g;
        s/\$\#\@\!/shit/g;
        # HTML entities; &amp; goes last so its output is not decoded twice
        s/&quot;/"/g;
        s/&gt;/>/g;
        s/&lt;/</g;
        s/&amp;/&/g;
        s/<br\s\/>/\n/g;
        # Template markers written into the page as HTML comments
        s/<\!--\sEND\sTEMPLATE\:.*-->//g;
        s/.*<\!--\sBEGIN\sTEMPLATE\:.*-->//g;
        # These three drop everything from the tag to the end of the line
        s/<div\sclass=.*//g;
        s/<img\ssrc=\".*//g;
        s/<a\shref=\".*//g;
        s/<\/div>//g;
        s/<b>//g;
        s/<\/b>//g;
    }
    $_ = $text unless @_;    # A bare call edits $_ in place
    return $text;
}
#
# Remove grammar and special characters from post
#
# Replaces punctuation and special characters with a space, drops apostrophes
# (a possessive 's becomes a space) and replaces every run of digits with a
# single space.
#
#   remove_grammar($text) - returns the stripped copy of $text
#   remove_grammar()      - strips $_ in place and returns it
#
sub remove_grammar
{
    my $text = @_ ? $_[0] : $_;
    for ($text)
    {
        s/[.,;":]/ /g;
        # Apostrophes come before the digit step: removing one can join two digit runs
        s/\'s/ /g;
        s/\'//g;
        s/[?!()\/\\&\$\@#%^*~`|\[\]\-_{}<>=+]/ /g;
        s/\d{1,1000}/ /g;
    }
    $_ = $text unless @_;    # A bare call edits $_ in place
    return $text;
}
#
# Remove common words (and, the, an, etc)
#
# Replaces every whitespace-delimited stop word, and every lone digit, with a
# single space. The words are matched in upper case only, so the text is
# expected to be upper-cased before the call. The whitespace around a removed
# word is left alone, so a word repeated back to back ("THE THE") is removed
# both times.
#
#   remove_common_words($text) - returns the filtered copy of $text
#   remove_common_words()      - filters $_ in place and returns it
#
sub remove_common_words
{
    my $text = @_ ? $_[0] : $_;
    $text =~ s{
        (?<!\S)    # start of the text or just after whitespace
        (?: I | YOU | YOURE | IT | AND | THIS | ON | THE | IN | IF | TO | IS
          | A | AS | SO | HE | SHE | WE | FOR | THAT | ARE | WILL | WITH | WHAT
          | HAVE | WAS | INTO | OF | AT | AN | OR | DO | BE | ME | BY | MY
          | \d )
        (?!\S)     # end of the text or just before whitespace
    }{ }gx;
    $_ = $text unless @_;    # A bare call edits $_ in place
    return $text;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement