Postcount User Word Count

#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request::Common qw(GET);
use List::Util;
#
# Globals
#
##############################################
# Edit these two values to point the script at a different user.
my $g_targetUser = 'Quiv';     # Forum user name; matched case-sensitively
my $g_id         = '1098787';  # Value sent as searchid= below, copied from the
                               # user's recent posts page (original note:
                               # vBulletin changes it every two hours)
##############################################
# Script start time in epoch seconds (not read again in the visible code).
my $g_start = time();

# Recent-posts search URL of the target; the page number is appended to it.
my $g_baseIndex = join '',
    'http://www.postcount.net/forum/search.php?searchid=',
    $g_id,
    '&pp=&page=';

# word => number of occurrences, and thread URL => number of times seen.
my (%g_wordCount, %g_threads);

# Set to a true value to list every unique thread URL that was found.
my $g_debug = 0;

# Single user agent shared by every HTTP request the script makes.
my $g_ua = LWP::UserAgent->new();
#
# Prototypes
#
# Forward declarations of the subroutines defined at the bottom of this file.
sub cleanup;              # HTML and language-filter cleanup of one line
sub remove_grammar;       # strips punctuation, special characters and digits
sub remove_common_words;  # drops common words (and, the, an, etc)
sub parse_thread;         # downloads a thread and counts the target's words
#
# Get threads
#
# Walks the target's recent-posts search result pages (page numbers 0
# through 20) and records every thread URL they link to as a key of
# %g_threads, so each thread is remembered once however often it appears.
# The scan stops at the first page that cannot be downloaded or that
# contains no thread links.
#
for my $pageNumber (0 .. 20)
{
    my $pageUrl  = $g_baseIndex.$pageNumber;
    my $response = $g_ua->request(GET($pageUrl));    # HTTP::Response object

    # Say why the scan stops instead of ending it without a trace.
    if (!$response->is_success)
    {
        warn "Could not fetch $pageUrl: ", $response->status_line, "\n";
        last;
    }

    # Number of thread links found on this page.
    my $count = 0;
    foreach my $line (split /\n/, $response->content)
    {
        # $1 is the thread URL up to and including its numeric id; the
        # "-title-slug" that follows the id is left out.
        if ($line =~ /<h2>Thread:\s<a\shref=\"(http.*\d{2,7})-.*\"\stitle=/)
        {
            ++$g_threads{$1};
            ++$count;
        }
    }

    # A page without thread links means the result list is exhausted.
    last if !$count;
}
print "\n\n";

# The keys of %g_threads are the distinct thread URLs collected above.
my @aThreads = keys %g_threads;
if ($g_debug)
{
    foreach my $threadUrl (@aThreads)
    {
        print "Unique thread: $threadUrl\n";
    }
}
#
# Begin parsing threads
#
# The postfix "for" keeps $_ aliased to the current URL during each call,
# and the URL is passed explicitly as well.
parse_thread($_) for @aThreads;

# Write the results, most frequent word first (ties in alphabetical order so
# the output is the same from run to run):
#   <user>_words.txt - one "WORD => count" line per word
#   <user>_cloud.txt - one line per word, the word repeated count times
my $counts = "$g_targetUser"."_words.txt";
my $cloud  = "$g_targetUser"."_cloud.txt";

# A file that cannot be opened is reported once and then skipped, so the
# other file is still written and no print is attempted on a dead handle.
my ($fh, $ph);
open($fh, '>', $counts) or do { warn "Could not open file '$counts' $!"; undef $fh; };
open($ph, '>', $cloud)  or do { warn "Could not open file '$cloud' $!";  undef $ph; };

foreach my $word (sort { $g_wordCount{$b} <=> $g_wordCount{$a} or $a cmp $b } keys %g_wordCount)
{
    my $occurrences = $g_wordCount{$word};
    print {$fh} "$word => $occurrences\n"      if $fh;
    print {$ph} "$word " x $occurrences, "\n"  if $ph;
}

# Buffered write errors only surface at close time, so check it.
if ($fh) { close($fh) or warn "Could not close file '$counts' $!"; }
if ($ph) { close($ph) or warn "Could not close file '$cloud' $!"; }
#
#
#
######################################################################################
#Subroutines
######################################################################################
#
# Parses threads and collects posts
#
# Downloads one thread page by page ("<url>/page<N>"), following the
# "Next Page" links, and adds every word the target user wrote outside of
# quotes to %g_wordCount.  The cleaned-up text of each counted line is
# also printed to STDOUT.
#
# Arguments:
#   $url - base URL of the thread.  When the sub is called without an
#          argument the URL is taken from $_ instead.
# Returns: nothing.
#
# Uses the globals $g_ua, $g_targetUser and %g_wordCount.
#
sub parse_thread
{
    my $url  = @_ ? $_[0] : $_;
    my $page = 1;    # highest page number known to exist so far

    # The user name is literal text, so quote any regex metacharacters.
    # NOTE(review): this is a prefix match; a user whose name merely starts
    # with the target name would match too - confirm against the markup.
    my $targetRe = qr/alt=\"\Q$g_targetUser\E/;

    for (my $current = 1; $current <= $page; ++$current)
    {
        my $threadId = "$url/page$current";
        my $response = $g_ua->request(GET($threadId));    # Make the request
        print "Beginning parse for $threadId\n";

        # Do not try to parse an error page as if it were thread content.
        if (!$response->is_success)
        {
            warn "Could not fetch $threadId: ", $response->status_line, "\n";
            next;
        }

        # Per-page parser state.
        my $inPost      = 0;    # inside the body of a post by the target
        my $foundTarget = 0;    # the target's name was seen before the body
        my $quoteDepth  = 0;    # > 0 while inside quoted text

        foreach my $line (split /\n/, $response->content)
        {
            # $1 is the URL of the next page, $2 its page number.
            if ($line =~ /<span\sclass=\"prev_next\"><a\srel=\"next\"\shref=\"(http.*\/page(\d{1,3}))(&|\s).*\stitle=\"Next\sPage/)
            {
                print "\n\nFound next page: $1\n";
                $page = $2;
            }

            # The closing blockquote ends the current post.
            if ($inPost and $line =~ /<\/blockquote/)
            {
                $inPost      = 0;
                $foundTarget = 0;
            }

            if ($inPost)
            {
                ++$quoteDepth if $line =~ /<div\sclass=\"message">/;    # Track quotes

                # Count words if they're not part of a quote
                if ($quoteDepth < 1)
                {
                    # Run the filters on a private copy.  The helpers work
                    # on $_, so alias $_ to the copy while they run.
                    my $text = $line;
                    for ($text)
                    {
                        $_ = cleanup();               # Language filter and HTML cleanup
                        $_ = remove_grammar();        # Remove grammar, special characters, and numbers
                        $_ = uc($_);                  # Make all characters uppercase
                        $_ = remove_common_words($_); # Pull out common words
                    }
                    print "$text";

                    # split ' ' skips leading whitespace and never yields
                    # empty fields, so every element is a word.
                    $g_wordCount{$_}++ foreach split ' ', $text;
                }

                --$quoteDepth if $quoteDepth > 0 and $line =~ /<\/div>/;  # Track quotes
            }

            $foundTarget = 1 if $line =~ $targetRe;
            $inPost      = 1 if $foundTarget and $line =~ /<blockquote\sclass="postcontent\srestore\s">/;
        }
    }

    return;
}
#
# HTML and language filter cleanup
#
# Undoes the forum's language filter, decodes the &quot; &gt; &lt; &amp;
# entities, turns <br /> into a newline and strips template markers and
# some markup.  Text that follows a <div class=, <img src=" or <a href="
# on the same line is removed together with the tag.
#
# Arguments:
#   $text - optional; the line to clean.  Without an argument the sub
#           works on $_ and also stores the result back into $_.
# Returns: the cleaned text (undef if the input was undef).
#
sub cleanup
{
    my $inPlace = !@_;
    my $text    = $inPlace ? $_ : $_[0];
    return $text unless defined $text;

    # Alias $_ to the private copy so the substitutions stay short.
    for ($text)
    {
        # Language filter: the longer form must be handled first.
        s/\$\#\@\!ing/fucking/g;
        s/\$\#\@\!/shit/g;

        # Entities; &amp; goes last so "&amp;lt;" is not decoded twice.
        s/&quot;/"/g;
        s/&gt;/>/g;
        s/&lt;/</g;
        s/&amp;/&/g;

        s{<br\s/>}{\n}g;

        # Template markers and markup.
        s/<\!--\sEND\sTEMPLATE\:.*-->//g;
        s/.*<\!--\sBEGIN\sTEMPLATE\:.*-->//g;
        s/<div\sclass=.*//g;
        s/<img\ssrc=\".*//g;
        s/<a\shref=\".*//g;
        s{</div>}{}g;
        s{</?b>}{}g;
    }

    $_ = $text if $inPlace;
    return $text;
}
#
# Remove grammar and special characters from post
#
# Replaces punctuation, special characters and runs of digits with a space
# so that only words separated by whitespace remain.  A lowercase "'s"
# becomes a space; any other apostrophe is deleted (don't -> dont).
#
# Arguments:
#   $text - optional; the text to strip.  Without an argument the sub
#           works on $_ and also stores the result back into $_.
# Returns: the stripped text (undef if the input was undef).
#
sub remove_grammar
{
    my $inPlace = !@_;
    my $text    = $inPlace ? $_ : $_[0];
    return $text unless defined $text;

    $text =~ s/'s/ /g;    # "'s" turns into a word break ...
    $text =~ tr/'//d;     # ... every other apostrophe simply disappears

    # Each of these characters becomes one space.  tr does not interpolate,
    # so $ and @ are literal here.
    $text =~ tr/.,;":?!()\/\\&$@#%^*~`|[]\-_{}<>=+/ /;

    $text =~ s/\d+/ /g;   # a run of digits collapses to a single space

    $_ = $text if $inPlace;
    return $text;
}
#
# Remove common words (and, the, an, etc)
#
# Replaces every stand-alone common word, and every stand-alone single
# digit, with a space.  The word list is uppercase, so the text is expected
# to be uppercased already.  A word is stand-alone when it is delimited by
# whitespace or the start/end of the text; the delimiters are only looked
# at, not consumed, so repeated common words ("A A A") are all removed.
#
# Arguments:
#   $text - optional; the text to filter.  Without an argument the sub
#           works on $_ and also stores the result back into $_.
# Returns: the filtered text (undef if the input was undef).
#
sub remove_common_words
{
    my $inPlace = !@_;
    my $text    = $inPlace ? $_ : $_[0];
    return $text unless defined $text;

    $text =~ s{
        (?<!\S)                 # preceded by whitespace or start of text
        (?: I    | YOU  | YOURE | IT   | AND  | THIS | ON   | THE  | IN
          | IF   | TO   | IS    | A    | AS   | SO   | HE   | SHE  | WE
          | FOR  | THAT | ARE   | WILL | WITH | WHAT | HAVE | WAS  | INTO
          | OF   | AT   | AN    | OR   | DO   | BE   | ME   | BY   | MY
          | \d
        )
        (?!\S)                  # followed by whitespace or end of text
    }{ }gx;

    $_ = $text if $inPlace;
    return $text;
}