Advertisement
Quiv

Postcount User Word Count

Jul 1st, 2015
297
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 6.73 KB | None | 0 0
  1. #!/usr/bin/perl
  2. use strict;
  3. use warnings;  
  4. use LWP::UserAgent;
  5. use HTTP::Request::Common qw(GET);
  6. use List::Util;
  7. #
  8. # Globals
  9. #
  10. ##############################################
  11. # Change these two values for specified user
  12. my $g_targetUser = "Quiv";    # Case sensitive
  13. my $g_id         = "1098787"; # User ID taken from recent posts page. vBulletin changes it every two hours, wtf
  14. ##############################################
  15. my $g_start      = time;
  16. my $g_baseIndex  = "http://www.postcount.net/forum/search.php?searchid="."$g_id"."&pp=&page="; # Recent posts page of target
  17. my %g_wordCount  = ();
  18. my %g_threads    = ();
  19. my $g_debug      = 0;
  20. my $g_ua         = LWP::UserAgent->new;            # UA Object
  21. #
  22. # Prototypes
  23. #
  24. sub cleanup;
  25. sub remove_grammar;
  26. sub remove_common_words;
  27. sub parse_thread;
  28. #
  29. # Get threads
  30. #
  31. for (my $pageNumber = 0; $pageNumber < 21; ++$pageNumber)
  32. {
  33.     my $req      = GET $g_baseIndex.$pageNumber;    
  34.     my $response = $g_ua->request($req);             # HTTP:Response object
  35.    
  36.      # If valid, begin parsing content
  37.     my $count = 0;
  38.     if ($response->is_success)
  39.     {
  40.         my @lines = split/\n/, $response->content;
  41.         foreach (@lines)
  42.         {
  43.             if (/<h2>Thread:\s<a\shref=\"(http.*\d{2,7})-.*\"\stitle=/)
  44.             {
  45.                 ++$g_threads{$1};
  46.                 ++$count;
  47.             }
  48.         }
  49.     }
  50.     last if !$count;
  51. }
  52. print "\n\n";
  53. my @aThreads = keys(%g_threads);
  54. if ($g_debug){ print "Unique thread: $_\n" foreach (@aThreads); }
  55. #
  56. # Begin parsing threads
  57. #
  58. foreach (@aThreads)
  59. {
  60.     parse_thread($_);
  61. }
  62.  
  63. my $counts = "$g_targetUser"."_words.txt";
  64. my $cloud  = "$g_targetUser"."_cloud.txt";
  65. open(my $fh, '>', $counts) or warn "Could not open file '$counts' $!";
  66. open(my $ph, '>', $cloud) or warn "Could not open file '$cloud' $!";
  67. foreach my $word (sort { $g_wordCount{$b} <=> $g_wordCount{$a} } keys %g_wordCount)
  68. {
  69.     print $fh "$word => $g_wordCount{$word}\n";
  70.     for (my $i = 0; $i < $g_wordCount{$word}; ++$i)
  71.     {
  72.         print $ph "$word ";
  73.     }
  74.     print $ph "\n";
  75. }
#
#
#
######################################################################################
# Subroutines
######################################################################################
  82. #
  83. # Parses threads and collects posts
  84. #
  85. sub parse_thread()
  86. {
  87.     my $url = $_;
  88.     my $current = 1;
  89.     my $page = 1;
  90.     while ($current <= $page)
  91.     {
  92.         my $threadId     = "$url/page$current";
  93.         my $inPost       = 0;
  94.         my $foundTarget  = 0;
  95.         my $targetString = "alt=\"$g_targetUser";
  96.         my $quoteDepth   = 0;
  97.         my $req          = GET $threadId;                        
  98.         my $response     = $g_ua->request($req);             # Make the request
  99.         my @lines        = split/\n/, $response->content;
  100.         print "Beginning parse for $threadId\n";
  101.         foreach (@lines)
  102.         {
  103.             if (/<span\sclass=\"prev_next\"><a\srel=\"next\"\shref=\"(http.*\/page(\d{1,3}))(&|\s).*\stitle=\"Next\sPage/)
  104.             {
  105.                 print "\n\nFound next page: $1\n";
  106.                 $page = $2;
  107.             }
  108.             if (/<\/blockquote/ and $inPost == 1)
  109.             {
  110.                 $inPost = 0;
  111.                 $foundTarget = 0;
  112.                 #print "\n\n\n";
  113.                 #print "End of post\n";
  114.             }
  115.             if ($inPost)
  116.             {
  117.                 ++$quoteDepth if (/<div\sclass=\"message">/); # Track quotes
  118.                 # Count words if they're not part of a quote
  119.                 if ($quoteDepth < 1)
  120.                 {
  121.                     $_ = cleanup();               # Language filter and HTML cleanup
  122.                     $_ = remove_grammar();        # Remove grammar, special characters, and numbers
  123.                     $_ = uc($_);                  # Make all characters uppercase
  124.                     $_ = remove_common_words($_); # Pull out common words
  125.                     print "$_";
  126.                     # Count words
  127.                     my @words = split/\s/, $_;
  128.                     foreach my $word (@words)
  129.                     {
  130.                         $word =~ s/\s//g;
  131.                         $g_wordCount{$word}++ if (length($word) > 0);
  132.                     }
  133.                 }
  134.                 --$quoteDepth if (/<\/div>/ and $quoteDepth > 0); # Track quotes
  135.             }
  136.             $foundTarget = 1 if (/$targetString/);
  137.             $inPost = 1 if (/<blockquote\sclass="postcontent\srestore\s">/ and $foundTarget);
  138.         }
  139.         ++$current;
  140.     }
  141. }
  142. #
  143. # HTML and language filter cleanup
  144. #
  145. sub cleanup()
  146. {
  147.     s/\$\#\@\!ing/fucking/g;
  148.     s/\$\#\@\!/shit/g;
  149.     s/\&quot\;/\"/g;
  150.     s/\&gt\;/\>/g;
  151.     s/\&lt\;/\</g;
  152.     s/\&amp\;/\&/g;
  153.     s/<br\s\/>/\n/g;
  154.     s/<\!--\sEND\sTEMPLATE\:.*-->//g;#\sbbcode_quote\s-->//g;
  155.     s/.*<\!--\sBEGIN\sTEMPLATE\:.*-->//g;#\sbbcode_quote\s-->.*//g;
  156.     s/<div\sclass=.*//g;
  157.     s/<img\ssrc=\".*//g;
  158.     s/<a\shref=\".*//g;
  159.     s/<\/div>//g;
  160.     s/<b>//g;
  161.     s/<\/b>//g;
  162.  
  163.     $_;
  164. }
  165. #
  166. # Remove grammar and special characters from post
  167. #
  168. sub remove_grammar()
  169. {
  170.     s/\./ /g;
  171.     s/\,/ /g;
  172.     s/\;/ /g;
  173.     s/\"/ /g;
  174.     s/\:/ /g;
  175.     s/\'s/ /g;
  176.     s/\'//g;
  177.     s/\?/ /g;
  178.     s/\!/ /g;
  179.     s/\)/ /g;
  180.     s/\(/ /g;
  181.     s/\// /g;
  182.     s/\\/ /g;
  183.     s/\&/ /g;
  184.     s/\$/ /g;
  185.     s/\@/ /g;
  186.     s/\#/ /g;
  187.     s/\%/ /g;
  188.     s/\^/ /g;
  189.     s/\*/ /g;
  190.     s/\~/ /g;
  191.     s/\`/ /g;
  192.     s/\|/ /g;
  193.     s/\[/ /g;
  194.     s/\]/ /g;
  195.     s/\|/ /g;
  196.     s/\-/ /g;
  197.     s/\_/ /g;
  198.     s/\{/ /g;
  199.     s/\}/ /g;
  200.     s/</ /g;
  201.     s/>/ /g;
  202.     s/=/ /g;
  203.     s/\+/ /g;
  204.     s/\d{1,1000}/ /g;
  205.  
  206.     $_;
  207. }
  208. #
  209. # Remove common words (and, the, an, etc)
  210. #
  211. sub remove_common_words()
  212. {
  213.     s/(^|\s)I($|\s)/ /g;
  214.     s/(^|\s)YOU($|\s)/ /g;
  215.     s/(^|\s)YOURE($|\s)/ /g;
  216.     s/(^|\s)IT($|\s)/ /g;
  217.     s/(^|\s)AND($|\s)/ /g;
  218.     s/(^|\s)THIS($|\s)/ /g;
  219.     s/(^|\s)ON($|\s)/ /g;
  220.     s/(^|\s)THE($|\s)/ /g;
  221.     s/(^|\s)IN($|\s)/ /g;
  222.     s/(^|\s)IF($|\s)/ /g;
  223.     s/(^|\s)TO($|\s)/ /g;
  224.     s/(^|\s)IS($|\s)/ /g;
  225.     s/(^|\s)A($|\s)/ /g;
  226.     s/(^|\s)AS($|\s)/ /g;
  227.     s/(^|\s)SO($|\s)/ /g;
  228.     s/(^|\s)HE($|\s)/ /g;
  229.     s/(^|\s)SHE($|\s)/ /g;
  230.     s/(^|\s)WE($|\s)/ /g;
  231.     s/(^|\s)FOR($|\s)/ /g;
  232.     s/(^|\s)THAT($|\s)/ /g;
  233.     s/(^|\s)ARE($|\s)/ /g;
  234.     s/(^|\s)WILL($|\s)/ /g;
  235.     s/(^|\s)WITH($|\s)/ /g;
  236.     s/(^|\s)WHAT($|\s)/ /g;
  237.     s/(^|\s)HAVE($|\s)/ /g;
  238.     s/(^|\s)WAS($|\s)/ /g;
  239.     s/(^|\s)INTO($|\s)/ /g;
  240.     s/(^|\s)OF($|\s)/ /g;
  241.     s/(^|\s)AT($|\s)/ /g;
  242.     s/(^|\s)AN($|\s)/ /g;
  243.     s/(^|\s)OR($|\s)/ /g;
  244.     s/(^|\s)DO($|\s)/ /g;
  245.     s/(^|\s)BE($|\s)/ /g;
  246.     s/(^|\s)ME($|\s)/ /g;
  247.     s/(^|\s)BY($|\s)/ /g;
  248.     s/(^|\s)MY($|\s)/ /g;
  249.     s/(^|\s)\d($|\s)/ /g;
  250.  
  251.     $_;
  252. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement