import httplib2
import json
import math
from ratelimit import rate_limited
import re
import sys

try:
    # Python 2.6-2.7
    from HTMLParser import HTMLParser
    unescape_html = HTMLParser().unescape
except ImportError:
    # Python 3 (HTMLParser.unescape was removed in 3.9, so use html.unescape)
    from html.parser import HTMLParser
    from html import unescape as unescape_html
# A bunch of string constants used below.
# The parsed reddit JSON gets unicode strings.
uAFTER = u'after'
uAUTHOR = u'author'
uBODY = u'body'
uBODYHTML = u'body_html'
uCHILDREN = u'children'
uDATA = u'data'
uPERMALINK = u'permalink'
uREPLIES = u'replies'
uTITLE = u'title'
uSELFTEXTHTML = u'selftext_html'

COMMENTS = 'comments'
POSTS = 'posts'
REQUESTCOUNT = 'request_count'
TOTALWORDS = 'total_words'
WORDCOUNTS = 'word_counts'

REDDIT = 'https://www.reddit.com'
MAM_TOP = 'https://www.reddit.com/r/MakingaMurderer/new/.json?limit=100'

NUM_POST_PAGES = 5
NUM_TOP_WORDS = 25
# A simple HTML parser that drops text inside blockquotes.
class BlockquoteDropper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.blockquote = 0
        self.parts = []

    def handle_starttag(self, tag, attrs):
        if tag == 'blockquote':
            self.blockquote += 1

    def handle_endtag(self, tag):
        if tag == 'blockquote':
            self.blockquote -= 1

    def handle_data(self, data):
        if self.blockquote <= 0:
            self.parts.append(data)

    def get_text(self):
        return ' '.join(self.parts)
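
# A quick illustration (not part of the scraper) of what BlockquoteDropper
# keeps: feeding it '<p>my own words</p><blockquote>a quoted reply</blockquote>'
# and then calling get_text() returns 'my own words'; the quoted text is dropped.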
USER_AGENT = 'no-platform:MakingaMurdererStats:v0.01 (by /u/MaMStats)'


# This is the non-rate-limited call.
def get_url_norl(h, url):
    return h.request(url, 'GET', headers={'user-agent': USER_AGENT})


# This rate limits requests to once per second. Not sure if this is required,
# since the rate limit numbers I saw apply to OAuth calls. I also don't see
# the rate limit headers in the response. In any case, it's still polite.
@rate_limited(1)
def get_url_rl(h, url):
    return h.request(url, 'GET', headers={'user-agent': USER_AGENT})
def get_url(http_info, url):
    # If the last request came from the cache, we aren't subject to the
    # rate limit for the next call. In practice this doesn't matter for
    # reddit since the pages are all marked with a max-age of 0 (i.e. no cache).
    if http_info['last_in_cache']:
        r, content = get_url_norl(http_info['http'], url)
    else:
        r, content = get_url_rl(http_info['http'], url)
    http_info['last_in_cache'] = r.fromcache
    return (r, content)
# This gives a little feedback on the command line.
def update_counter(http_info):
    http_info[REQUESTCOUNT] += 1
    sys.stderr.write(str(http_info[REQUESTCOUNT]))
    sys.stderr.flush()
    sys.stderr.write('\r')
# Adds the words in the given piece of raw text to the author's word map.
def add_author_text(author_data, text):
    if not text:
        return
    words = re.split(r'\W+', text)
    legit_words = 0
    if WORDCOUNTS not in author_data:
        author_data[WORDCOUNTS] = {}
    for word in words:
        clean = word.strip().lower()
        if len(clean) == 0:
            continue
        legit_words += 1
        if clean not in author_data[WORDCOUNTS]:
            author_data[WORDCOUNTS][clean] = 1
        else:
            author_data[WORDCOUNTS][clean] += 1
    if TOTALWORDS not in author_data:
        author_data[TOTALWORDS] = 0
    author_data[TOTALWORDS] += legit_words
# As above, but parses the HTML and drops blockquotes. This is important
# because there's a lot of quoting of previous responses.
def add_author_html(author_data, encoded_html):
    if not encoded_html:
        return
    bd = BlockquoteDropper()
    bd.feed(unescape_html(encoded_html))
    text = bd.get_text()
    add_author_text(author_data, text)
# Read the list of posts after the specified id and previous count.
def read_post_list(http_info, author_map, after, count):
    after_param = '&after=' + after if after else ''
    url = (MAM_TOP + after_param + '&count=' + str(count) if after
           else MAM_TOP)
    r, content = get_url(http_info, url)
    update_counter(http_info)
    data = json.loads(content)
    posts = data[uDATA][uCHILDREN]
    for post in posts:
        author = post[uDATA][uAUTHOR]
        if author not in author_map:
            author_map[author] = {}
        if POSTS not in author_map[author]:
            author_map[author][POSTS] = []
        author_map[author][POSTS].append(post)
        add_author_text(author_map[author], post[uDATA][uTITLE])
        if uSELFTEXTHTML in post[uDATA]:
            add_author_html(author_map[author], post[uDATA][uSELFTEXTHTML])
    return data[uDATA][uAFTER]
# Save the comment text with the author's data.
def store_comment(author_map, comment):
    if uDATA not in comment or uAUTHOR not in comment[uDATA]:
        return
    author = comment[uDATA][uAUTHOR]
    if author not in author_map:
        author_map[author] = {}
    if COMMENTS not in author_map[author]:
        author_map[author][COMMENTS] = []
    author_map[author][COMMENTS].append(comment)
    if uBODYHTML in comment[uDATA]:
        add_author_html(author_map[author], comment[uDATA][uBODYHTML])
    elif uBODY in comment[uDATA]:
        add_author_text(author_map[author], comment[uDATA][uBODY])
    # Recurse into the replies, which are themselves a listing of comments.
    if comment[uDATA].get(uREPLIES):
        for reply in comment[uDATA][uREPLIES][uDATA][uCHILDREN]:
            store_comment(author_map, reply)
# Read the 200 newest comments for the given post.
def get_comments(http_info, post, author_map):
    # Limit to the 200 latest comments on every post.
    url = REDDIT + post[uDATA][uPERMALINK] + '.json?sort=new&limit=200'
    r, content = get_url(http_info, url)
    update_counter(http_info)
    data = json.loads(content)
    # The response is a two-element listing: [post, comments].
    if len(data) < 2:
        return
    for comment in data[1][uDATA][uCHILDREN]:
        store_comment(author_map, comment)
if __name__ == '__main__':
    h = httplib2.Http('.cache')
    http_info = {
        'http': h,
        'last_in_cache': False,
        'request_count': 0
    }
    author_map = {}

    # Read the newest NUM_POST_PAGES pages of posts, with 100 posts per page.
    count = 0
    after = None
    for i in range(0, NUM_POST_PAGES):
        after = read_post_list(http_info, author_map, after, count)
        count += 100

    # For each post, read the newest 200 comments. Iterate over a snapshot of
    # the items, since get_comments adds new authors to author_map.
    for kv in list(author_map.items()):
        for post in kv[1].get(POSTS, []):
            get_comments(http_info, post, author_map)
    # Loop over all the authors and generate the global word frequencies.
    global_freq = {}
    for kv in author_map.items():
        if WORDCOUNTS not in kv[1]:
            continue
        for wc in kv[1][WORDCOUNTS].items():
            if wc[0] not in global_freq:
                global_freq[wc[0]] = wc[1]
            else:
                global_freq[wc[0]] += wc[1]

    # Compute how distinctive each author's usage is relative to the global
    # counts: score = max(0.5, ln(author_count)) * author_count / global_count.
    author_word_freq = {}
    for kv in author_map.items():
        freq = []
        for wc in kv[1].get(WORDCOUNTS, {}).items():
            freq.append((
                max(0.5, math.log(wc[1])) * float(wc[1]) / global_freq[wc[0]],
                wc[0], wc[1]))
        # Sort each author's words by uniqueness score, descending.
        freq.sort()
        freq.reverse()
        author_word_freq[kv[0]] = freq
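
    # Illustrative worked example (numbers are made up): a word an author used
    # 8 times that appears 10 times across all authors scores
    # max(0.5, ln 8) * 8 / 10 ~= 2.08 * 0.8 ~= 1.66, while a word used once and
    # by only that author scores max(0.5, ln 1) * 1 / 1 = 0.5.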
    # Get the total words, posts, and comments for each author.
    author_totals = []
    for kv in author_map.items():
        posts = len(kv[1][POSTS]) if POSTS in kv[1] else 0
        comments = len(kv[1][COMMENTS]) if COMMENTS in kv[1] else 0
        words = kv[1][TOTALWORDS] if TOTALWORDS in kv[1] else 0
        author_totals.append((words, posts, comments, kv[0]))
    # Sort the authors by total words, descending.
    author_totals.sort()
    author_totals.reverse()

    # Print it all.
    for at in author_totals:
        author = at[3]
        print(author + ': words=' + str(at[0]) + ' posts=' + str(at[1]) +
              ' comments=' + str(at[2]))
        author_words = author_word_freq[author]
        words = []
        for i in range(0, NUM_TOP_WORDS):
            if i < len(author_words):
                words.append(author_words[i][1] + '(' + str(author_words[i][2]) + ')')
        print(' ' + ','.join(words))
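
# Usage (illustrative; assumes the script is saved as, say, mam_stats.py):
#   python mam_stats.py > word_stats.txt
# The running request counter goes to stderr; the per-author report goes to stdout.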