import httplib2
import json
import math
from ratelimit import rate_limited
import re
import sys

try:
    # Python 2.6-2.7
    from HTMLParser import HTMLParser
    unescape_html = HTMLParser().unescape
except ImportError:
    # Python 3 (HTMLParser.unescape was removed in 3.9, so use html.unescape)
    from html.parser import HTMLParser
    from html import unescape as unescape_html
# A bunch of string constants used below.
# The parsed reddit JSON gets unicode strings.
uAFTER = u'after'
uAUTHOR = u'author'
uBODY = u'body'
uBODYHTML = u'body_html'
uCHILDREN = u'children'
uDATA = u'data'
uPERMALINK = u'permalink'
uREPLIES = u'replies'
uTITLE = u'title'
uSELFTEXTHTML = u'selftext_html'

COMMENTS = 'comments'
POSTS = 'posts'
REQUESTCOUNT = 'request_count'
TOTALWORDS = 'total_words'
WORDCOUNTS = 'word_counts'

REDDIT = 'https://www.reddit.com'
MAM_TOP = 'https://www.reddit.com/r/MakingaMurderer/new/.json?limit=100'

NUM_POST_PAGES = 5
NUM_TOP_WORDS = 25
# A simple HTML parser that drops text inside blockquotes.
class BlockquoteDropper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.blockquote = 0
        self.parts = []

    def handle_starttag(self, tag, attrs):
        if tag == 'blockquote':
            self.blockquote += 1

    def handle_endtag(self, tag):
        if tag == 'blockquote':
            self.blockquote -= 1

    def handle_data(self, data):
        if self.blockquote <= 0:
            self.parts.append(data)

    def get_text(self):
        return ' '.join(self.parts)
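
# A quick illustration (not part of the scraper) of what BlockquoteDropper
# keeps: feeding it '<p>my own words</p><blockquote>a quoted reply</blockquote>'
# and then calling get_text() returns 'my own words'; the quoted text is dropped.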
USER_AGENT = 'no-platform:MakingaMurdererStats:v0.01 (by /u/MaMStats)'


# This is the non-rate-limited call.
def get_url_norl(h, url):
    return h.request(url, 'GET', headers={'user-agent': USER_AGENT})


# This rate limits requests to once per second. Not sure if this is required,
# since the rate limit numbers I saw apply to OAuth calls. I also don't see
# the rate limit headers in the response. In any case, it's still polite.
@rate_limited(1)
def get_url_rl(h, url):
    return h.request(url, 'GET', headers={'user-agent': USER_AGENT})
def get_url(http_info, url):
    # If the last request came from the cache, we aren't subject to the
    # rate limit for the next call. In practice this doesn't matter for
    # reddit since the pages are all marked with a max-age of 0 (i.e. no cache).
    if http_info['last_in_cache']:
        r, content = get_url_norl(http_info['http'], url)
    else:
        r, content = get_url_rl(http_info['http'], url)
    http_info['last_in_cache'] = r.fromcache
    return (r, content)
# This gives a little feedback on the command line.
def update_counter(http_info):
    http_info[REQUESTCOUNT] += 1
    sys.stderr.write(str(http_info[REQUESTCOUNT]))
    sys.stderr.flush()
    sys.stderr.write('\r')
# Adds the words in the given piece of raw text to the author's word map.
def add_author_text(author_data, text):
    if not text:
        return
    words = re.split(r'\W+', text)
    legit_words = 0
    if WORDCOUNTS not in author_data:
        author_data[WORDCOUNTS] = {}
    for word in words:
        clean = word.strip().lower()
        if len(clean) == 0:
            continue
        legit_words += 1
        if clean not in author_data[WORDCOUNTS]:
            author_data[WORDCOUNTS][clean] = 1
        else:
            author_data[WORDCOUNTS][clean] += 1
    if TOTALWORDS not in author_data:
        author_data[TOTALWORDS] = 0
    author_data[TOTALWORDS] += legit_words
# As above, but parses the HTML and drops blockquotes. This is important
# because there's a lot of quoting of previous responses.
def add_author_html(author_data, encoded_html):
    if not encoded_html:
        return
    bd = BlockquoteDropper()
    bd.feed(unescape_html(encoded_html))
    text = bd.get_text()
    add_author_text(author_data, text)
# Read the list of posts after the specified id and previous count.
def read_post_list(http_info, author_map, after, count):
    after_param = '&after=' + after if after else ''
    url = (MAM_TOP + after_param + '&count=' + str(count) if after
           else MAM_TOP)
    r, content = get_url(http_info, url)
    update_counter(http_info)
    data = json.loads(content)
    posts = data[uDATA][uCHILDREN]
    for post in posts:
        author = post[uDATA][uAUTHOR]
        if author not in author_map:
            author_map[author] = {}
        if POSTS not in author_map[author]:
            author_map[author][POSTS] = []
        author_map[author][POSTS].append(post)
        add_author_text(author_map[author], post[uDATA][uTITLE])
        if uSELFTEXTHTML in post[uDATA]:
            add_author_html(author_map[author], post[uDATA][uSELFTEXTHTML])
    return data[uDATA][uAFTER]
# Save the comment text with the author's data.
def store_comment(author_map, comment):
    if uDATA not in comment or uAUTHOR not in comment[uDATA]:
        return
    author = comment[uDATA][uAUTHOR]
    if author not in author_map:
        author_map[author] = {}
    if COMMENTS not in author_map[author]:
        author_map[author][COMMENTS] = []
    author_map[author][COMMENTS].append(comment)
    if uBODYHTML in comment[uDATA]:
        add_author_html(author_map[author], comment[uDATA][uBODYHTML])
    elif uBODY in comment[uDATA]:
        add_author_text(author_map[author], comment[uDATA][uBODY])
    # Recurse into the replies, which are themselves a listing of comments.
    if comment[uDATA].get(uREPLIES):
        for reply in comment[uDATA][uREPLIES][uDATA][uCHILDREN]:
            store_comment(author_map, reply)
# Read the 200 newest comments for the given post.
def get_comments(http_info, post, author_map):
    # Limit to the 200 latest comments on every post.
    url = REDDIT + post[uDATA][uPERMALINK] + '.json?sort=new&limit=200'
    r, content = get_url(http_info, url)
    update_counter(http_info)
    data = json.loads(content)
    # The response is a two-element listing: [post, comments].
    if len(data) < 2:
        return
    for comment in data[1][uDATA][uCHILDREN]:
        store_comment(author_map, comment)
if __name__ == '__main__':
    h = httplib2.Http('.cache')
    http_info = {
        'http': h,
        'last_in_cache': False,
        'request_count': 0
    }
    author_map = {}

    # Read the newest NUM_POST_PAGES pages of posts, with 100 posts per page.
    count = 0
    after = None
    for i in range(0, NUM_POST_PAGES):
        after = read_post_list(http_info, author_map, after, count)
        count += 100

    # For each post, read the newest 200 comments. Iterate over a snapshot of
    # the items, since get_comments adds new authors to author_map.
    for kv in list(author_map.items()):
        for post in kv[1].get(POSTS, []):
            get_comments(http_info, post, author_map)
    # Loop over all the authors and generate the global word frequencies.
    global_freq = {}
    for kv in author_map.items():
        if WORDCOUNTS not in kv[1]:
            continue
        for wc in kv[1][WORDCOUNTS].items():
            if wc[0] not in global_freq:
                global_freq[wc[0]] = wc[1]
            else:
                global_freq[wc[0]] += wc[1]

    # Compute how distinctive each author's usage is relative to the global
    # counts: score = max(0.5, ln(author_count)) * author_count / global_count.
    author_word_freq = {}
    for kv in author_map.items():
        freq = []
        for wc in kv[1].get(WORDCOUNTS, {}).items():
            freq.append((
                max(0.5, math.log(wc[1])) * float(wc[1]) / global_freq[wc[0]],
                wc[0], wc[1]))
        # Sort each author's words by uniqueness score, descending.
        freq.sort()
        freq.reverse()
        author_word_freq[kv[0]] = freq
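
    # Illustrative worked example (numbers are made up): a word an author used
    # 8 times that appears 10 times across all authors scores
    # max(0.5, ln 8) * 8 / 10 ~= 2.08 * 0.8 ~= 1.66, while a word used once and
    # by only that author scores max(0.5, ln 1) * 1 / 1 = 0.5.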
    # Get the total words, posts, and comments for each author.
    author_totals = []
    for kv in author_map.items():
        posts = len(kv[1][POSTS]) if POSTS in kv[1] else 0
        comments = len(kv[1][COMMENTS]) if COMMENTS in kv[1] else 0
        words = kv[1][TOTALWORDS] if TOTALWORDS in kv[1] else 0
        author_totals.append((words, posts, comments, kv[0]))
    # Sort the authors by total words, descending.
    author_totals.sort()
    author_totals.reverse()

    # Print it all.
    for at in author_totals:
        author = at[3]
        print(author + ': words=' + str(at[0]) + ' posts=' + str(at[1]) +
              ' comments=' + str(at[2]))
        author_words = author_word_freq[author]
        words = []
        for i in range(0, NUM_TOP_WORDS):
            if i < len(author_words):
                words.append(author_words[i][1] + '(' + str(author_words[i][2]) + ')')
        print(' ' + ','.join(words))
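
# Usage (illustrative; assumes the script is saved as, say, mam_stats.py):
#   python mam_stats.py > word_stats.txt
# The running request counter goes to stderr; the per-author report goes to stdout.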