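# Scrapes the newest posts from /r/MakingaMurderer via reddit's public JSON
# endpoints, pulls the newest comments on each post, and prints per-author
# word counts plus each author's most distinctive words relative to the
# subreddit as a whole.
#
# This is a Python 2 script (it uses xrange) and needs the httplib2 and
# ratelimit packages. Assuming the file is saved as mam_stats.py, it can be
# run with something like:
#
#   pip install httplib2 ratelimit
#   python2 mam_stats.py
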
import httplib2
import json
import math
from ratelimit import rate_limited
import re
import sys
try:
  # Python 2.6-2.7
  from HTMLParser import HTMLParser
except ImportError:
  # Python 3
  from html.parser import HTMLParser

# A bunch of string constants used below.

# The parsed reddit JSON gets unicode strings.
uAFTER = u'after'
uAUTHOR = u'author'
uBODY = u'body'
uBODYHTML = u'body_html'
uCHILDREN = u'children'
uDATA = u'data'
uPERMALINK = u'permalink'
uREPLIES = u'replies'
uTITLE = u'title'
uSELFTEXTHTML = u'selftext_html'

COMMENTS = 'comments'
POSTS = 'posts'
REQUESTCOUNT = 'request_count'
TOTALWORDS = 'total_words'
WORDCOUNTS = 'word_counts'
REDDIT = 'https://www.reddit.com'
MAM_TOP = 'https://www.reddit.com/r/MakingaMurderer/new/.json?limit=100'
NUM_POST_PAGES = 5
NUM_TOP_WORDS = 25

# A simple HTML Parser that drops text in blockquotes.
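# For example (illustrative only): feeding
# '<p>reply text</p><blockquote>quoted</blockquote>' and then calling
# get_text() yields 'reply text', because anything inside <blockquote> tags
# is skipped.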
class BlockquoteDropper(HTMLParser):
  def __init__(self):
    HTMLParser.__init__(self)
    self.blockquote = 0
    self.parts = []

  def handle_starttag(self, tag, attrs):
    if tag == 'blockquote':
      self.blockquote += 1

  def handle_endtag(self, tag):
    if tag == 'blockquote':
      self.blockquote -= 1

  def handle_data(self, data):
    if self.blockquote <= 0:
      self.parts.append(data)

  def get_text(self):
    return ' '.join(self.parts)


# This is the non-rate-limited call.
def get_url_norl(h, url):
  return h.request(url, 'GET',
                   headers={'user-agent':
                            'no-platform:MakingaMurdererStats:v0.01 ' +
                            '(by /u/MaMStats)'})

# This rate limits to once per second.  Not sure if this is required since
# the rate limit numbers I saw apply to OAuth calls.  I also don't see the
# rate limit headers in the response.  In any case, it's still polite.
@rate_limited(1)
def get_url_rl(h, url):
  return h.request(url, 'GET',
                   headers={'user-agent':
                            'no-platform:MakingaMurdererStats:v0.01 ' +
                            '(by /u/MaMStats)'})

def get_url(http_info, url):
  # If we got the last request from cache, then we aren't subject to the
  # rate limit for the next call.  In practice, this doesn't matter for
  # reddit since the pages are all marked with a max-age of 0 (i.e. no cache).
  if http_info['last_in_cache']:
    r, content = get_url_norl(http_info['http'], url)
  else:
    r, content = get_url_rl(http_info['http'], url)

  http_info['last_in_cache'] = r.fromcache
  return (r, content)

# This gives a little feedback on the command line.
def update_counter(http_info):
  http_info[REQUESTCOUNT] = http_info[REQUESTCOUNT] + 1
  sys.stderr.write(str(http_info[REQUESTCOUNT]))
  sys.stderr.flush()
  sys.stderr.write('\r')

# Adds the words in the given piece of raw text to the author's word map.
def add_author_text(author_data, text):
  if not text:
    return
  words = re.split(r'\W+', text)
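  # Note: \W+ splits on any run of non-alphanumeric characters, so a
  # contraction like "don't" is counted as two words ('don' and 't').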

  legit_words = 0
  if not WORDCOUNTS in author_data:
    author_data[WORDCOUNTS] = {}
  for word in words:
    clean = word.strip().lower()
    if len(clean) == 0:
      continue

    legit_words += 1
    if not clean in author_data[WORDCOUNTS]:
      author_data[WORDCOUNTS][clean] = 1
    else:
      author_data[WORDCOUNTS][clean] += 1

  if not TOTALWORDS in author_data:
    author_data[TOTALWORDS] = 0
  author_data[TOTALWORDS] += legit_words

# As above, but parses HTML and drops blockquotes.  This is important because
# there's a lot of quoting of previous responses.
def add_author_html(author_data, encoded_html):
  if not encoded_html:
    return
  bd = BlockquoteDropper()
  bd.feed(bd.unescape(encoded_html))
  text = bd.get_text()
  add_author_text(author_data, text)

# Read the next page of posts: 'after' is the id to continue from and 'count'
# is the number of posts already fetched.
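# Reddit listing endpoints return JSON shaped like
# {'data': {'children': [...], 'after': '<id>'}}; passing that 'after' value
# (plus the running 'count') back in the query string fetches the next page.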
def read_post_list(http_info, author_map, after, count):
  after_param = '&after=' + after if after else ''
  url = (MAM_TOP + after_param + '&count=' + str(count) if after else
         MAM_TOP)
  r, content = get_url(http_info, url)
  update_counter(http_info)
  data = json.loads(content)

  posts = data[uDATA][uCHILDREN]
  for post in posts:
    author = post[uDATA][uAUTHOR]
    if not author in author_map:
      author_map[author] = {}
    if not POSTS in author_map[author]:
      author_map[author][POSTS] = []
    author_map[author][POSTS].append(post)
    add_author_text(author_map[author], post[uDATA][uTITLE])
    if uSELFTEXTHTML in post[uDATA]:
      add_author_html(author_map[author], post[uDATA][uSELFTEXTHTML])

  return data[uDATA][uAFTER]

# Save the comment text with the author's data.
def store_comment(author_map, comment):
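  # Comment listings can also contain 'more' stubs for collapsed replies;
  # those have no author in their data, so skip anything without one.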
  if not uDATA in comment or not uAUTHOR in comment[uDATA]:
    return

  author = comment[uDATA][uAUTHOR]
  if not author in author_map:
    author_map[author] = {}
  if not COMMENTS in author_map[author]:
    author_map[author][COMMENTS] = []
  author_map[author][COMMENTS].append(comment)
  if uBODYHTML in comment[uDATA]:
    add_author_html(author_map[author], comment[uDATA][uBODYHTML])
  elif uBODY in comment[uDATA]:
    add_author_text(author_map[author], comment[uDATA][uBODY])

  if comment[uDATA][uREPLIES]:
    for reply in comment[uDATA][uREPLIES][uDATA][uCHILDREN]:
      store_comment(author_map, reply)

# Read the newest 200 comments for the given post.
def get_comments(http_info, post, author_map):
  # Limit to the 200 latest comments on every post
  url = REDDIT + post[uDATA][uPERMALINK] + '.json?sort=new&limit=200'
  r, content = get_url(http_info, url)
  update_counter(http_info)
  data = json.loads(content)

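  # The permalink JSON is a two-element array: element 0 is a listing holding
  # the post itself, element 1 is the listing of its comments.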
  if len(data) < 2:
    return

  for comment in data[1][uDATA][uCHILDREN]:
    store_comment(author_map, comment)


if __name__ == '__main__':
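  # httplib2.Http('.cache') caches responses on disk in a .cache directory;
  # reddit serves these pages with max-age=0, so in practice nothing is reused.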
  h = httplib2.Http('.cache')
  http_info = {
    'http': h,
    'last_in_cache': False,
    'request_count': 0
  }
  author_map = {}

  # Read the newest NUM_POST_PAGES pages of posts, with 100 posts per page.
  count = 0
  after = None
  for i in xrange(0, NUM_POST_PAGES):
    after = read_post_list(http_info, author_map, after, count)
    count += 100

  # For each post, read the newest 200 comments.
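  # In Python 2, items() returns a list copy, so it is safe for get_comments
  # to add newly seen comment authors to author_map while this loop runs.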
  for kv in author_map.items():
    for post in kv[1][POSTS]:
      get_comments(http_info, post, author_map)

  # Loop over all the authors and generate the global word frequencies.
  global_freq = {}
  for kv in author_map.items():
    if not WORDCOUNTS in kv[1]:
      continue

    for wc in kv[1][WORDCOUNTS].items():
      if not wc[0] in global_freq:
        global_freq[wc[0]] = wc[1]
      else:
        global_freq[wc[0]] += wc[1]

  # Score how distinctive each author's word usage is relative to the
  # global frequencies.
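  # Each word's score is max(0.5, log(author_count)) * author_count / global_count:
  # the share of the word's subreddit-wide occurrences contributed by this
  # author, weighted so that words used only once or twice count less and
  # heavily repeated words count more. Tuples are (score, word, author_count).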
  author_word_freq = {}
  for kv in author_map.items():
    freq = []
    # .get() keeps authors with no counted words from raising a KeyError.
    for wc in kv[1].get(WORDCOUNTS, {}).items():
      freq.append((
        max(0.5, math.log(wc[1])) * float(wc[1]) / global_freq[wc[0]], wc[0],
        wc[1]))

    # Sort each author's words by uniqueness score, descending.
    freq.sort()
    freq.reverse()
    author_word_freq[kv[0]] = freq

  # Get the total words, posts, and comments for each author.
  author_totals = []
  for kv in author_map.items():
    posts = len(kv[1][POSTS]) if POSTS in kv[1] else 0
    comments = len(kv[1][COMMENTS]) if COMMENTS in kv[1] else 0
    words = kv[1][TOTALWORDS] if TOTALWORDS in kv[1] else 0
    author_totals.append((words, posts, comments, kv[0]))

  # This sorts by total words, with ties broken by posts and then comments.
  author_totals.sort()
  author_totals.reverse()

  # Print it all.
  for at in author_totals:
    author = at[3]
    print (author + ': words=' + str(at[0]) + ' posts=' + str(at[1]) +
           ' comments=' + str(at[2]))
    author_words = author_word_freq[author]
    words = []
    for i in xrange(0, NUM_TOP_WORDS):
      if i < len(author_words):
        words.append(author_words[i][1] + '(' + str(author_words[i][2]) + ')')
    print ('  ' + ','.join(words))
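
# Example of the printed output (author names and counts are illustrative):
#   some_redditor: words=1234 posts=5 comments=67
#     word1(12),word2(9),word3(7),...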