Advertisement
erktheerk

overlap

Jun 3rd, 2015
324
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.12 KB | None | 0 0
  1. import praw
  2. import traceback
  3. import time
  4. import types
  5. import os
  6.  
''' CONFIG '''

# Useragent sent to reddit; identifies the script per reddit API rules.
USERAGENT = 'Looking for overlapping subreddits users also post /u/erktheerk'
# Maximum items to pull per listing request (new posts / comments).
MAXPOSTS = 100

DROPOUT = [404]
# This error code will cause the nofailrequest to exit.
# There's no reason to repeatedly request a 404.

''' END CONFIG '''

# Optionally override the useragent from a local `bot` module, if one
# exists next to this script; otherwise keep the default above.
try:
    import bot
    USERAGENT = bot.uG
except ImportError:
    pass

# Global praw session shared by every function below.
r = praw.Reddit(USERAGENT)
  25.  
  26. def nfr2(function, *fargs, **fkwargs):
  27.     '''
  28.     Different version of NFR.
  29.     The first was having problems with generators and lazyload
  30.     objects, because those functions return successfully
  31.     even though the data isn't checked
  32.     '''
  33.     while True:
  34.         try:
  35.             results = function(*fargs, **fkwargs)
  36.             if isinstance(results, types.GeneratorType):
  37.                 results = list(results)
  38.             return results
  39.         except praw.requests.exceptions.HTTPError as e:
  40.             if e.response.status_code == DROPOUT:
  41.                 return None
  42.             if isinstance(DROPOUT, list) and e.response.status_code in DROPOUT:
  43.                 return None
  44.             traceback.print_exc()
  45.             print('Retrying in 2...')
  46.             time.sleep(2)
  47.         except KeyboardInterrupt:
  48.             return None
  49.         except:
  50.             traceback.print_exc()
  51.             print('Retrying in 2...')
  52.             time.sleep(2)
  53.  
  54. def nfr(function, dropout=None):
  55.     '''
  56.     "No Fail Request"
  57.     Creates a function that will retry until it succeeds.
  58.     This function accepts 1 parameter, a function, and returns a modified
  59.     version of that function that will try-catch, sleep, and loop until it
  60.     finally returns.
  61.     '''
  62.     def b():
  63.         traceback.print_exc()
  64.         print('Retrying in 2...')
  65.         time.sleep(2)
  66.     def a(*args, **kwargs):
  67.         while True:
  68.             try:
  69.                 result = function(*args, **kwargs)
  70.                 return result
  71.             except praw.requests.exceptions.HTTPError as e:
  72.                 if e.response.status_code == dropout:
  73.                     return None
  74.                 if isinstance(dropout, list) and e.response.status_code in dropout:
  75.                     return None
  76.                 else:
  77.                     b()
  78.             except requests.exceptions.ConnectionError:
  79.                 b()
  80.             except AssertionError:
  81.                 # Strange PRAW bug causes certain MoreComments
  82.                 # To throw assertion error, so just ignore it
  83.                 # And get onto the next one.
  84.                 return []
  85.             except KeyboardInterrupt:
  86.                 raise Exception("KeyboardInterrupt")
  87.             except:
  88.                 b()
  89.     return a
  90.  
  91. def get_subreddit_authors(sr):
  92.     '''
  93.     Given a subreddit name, go to /r/subreddit/new
  94.     and /r/subreddit/comments, and return the names of post
  95.     authors.
  96.     '''
  97.     sr = sr.lower()
  98.     subreddit = nfr(r.get_subreddit)(sr)
  99.     print('/r/%s/new' % sr)
  100.     #posts = list(nfr(subreddit.get_new)(limit=MAXPOSTS))
  101.     posts = nfr2(subreddit.get_new, limit=MAXPOSTS)
  102.     print('/r/%s/comments' % sr)
  103.     #posts += list(nfr(subreddit.get_comments)(limit=MAXPOSTS))
  104.     posts += nfr2(subreddit.get_comments, limit=MAXPOSTS)
  105.  
  106.     authors = [post.author.name for post in posts if post.author is not None]
  107.     authors = list(set(authors))
  108.     authors.sort(key=lambda x: x.lower())
  109.     print('Found %d authors' % len(authors))
  110.     return authors
  111.  
  112. def process_userlist(authors, fromsubreddit=''):
  113.     '''
  114.     Given a list of usernames, put each into process_user()
  115.     and collect a total dictionary of subreddits
  116.  
  117.     If this list of names comes from scanning a subreddit, you
  118.     can provide `fromsubreddit`, which will be removed from the dict
  119.     at the end, since it's useless data if everyone has it in common.
  120.     '''
  121.     authors = list(set(authors))
  122.     fromsubreddit = fromsubreddit.lower()
  123.     count = len(authors)
  124.     i = 1
  125.     userreddits = {}
  126.     totalreddits = {}
  127.     for username in authors:
  128.         pre = '(%0{l}d/%0{l}d) '.format(l=len(str(count))) % (i, count)
  129.         thisuser = process_user(username, pre=pre)
  130.         userreddits[username] = thisuser
  131.         for sub in thisuser:
  132.             totalreddits[sub] = totalreddits.get(sub, 0) + thisuser[sub]
  133.         #print(totalreddits)
  134.         i += 1
  135.  
  136.     if fromsubreddit in totalreddits:
  137.         del totalreddits[fromsubreddit]
  138.     # -1 because of %totalposts%
  139.     totalreddits['%totalsubs%'] = (len(totalreddits) - 1)
  140.     return totalreddits
  141.  
  142. def process_subreddit(sr):
  143.     '''
  144.     Given a subreddit name, collect authors from submissions
  145.     and comments, then pass them into process_userlist
  146.     '''
  147.     authors = get_subreddit_authors(sr)
  148.     results = process_userlist(authors, fromsubreddit=sr)
  149.     return results
  150.  
  151. def process_user(username, pre=''):
  152.     '''
  153.     Given a username, go to /u/username/submitted
  154.     and /u/username/comments, and return the names
  155.     of subreddits he has posted to, with their frequencies
  156.     '''
  157.     user = nfr(r.get_redditor, dropout=404)(username)
  158.     if user is None:
  159.         return {}
  160.     print('\t%s/u/%s/submitted' % (pre, username))
  161.     #userposts = list(nfr(user.get_submitted)(limit=MAXPOSTS))
  162.     userposts = nfr2(user.get_submitted, limit=MAXPOSTS)
  163.     print('\t%s/u/%s/comments' % (pre, username))
  164.     #userposts += list(nfr(user.get_comments)(limit=MAXPOSTS))
  165.     userposts += nfr2(user.get_comments, limit=MAXPOSTS)
  166.  
  167.     userreddits = {'%totalposts%':len(userposts)}
  168.     for post in userposts:
  169.         subreddit = post.subreddit.display_name.lower()
  170.         userreddits[subreddit] = userreddits.get(subreddit, 0) + 1
  171.  
  172.     return userreddits
  173.  
  174. def write_json(filename, totalreddits):
  175.     '''
  176.     Given a dictionary totalreddits, sort by freq
  177.     and write it to filename.json
  178.     '''
  179.     if filename[-5:] != '.json':
  180.         filename += combined.json'
  181.     keys = list(totalreddits.keys())
  182.     keys.sort(key=lambda x: (totalreddits.get(x), x.lower()), reverse=True)
  183.  
  184.     print('Creating %s' % filename)
  185.     outfile = open(filename, 'w')
  186.     outfile.write('{\n')
  187.     for key in keys:
  188.         val = totalreddits[key]
  189.         outfile.write('\t"%s" : %d,\n' % (key, val))
  190.     outfile.write('}')
  191.     outfile.close()
  192.  
  193. def process_and_write(sr):
  194.     '''
  195.     shortcut to process_subreddit and write_json
  196.     '''
  197.     totalreddits = process_subreddit(sr)
  198.     write_json(sr, totalreddits)
  199.  
  200. def file_lines(filename):
  201.     textfile = open(filename, 'r')
  202.     textlines = [line.strip() for line in textfile.readlines()]
  203.     textfile.close()
  204.     return textlines
  205.  
  206. def process_subfile(filename):
  207.     '''
  208.     Shortcut to open a txt file containing subreddit names
  209.     automatically put each one into process_and_write
  210.     '''
  211.     sublines = file_lines(filename)
  212.  
  213.     for subname in sublines:
  214.         process_and_write(subname)
  215.  
  216. def process_userfile(filename, jsonfilename):
  217.     '''
  218.     Shortcut to open a txt file containing user names
  219.     automatically put each one into process_userlist
  220.  
  221.     jsonfilename is required since we don't have any subreddit
  222.     to go off of.
  223.     '''
  224.     userlines = file_lines(filename)
  225.  
  226.     for username in userlines:
  227.         results = process_userlist(userlines)
  228.         write_json(jsonfilename, results)
  229.  
  230. if __name__ == '__main__':
  231.     #process_and_write('goldtesting')
  232.     #--Run on a single subreddit--
  233.     #
  234.     sublist = open('sublistsmaller.txt', 'r')
  235.     lines = [line.strip() for line in sublist.readlines()]
  236.     for line in lines:
  237.         process_and_write(line)
  238.     os._exit(0)
  239.     #--Run from a sublist.txt file--
  240.     #
  241.     #import sqlite3
  242.     #sql = sqlite3.connect('coontown.db')
  243.     #cur = sql.cursor()
  244.     #cur.execute('SELECT author FROM posts WHERE author != "[deleted]"')
  245.     #authors = [x[0] for x in cur.fetchall()]
  246.     #
  247.     #results = process_userlist(authors)
  248.     #write_json('subreddit', results)
  249.     #--Run with DB file from--
  250.     #
  251.     #process_userfile('coontownusers.txt', 'coontowndeepscan.json')
  252.     #--Run from userlist.txt file--
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement