import re
import requests
from time import time, sleep
from collections import defaultdict
from difflib import get_close_matches

subDict = defaultdict(int)
# Matches "r/<name>" or "/r/<name>", tolerating a stray space after "r/"; subreddit names are 3-21 word characters
pat = re.compile(r"\/?r\/ ?((\w|_){3,21})")
unsorted = []

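# Quick sanity check (hypothetical title, not taken from the data): on a lowercased title like
# "just unsubbed from r/askreddit because of reposts", pat.search(title) gives
# group(0) == "r/askreddit" and group(1) == "askreddit".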
# Span of time to grab posts is 12 hours, starting from now
timestep = 12*60*60
before = int(time())

sub = "JustUnsubbed"

hits = 0
while True:
    # Adjust the timeframe to be the 12 hours prior to the "before" timestamp
    after = before - timestep
    # If the API has rate-limited us, wait a few seconds before querying again
    while "Too Many Requests" in (r := requests.get(f"https://api.pushshift.io/reddit/search/submission/?subreddit={sub}&sort=desc&sort_type=created_utc&after={after}&before={before}&size=1000")).text:
        print("Timed out")
        sleep(5)

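    # The Pushshift response is assumed to be JSON of the form
    # {"data": [{"title": ..., "created_utc": ..., ...}, ...]},
    # which is all this script relies on below.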
    posts = r.json()["data"]

    # Stop once a window comes back empty
    if not posts:
        break

    for post in posts:
        title = post["title"].lower()
        regexResult = pat.search(title)

        # If the title names a subreddit (prefixed by r/, with some optional leniency to catch as many subs as possible)
        if regexResult:
            # Print the grabbed subreddit name, because words scrolling by on a console is cool B)
            print(regexResult.group(0))

            # Ignore r/justunsubbed itself, because a significant number of people post
            # "I just r/unsubbed from r/<subreddit>", which would skew the counts
            if regexResult.group(1) != 'justunsubbed':
                subDict[regexResult.group(1)] += 1
            hits += 1
        else:
            # Otherwise the sub can't be immediately parsed; log it for manual review
            print(f"MISSED: {title}")
            unsorted.append(title)

        print("----------------------")

    # Set the end of the timeframe to the oldest post in the list
    before = posts[-1]["created_utc"]
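    # Illustration with made-up timestamps: if the oldest post in this batch has
    # created_utc == 1579000000, the next request covers roughly 1578956800..1579000000
    # (the 12 hours before it), so the scan walks backwards through the subreddit's history.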


print(f"got {hits} matches and {len(unsorted)} misses")

# Merge near-duplicate keys into one count, based on the string similarity of a given key to its neighbors
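# For reference, difflib.get_close_matches ranks candidates by SequenceMatcher ratio; with
# illustrative values, get_close_matches("dankmemes", ["dankmeme", "memes", "pics"], cutoff=.8)
# returns ["dankmeme"], so "dankmeme" would be offered as a merge candidate while "memes" falls below the cutoff.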
delete = []
for referenceKey in subDict.keys():
    if referenceKey not in delete:
        # Grab all close keys that haven't already been merged and aren't the referenceKey itself
        matches = get_close_matches(referenceKey, [key for key in subDict.keys() if key != referenceKey and key not in delete], cutoff=.8)
        print(matches)
        for match in matches:
            answer = input(f"Is {match} probably {referenceKey}? ")
            # Any non-empty answer marks the matched key for deletion and folds its count into the reference
            if answer != "":
                delete.append(match)
                subDict[referenceKey] += subDict[match]

# Build a new dict without the deleted items and replace the original
newDict = {}
for k, v in subDict.items():
    if k not in delete:
        newDict[k] = v
subDict = newDict

# Sort the dict by count and write it to a CSV for Excel
with open("out.csv", "w") as f:
    for k in sorted(subDict, key=subDict.get, reverse=True):
        f.write(f"{k},{subDict[k]}\n")
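# out.csv ends up as one "subreddit,count" line per entry, most-mentioned first, e.g.
# (made-up numbers):
#   askreddit,42
#   dankmemes,37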

# Write out all of the titles that couldn't be parsed automatically
with open("fails.txt", "w", encoding="utf8") as f:
    f.writelines([l + '\n' for l in unsorted])