Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
from collections import *
from difflib import get_close_matches
from time import *

import requests
# Tally of subreddit name -> number of "just unsubbed" mentions seen.
subDict = defaultdict(int)
# Match "r/<name>" (also "/r/<name>" and the occasional "r/ <name>") and
# capture the subreddit name. NOTE: \w already includes "_", so the original
# (\w|_) alternation was redundant; \w{3,21} matches exactly the same 3-21
# character names (Reddit's length limits), and "/" needs no escaping.
pat = re.compile(r"/?r/ ?(\w{3,21})")
# Titles where no subreddit could be parsed out of the post title.
unsorted = []
# Span of time to grab posts is 12 hours, starting from now.
timestep = 12 * 60 * 60
before = int(time())
sub = "JustUnsubbed"
grabbing = True
hits = 0
# Walk backwards through the subreddit's history in 12-hour windows,
# tallying which subreddits people say they unsubscribed from.
while True:
    # Adjust timeframe to be the 12 hours prior to "before" timestamp
    after = before - timestep
    url = (
        "https://api.pushshift.io/reddit/search/submission/"
        f"?subreddit={sub}&sort=desc&sort_type=created_utc"
        f"&after={after}&before={before}&size=1000"
    )
    # If the api has timed me out, wait a few seconds before querying again.
    # NOTE(review): rate limiting is detected via the response body text;
    # the timeout keeps an unresponsive endpoint from hanging the script.
    while "Too Many Requests" in (r := requests.get(url, timeout=30)).text:
        print("Timed out")
        sleep(5)
    posts = r.json()["data"]
    # Empty window -> we've walked past the oldest post; stop paging.
    if not posts:
        break
    for post in posts:
        title = post["title"].lower()
        regexResult = pat.search(title)
        # If there is a subreddit (prefixed by r/, with some optional syntax
        # sugar to catch as many subs as possible)
        if regexResult:
            # Print the grabbed subreddit name, because words scrolling by
            # on a console is cool B)
            print(regexResult.group(0))
            # Ignore r/justunsubbed because a significant number of people
            # post "I just r/unsubbed from r/<subreddit>", causing error in
            # the data
            if regexResult.group(1) != 'justunsubbed':
                subDict[regexResult.group(1)] += 1
                hits += 1
        else:
            # Otherwise, the sub can't be immediately parsed. Log it
            print(f"MISSED: {title}")
            unsorted.append(title)
    print("----------------------")
    # Set the end of the timeframe to the oldest post in the list
    before = posts[-1]["created_utc"]
print(f"got {hits} matches and {len(unsorted)} misses")
# Compose the dict based on the edit distance of a given key to its
# neighbors: keys that look like misspellings of each other are merged,
# with the user confirming each candidate merge interactively.
# (get_close_matches comes from difflib — it was never in collections.)
delete = set()  # keys confirmed as duplicates, to be folded away; a set
                # makes the repeated membership tests below O(1)
for referenceKey in subDict.keys():
    if referenceKey not in delete:
        # Grab all close keys that haven't already been composed and
        # aren't the reference key itself
        candidates = [
            key
            for key in subDict.keys()
            if key != referenceKey and key not in delete
        ]
        matches = get_close_matches(referenceKey, candidates, cutoff=.8)
        print(matches)
        for match in matches:
            answer = input(f"Is {match} probably {referenceKey}?")
            # Any non-empty reply counts as "yes": fold the matched key's
            # count into the reference and mark the match for deletion
            if answer != "":
                delete.add(match)
                subDict[referenceKey] += subDict[match]
# Rebuild the dict without the merged-away keys and replace it
subDict = {k: v for k, v in subDict.items() if k not in delete}
# Rank the tallies largest-first and dump them as CSV for Excel.
ranking = sorted(subDict.items(), key=lambda item: item[1], reverse=True)
with open("out.csv", "w") as f:
    for name, count in ranking:
        f.write(f"{name},{count}\n")
# Record every title the regex failed to parse, one per line.
with open("fails.txt", "w", encoding="utf8") as f:
    f.write("".join(line + '\n' for line in unsorted))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement