import re
import requests
from time import time, sleep
from collections import defaultdict
from difflib import get_close_matches

subDict = defaultdict(int)
# Matches "r/<name>" or "/r/<name>", tolerating a stray space after "r/"; subreddit names are 3-21 word characters
pat = re.compile(r"\/?r\/ ?((\w|_){3,21})")
unsorted = []

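# Quick sanity check (hypothetical title, not taken from the data): on a lowercased title like
# "just unsubbed from r/askreddit because of reposts", pat.search(title) gives
# group(0) == "r/askreddit" and group(1) == "askreddit".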
# Span of time to grab posts is 12 hours, starting from now
timestep = 12*60*60
before = int(time())

sub = "JustUnsubbed"

hits = 0
while True:
    # Adjust the timeframe to be the 12 hours prior to the "before" timestamp
    after = before - timestep
    # If the API has rate-limited us, wait a few seconds before querying again
    while "Too Many Requests" in (r := requests.get(f"https://api.pushshift.io/reddit/search/submission/?subreddit={sub}&sort=desc&sort_type=created_utc&after={after}&before={before}&size=1000")).text:
        print("Timed out")
        sleep(5)

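    # The Pushshift response is assumed to be JSON of the form
    # {"data": [{"title": ..., "created_utc": ..., ...}, ...]},
    # which is all this script relies on below.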
    posts = r.json()["data"]

    # Stop once a window comes back empty
    if not posts:
        break

    for post in posts:
        title = post["title"].lower()
        regexResult = pat.search(title)

        # If the title names a subreddit (prefixed by r/, with some optional leniency to catch as many subs as possible)
        if regexResult:
            # Print the grabbed subreddit name, because words scrolling by on a console is cool B)
            print(regexResult.group(0))

            # Ignore r/justunsubbed itself, because a significant number of people post
            # "I just r/unsubbed from r/<subreddit>", which would skew the counts
            if regexResult.group(1) != 'justunsubbed':
                subDict[regexResult.group(1)] += 1
            hits += 1
        else:
            # Otherwise the sub can't be immediately parsed; log it for manual review
            print(f"MISSED: {title}")
            unsorted.append(title)

        print("----------------------")

    # Set the end of the timeframe to the oldest post in the list
    before = posts[-1]["created_utc"]
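    # Illustration with made-up timestamps: if the oldest post in this batch has
    # created_utc == 1579000000, the next request covers roughly 1578956800..1579000000
    # (the 12 hours before it), so the scan walks backwards through the subreddit's history.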


print(f"got {hits} matches and {len(unsorted)} misses")

# Merge near-duplicate keys into one count, based on the string similarity of a given key to its neighbors
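# For reference, difflib.get_close_matches ranks candidates by SequenceMatcher ratio; with
# illustrative values, get_close_matches("dankmemes", ["dankmeme", "memes", "pics"], cutoff=.8)
# returns ["dankmeme"], so "dankmeme" would be offered as a merge candidate while "memes" falls below the cutoff.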
delete = []
for referenceKey in subDict.keys():
    if referenceKey not in delete:
        # Grab all close keys that haven't already been merged and aren't the referenceKey itself
        matches = get_close_matches(referenceKey, [key for key in subDict.keys() if key != referenceKey and key not in delete], cutoff=.8)
        print(matches)
        for match in matches:
            answer = input(f"Is {match} probably {referenceKey}? ")
            # Any non-empty answer marks the matched key for deletion and folds its count into the reference
            if answer != "":
                delete.append(match)
                subDict[referenceKey] += subDict[match]

# Build a new dict without the deleted items and replace the original
newDict = {}
for k, v in subDict.items():
    if k not in delete:
        newDict[k] = v
subDict = newDict

# Sort the dict by count and write it to a CSV for Excel
with open("out.csv", "w") as f:
    for k in sorted(subDict, key=subDict.get, reverse=True):
        f.write(f"{k},{subDict[k]}\n")
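# out.csv ends up as one "subreddit,count" line per entry, most-mentioned first, e.g.
# (made-up numbers):
#   askreddit,42
#   dankmemes,37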

# Write out all of the titles that couldn't be parsed automatically
with open("fails.txt", "w", encoding="utf8") as f:
    f.writelines([l + '\n' for l in unsorted])