Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
#! /usr/bin/env python2.7
"""This application will go through all the comments on a specific subreddit and parse all
the comments and save them to a local text file"""
import threading
import praw
import time
import cleaner
import glob
# Refer to the praw documentation for more info on client_id and client_secret (API keys).
# NOTE(review): credentials are hard-coded placeholders -- load them from praw.ini,
# environment variables, or a config file instead of committing them.
r = praw.Reddit(username = "xxxxxxxxx", password = "xxxxxxxx", client_id = "xxxxxxxx", client_secret = "xxxxxxxx", user_agent = "Subreddit Comment parser")
print ("Logging in...")
# Number of completed passes over the subreddit list (incremented by the driver loop below).
counter = 0
# Ids of comments already written to disk, so repeated passes skip duplicates.
cache = []
#this will get all the comments for the last 500 posts for that specific subreddit.
def run_bot(subredditName):
    """Parse every comment of subredditName's hot submissions into a text file.

    Walks the comment tree of up to 500 "hot" submissions breadth-first and
    appends each not-yet-seen comment body (lower-cased) to
    "<subredditName>.txt". Seen comment ids are recorded in the module-level
    ``cache`` so repeated passes never write the same comment twice.

    :param subredditName: bare subreddit name, without the "/r/" prefix.
    """
    print("Parsing comments from " + subredditName)
    subreddit = r.subreddit(subredditName)
    fileName = subredditName + ".txt"
    print("Grabbing comments from " + subredditName)
    submissions = subreddit.hot(limit=500)
    for submission in submissions:
        # Resolve all "load more comments" stubs so the full tree is available.
        submission.comments.replace_more(limit=0)
        comment_queue = submission.comments[:]
        # Open the output file once per submission instead of once per comment.
        with open(fileName, "a") as myfile:
            while comment_queue:
                comment = comment_queue.pop(0)
                # BUGFIX: enqueue the replies too -- the original only visited
                # top-level comments, contradicting the "all comments" intent.
                comment_queue.extend(comment.replies)
                comment_text = comment.body.lower()
                if comment.id not in cache:
                    print(comment_text)
                    try:
                        myfile.write(" " + comment_text + "\n")
                    except UnicodeEncodeError:
                        # Python 2 can fail to encode non-ASCII bodies; skip them.
                        pass
                    cache.append(comment.id)
#this is a 10 minute break to give reddit servers a break. THe action above will be executed 1000 times, but duplicates will be ignored using the cache variable.
# Subreddits to crawl; bare names only, no "/r/" prefix.
subredditsToParse = ["trumpgret", "sandersforpresident", "funny", "wholesomememes", "news", "todayilearned", "interestingasfuck", "wtf", "gifs",\
"highqualitygifs", "jokes", "the_donald", "keepournetfree", "nintendoswitch", "atbge", "mildlyinfuriating", \
"rage", "blackpeoplegifs", "pcmasterrace", "evilbuildings", "upliftingnews", "fellowkids", "whitepeopletwitter", \
"atheism", "beholdthemasterrace", "enoughtrumpspam", "political_revolution", "worldnews", \
"hillaryforprison", "liberal", "politics", "esist", "fuckthealtright", "sjwhate", "imgoingtohellforthis"]
# Make 10 full passes over the subreddit list; duplicates are skipped via the
# cache inside run_bot, and the sleep gives reddit's servers a short break.
while counter < 10:
    for i in subredditsToParse:
        run_bot(i)
    print("Taking a little break")
    time.sleep(30)
    counter += 1
#this line will gather all the txt files in the directory, might need some tweaking to work in your environment.
txt_file_list = glob.glob("*.txt")
#this will clean all the txt files in the current directory.
#By cleaning I mean removing all the spaces and unsupported symbols.
for i in txt_file_list:
    cleaner.cleanemptylines(i)
Add Comment
Please sign in to add a comment.