Guest User

Untitled

a guest
Jul 16th, 2018
111
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.73 KB | None | 0 0
  1. #! /usr/bin/env python2.7
  2. """This application will go through all the comments on a specific subreddit and parse all
  3. the comments and save them to a local text file"""
  4. import threading
  5. import praw
  6. import time
  7. import cleaner
  8. import glob
  9.  
  10. #reffer to the praw documentation for more info on clientID and client_secret. (API Keys)
  11. r = praw.Reddit(username = "xxxxxxxxx", password = "xxxxxxxx", client_id = "xxxxxxxx", client_secret = "xxxxxxxx", user_agent = "Subreddit Comment parser")
  12. print ("Logging in...")
  13.  
  14.  
  15.  
  16. counter = 0
  17. cache = []
  18.  
  19.  
  20. #this will get all the comments for the last 500 posts for that specific subreddit.
  21. def run_bot(subredditName):
  22. print "Parsing comments from " + subredditName
  23. #just type the subreddit name, do not use /r/ or anything else
  24. subreddit = r.subreddit(subredditName)
  25. fileName = subredditName + ".txt"
  26. print "Grabbing comments from " + subredditName
  27. submissions = subreddit.hot(limit=500)
  28. for submission in submissions:
  29. submission.comments.replace_more(limit=0)
  30. comment_queue = submission.comments[:]
  31.  
  32. while comment_queue:
  33. comment = comment_queue.pop(0)
  34. comment_text = comment.body.lower()
  35. if comment.id not in cache:
  36. print comment_text
  37. with open(fileName, "a") as myfile:
  38. try:
  39. myfile.write(" " + comment_text + "\n")
  40. except UnicodeEncodeError:
  41. pass
  42. myfile.close()
  43. cache.append(comment.id)
  44.  
  45. #this is a 10 minute break to give reddit servers a break. THe action above will be executed 1000 times, but duplicates will be ignored using the cache variable.
  46.  
  47. subredditsToParse = ["trumpgret", "sandersforpresident", "funny", "wholesomememes", "news", "todayilearned", "interestingasfuck", "wtf", "gifs",\
  48. "highqualitygifs", "jokes", "the_donald", "keepournetfree", "nintendoswitch", "atbge", "mildlyinfuriating", \
  49. "rage", "blackpeoplegifs", "pcmasterrace", "evilbuildings", "upliftingnews", "fellowkids", "whitepeopletwitter", \
  50. "atheism", "beholdthemasterrace", "enoughtrumpspam", "political_revolution", "worldnews", \
  51. "hillaryforprison", "liberal", "politics", "esist", "fuckthealtright", "sjwhate", "imgoingtohellforthis"]
  52.  
  53.  
  54.  
  55. while counter < 10:
  56. for i in subredditsToParse:
  57. run_bot(i)
  58.  
  59.  
  60.  
  61. print "Taking a little break"
  62. time.sleep(30)
  63. counter += 1
  64.  
  65.  
  66.  
  67.  
  68.  
  69.  
  70. #this line will gather all the txt files in the directory, might need some tweaking to work in your environment.
  71. txt_file_list = glob.glob("*.txt")
  72.  
  73.  
  74.  
  75. #this will clean all the txt files in the current directory.
  76. #By cleaning I mean removing all the spaces and unsupported symbols.
  77. for i in txt_file_list:
  78. cleaner.cleanemptylines(i)
Add Comment
Please, Sign In to add comment