sredmore

Lexalytics Reddit Basic Comment Scraping with PRAW

Sep 3rd, 2014
#!/usr/bin/env python
# Python 2 script written against the PRAW 3 era API.

import praw
import csv
import datetime


# Replace 'Comment Scraping Bot' with your own user agent string; be unique.
r = praw.Reddit('Comment Scraping Bot')

# Change the filename if you want the output file to go somewhere else.
# Buffering 0 keeps the file unbuffered, so rows hit disk immediately.
output_file = csv.writer(open("comments.csv", "w", 0), dialect='excel')
output_file.writerow(["id", "date", "comment"])

row_id = 0

# Replace the submission_id with the id of the thread you want to scrape.
# (No command-line args; I am a marketing guy, what do you expect?)
submission = r.get_submission(submission_id='2eq0ca')
print "got submission"

# Replace the "load more comments" stubs with the actual comments.
# limit is the number of MoreComments pages to fetch (None fetches all of them);
# threshold=0 replaces every stub, so the loop below only sees real comments.
# This can take a while on a big thread because PRAW obeys reddit's API rate limits.
submission.replace_more_comments(limit=None, threshold=0)
print "comments replaced"

# Dump to CSV with a date/time format Tableau can parse.
# Note that this only dumps top-level comments, not replies (see the sketch below).
for comment in submission.comments:
    comment_body = comment.body.encode('utf-8')
    comment_date = datetime.datetime.fromtimestamp(int(comment.created_utc)).strftime('%Y-%m-%d %H:%M')
    row_id += 1
    output_file.writerow([row_id, comment_date, comment_body])
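
The loop above skips replies. To capture the whole tree instead, you can walk each comment's replies list recursively. A minimal sketch against the same PRAW 3 era API, reusing the output_file writer from the script above; write_comment_tree is a hypothetical helper of my own, the praw.objects.Comment class path is the old PRAW layout, and the isinstance check skips any MoreComments stubs that were not replaced:

def write_comment_tree(comment, row_id):
    # Skip any MoreComments stubs that survived replace_more_comments().
    if not isinstance(comment, praw.objects.Comment):
        return row_id
    row_id += 1
    comment_date = datetime.datetime.fromtimestamp(int(comment.created_utc)).strftime('%Y-%m-%d %H:%M')
    output_file.writerow([row_id, comment_date, comment.body.encode('utf-8')])
    # Recurse into this comment's direct replies, then their replies, and so on.
    for reply in comment.replies:
        row_id = write_comment_tree(reply, row_id)
    return row_id

row_id = 0
for comment in submission.comments:
    row_id = write_comment_tree(comment, row_id)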
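
This paste predates PRAW 4, which dropped praw.Reddit('user agent') and get_submission in favor of an OAuth-only client. A rough Python 3 equivalent against current PRAW (4+), assuming you have registered a script app on reddit; the client_id and client_secret values below are placeholders:

#!/usr/bin/env python3
import csv
import datetime

import praw

# Credentials come from your reddit app registration; these are placeholders.
reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="Comment Scraping Bot",  # still needs to be unique
)

submission = reddit.submission(id="2eq0ca")
# replace_more(limit=None) fetches every "load more comments" stub.
submission.comments.replace_more(limit=None)

with open("comments.csv", "w", newline="") as f:
    writer = csv.writer(f, dialect="excel")
    writer.writerow(["id", "date", "comment"])
    # Iterating submission.comments yields top-level comments only;
    # use submission.comments.list() instead to flatten replies as well.
    for row_id, comment in enumerate(submission.comments, start=1):
        comment_date = datetime.datetime.fromtimestamp(int(comment.created_utc)).strftime("%Y-%m-%d %H:%M")
        writer.writerow([row_id, comment_date, comment.body])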