Lexalytics Reddit Basic Comment Scraping with PRAW
Sep 3rd, 2014
#!/usr/bin/env python
# Dump the top-level comments of a single Reddit submission to a CSV file
# using PRAW. Each row holds the comment id, its creation time and its body.
import praw
import csv
import datetime

# Replace 'Comment Scraping Bot' with your own user agent ID - be unique.
r = praw.Reddit('Comment Scraping Bot')

# Change the file name here if you want the output to go somewhere else.
# The file is opened unbuffered (third argument 0) so rows reach disk as they
# are written; the `with` block guarantees the handle is closed even if a
# later request fails.
with open("comments.csv", "w", 0) as csv_file:
    output_file = csv.writer(csv_file, dialect='excel')
    output_file.writerow(["id", "date", "comment"])

    # Replace submission_id with the id of the submission you want to scrape
    # (hard-coded here rather than read from command-line arguments).
    submission = r.get_submission(submission_id='2eq0ca')
    print("got submission")
    print("\n")

    # This fetches all the comments. Set limit to the number of more_comments
    # pages you want to get. It can take a while when there are many, because
    # PRAW obeys the Reddit API request timing limits.
    submission.replace_more_comments(limit=None, threshold=10)
    print("comments replaced")
    print("\n")

    # Dump the rows with a date-time format Tableau can parse.
    # Note that only top-level comments are written, not replies to comments.
    for comment in submission.comments:
        comment_body = comment.body.encode('utf-8')
        # fromtimestamp() renders the epoch value in this machine's local
        # timezone, truncated to minute precision by the format string.
        comment_date = datetime.datetime.fromtimestamp(
            int(comment.created_utc)).strftime('%Y-%m-%d %H:%M')
        # Use the comment's own id; a bare `id` would be the Python builtin
        # function and would write its repr into every row.
        output_file.writerow([comment.id, comment_date, comment_body])
Please, Sign In to add comment