Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import psaw
import ujson as json
import urllib3

# Shared urllib3 connection pool; fetchdata() sends its GET requests through it.
http = urllib3.PoolManager()

# psaw client. Nothing in the visible script references it; it is kept so any
# code relying on the module-level name keeps working.
api = psaw.PushshiftAPI()

# Pagination cursor passed to fetchdata() as the 'after' timestamp.
# Initial value: epoch time of r/sn first post.
lastdate = 1286870618
def fetchdata(lastdate, subreddit='suicidenotes', limit=50, client=None):
    """Fetch one page of submissions and split it into self / non-self posts.

    Sends a GET request to the Pushshift submission search endpoint, asking
    for posts in ascending order with ``after`` set to ``lastdate``.

    Args:
        lastdate: Timestamp sent as the ``after`` query parameter (converted
            with ``str``).
        subreddit: Subreddit name sent as the ``subreddit`` query parameter.
        limit: Page size sent as the ``limit`` query parameter.
        client: Object exposing ``request(method, url, fields=...)`` and
            returning a response with ``status`` and ``data`` attributes.
            Defaults to the module-level ``http`` pool.

    Returns:
        ``False`` when the response contains no posts. Otherwise a dict with:
          * ``'data'``: one dict per post that has a ``'selftext'`` key,
            reduced to the whitelisted fields below;
          * ``'nonselfposts'``: the untouched posts lacking ``'selftext'``;
          * ``'firstdate'`` / ``'sub_id'``: ``created_utc`` / ``subreddit_id``
            of the first self post, or of the first post of the page when the
            page holds no self post;
          * ``'lastdate'``: ``created_utc`` of the last post of the page.

    Raises:
        RuntimeError: if the response status is not 200.
        KeyError: if a self post lacks one of the whitelisted fields.
    """
    fields = ('selftext',
              'author',
              'id',
              'permalink',
              'created_utc',
              'title',
              'subreddit_id')

    if client is None:
        client = http

    response = client.request(
        'GET',
        'https://api.pushshift.io/reddit/search/submission',
        fields={'subreddit': subreddit,
                'sort': 'asc',
                'limit': str(limit),
                'after': str(lastdate)})

    # Fail with a readable message instead of an obscure JSON/KeyError when
    # the server answers with an error page.
    if response.status != 200:
        raise RuntimeError(
            'submission search failed with HTTP status {}'.format(response.status))

    parsed = json.loads(response.data.decode('utf-8'))['data']
    if not parsed:
        return False

    selfposts = [{key: post[key] for key in fields}
                 for post in parsed if 'selftext' in post]
    nonselfposts = [post for post in parsed if 'selftext' not in post]

    # A page may contain no self post at all; fall back to the first post of
    # the page so the metadata lookups cannot raise IndexError.
    anchor = selfposts[0] if selfposts else parsed[0]

    return {
        'data': selfposts,
        'nonselfposts': nonselfposts,
        'firstdate': anchor['created_utc'],
        # Use the last post of the page (the request asks for ascending
        # order), not the last self post: otherwise trailing non-self posts
        # are fetched again on the next call and the cursor can stall.
        'lastdate': parsed[-1]['created_utc'],
        'sub_id': anchor['subreddit_id'],
    }
# Archive loop: request page after page, starting from `lastdate`, and write
# each page to its own JSON file until fetchdata() reports an empty page.
while True:
    dataout = fetchdata(lastdate)
    if not dataout:
        # fetchdata() returns False once there are no more posts.
        break

    # Move the cursor forward so the next request asks for later posts.
    lastdate = dataout['lastdate']

    # File name: <subreddit id>_<first timestamp>_<last timestamp>.json
    fname = "_".join([dataout['sub_id'],
                      str(dataout['firstdate']),
                      str(dataout['lastdate'])])
    with open(fname + ".json", 'w') as outfile:
        json.dump(dataout, outfile)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement