Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/env python
import datetime
import itertools
import json
import os
import re
import sys
import time
import urllib2
import zlib
def get_query(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The Stack Exchange API compresses its responses (gzip by default) even
    when the client sends no ``Accept-Encoding`` header, and ``urlopen``
    hands back the raw bytes. A gzip body is therefore recognised by its
    two-byte magic number and inflated before it is parsed; any other body
    is parsed as-is.

    Args:
        url: The URL to request.

    Returns:
        Whatever ``json.loads`` produces for the response body.

    Raises:
        urllib2.URLError: If the request itself fails.
        zlib.error: If a body that looks like gzip cannot be inflated.
        ValueError: If the (decompressed) body is not valid JSON.
    """
    f = urllib2.urlopen(url)
    try:
        data = f.read()
    finally:
        # Release the connection even when read() fails part-way through.
        f.close()
    if data[:2] == b'\x1f\x8b':
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        data = zlib.decompress(data, 16 + zlib.MAX_WBITS)
    return json.loads(data)
- '''
- def get_most_recent_post(site):
- url = 'http://api.stackexchange.com/2.0/posts?order=desc&sort=creation&site=%s&pagesize=1' % (site,)
- return get_query(url)['items'][0]
- def get_posts(site, min_id, max_id):
- posts = items
- start = min_id
- while start < max_id:
- stop = start + 100
- if stop > max_id: stop = max_id
- id_list = ';'.join(map(str, range(start, stop)))
- url = 'http://api.stackexchange.com/2.0/posts/%s?order=asc&sort=creation&site=%s&pagesize=100' % (id_list, site)
- posts += get_query(url)['items']
- start = stop
- def get_all_posts(site):
- last_post = get_most_recent_post(site)
- '''
def download_site(site, unsafe=True):
    """Download every post the API's /posts route returns for ``site``.

    Pages through the Stack Exchange ``/2.0/posts`` route 100 posts at a
    time until the response reports ``has_more`` as false, then returns the
    collected posts ordered by ``post_id``.

    Args:
        site: Site identifier substituted into the ``site=`` query
            parameter (inserted verbatim, not URL-quoted).
        unsafe: Selects which of the two hard-coded API filter ids is sent.
            Defaults to True, which is the value the function always used
            before this became a parameter.

    Returns:
        A list of post dicts sorted by their ``post_id`` value.

    Raises:
        KeyError: If a response lacks ``items`` or ``has_more``, or a post
            lacks ``post_id``.
    """
    # get everything that /posts can get: default + body, comments, vote breakdown
    filter_string = '4Jw0Lzyv.2_8S' if unsafe else '!bULULThFh9yPVb'
    url_base = (
        'http://api.stackexchange.com/2.0/posts'
        '?order=asc&sort=creation&site=%s&filter=%s&pagesize=100&page='
        % (site, filter_string)
    )

    posts = []
    page_number = 0
    has_more = True
    while has_more:
        page_number += 1  # API pages are requested starting from 1
        data = get_query(url_base + str(page_number))
        posts.extend(data['items'])
        has_more = data['has_more']

        # When the API includes a "backoff" field, clients must wait that
        # many seconds before calling the same method again.
        backoff = data.get('backoff')
        if has_more and backoff:
            time.sleep(backoff)

    posts.sort(key=lambda post: post['post_id'])
    return posts
if __name__ == "__main__":
    # Usage: script SITE_NAME
    # Writes the downloaded posts as indented JSON to SITE_NAME.json when
    # stdout is a terminal, otherwise to stdout (so output can be piped).
    if len(sys.argv) < 2:
        sys.exit("usage: %s SITE_NAME" % sys.argv[0])
    site_name = sys.argv[1]

    # Download before opening the output file so that a failed download
    # does not leave an empty SITE_NAME.json behind.
    posts = download_site(site_name)

    if sys.stdout.isatty():
        # The with-block closes the file even if json.dump raises.
        with open(site_name + ".json", "w") as out:
            json.dump(posts, out, indent=2)
    else:
        json.dump(posts, sys.stdout, indent=2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement