Advertisement
stackexchange-gilles

stackexchange-api-download-all

May 17th, 2013
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.74 KB | None | 0 0
  1. #! /usr/bin/env python
  2. import datetime, itertools, json, os, re, sys, urllib2
  3.  
  4. def get_query(url):
  5.     f = urllib2.urlopen(url)
  6.     data = f.read()
  7.     f.close()
  8.     return json.loads(data)
  9.  
  10. '''
  11. def get_most_recent_post(site):
  12.    url = 'http://api.stackexchange.com/2.0/posts?order=desc&sort=creation&site=%s&pagesize=1' % (site,)
  13.    return get_query(url)['items'][0]
  14.  
  15. def get_posts(site, min_id, max_id):
  16.    posts = items
  17.    start = min_id
  18.    while start < max_id:
  19.        stop = start + 100
  20.        if stop > max_id: stop = max_id
  21.        id_list = ';'.join(map(str, range(start, stop)))
  22.        url = 'http://api.stackexchange.com/2.0/posts/%s?order=asc&sort=creation&site=%s&pagesize=100' % (id_list, site)
  23.        posts += get_query(url)['items']
  24.        start = stop
  25.  
  26. def get_all_posts(site):
  27.    last_post = get_most_recent_post(site)
  28. '''
  29.  
  30. def download_site(site):
  31.     unsafe = True
  32.     # get everything that /posts can get: default + body, comments, vote breakdown
  33.     filter_string = '4Jw0Lzyv.2_8S' if unsafe else '!bULULThFh9yPVb'
  34.     url_base = 'http://api.stackexchange.com/2.0/posts?order=asc&sort=creation&site=%s&filter=%s&pagesize=100&page=' % (site, filter_string)
  35.     page_number = 0
  36.     has_more = True
  37.     posts = []
  38.     while has_more:
  39.         page_number += 1
  40.         data = get_query(url_base + str(page_number))
  41.         posts += data['items']
  42.         has_more = data['has_more']
  43.     posts.sort(key=lambda post: post['post_id'])
  44.     return posts
  45.  
  46. if __name__ == "__main__":
  47.     site_name = sys.argv[1]
  48.     out = open(site_name + ".json", "w") if sys.stdout.isatty() else sys.stdout
  49.     posts = download_site(sys.argv[1])
  50.     json.dump(posts, out, indent=2)
  51.     if not out is sys.stdout: out.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement