stackexchange-gilles

stackexchange-api-download-all

May 17th, 2013
53
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #! /usr/bin/env python
  2. import datetime, itertools, json, os, re, sys, urllib2
  3.  
  4. def get_query(url):
  5.     f = urllib2.urlopen(url)
  6.     data = f.read()
  7.     f.close()
  8.     return json.loads(data)
  9.  
  10. '''
  11. def get_most_recent_post(site):
  12.    url = 'http://api.stackexchange.com/2.0/posts?order=desc&sort=creation&site=%s&pagesize=1' % (site,)
  13.    return get_query(url)['items'][0]
  14.  
  15. def get_posts(site, min_id, max_id):
  16.    posts = items
  17.    start = min_id
  18.    while start < max_id:
  19.        stop = start + 100
  20.        if stop > max_id: stop = max_id
  21.        id_list = ';'.join(map(str, range(start, stop)))
  22.        url = 'http://api.stackexchange.com/2.0/posts/%s?order=asc&sort=creation&site=%s&pagesize=100' % (id_list, site)
  23.        posts += get_query(url)['items']
  24.        start = stop
  25.  
  26. def get_all_posts(site):
  27.    last_post = get_most_recent_post(site)
  28. '''
  29.  
  30. def download_site(site):
  31.     unsafe = True
  32.     # get everything that /posts can get: default + body, comments, vote breakdown
  33.     filter_string = '4Jw0Lzyv.2_8S' if unsafe else '!bULULThFh9yPVb'
  34.     url_base = 'http://api.stackexchange.com/2.0/posts?order=asc&sort=creation&site=%s&filter=%s&pagesize=100&page=' % (site, filter_string)
  35.     page_number = 0
  36.     has_more = True
  37.     posts = []
  38.     while has_more:
  39.         page_number += 1
  40.         data = get_query(url_base + str(page_number))
  41.         posts += data['items']
  42.         has_more = data['has_more']
  43.     posts.sort(key=lambda post: post['post_id'])
  44.     return posts
  45.  
  46. if __name__ == "__main__":
  47.     site_name = sys.argv[1]
  48.     out = open(site_name + ".json", "w") if sys.stdout.isatty() else sys.stdout
  49.     posts = download_site(sys.argv[1])
  50.     json.dump(posts, out, indent=2)
  51.     if not out is sys.stdout: out.close()
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×