daily pastebin goal
28%
SHARE
TWEET

get coordinates of pages in en.wikipedia

a guest Sep 11th, 2015 59 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # -*- coding: utf-8 -*-
  2. import urllib2
  3. import json
  4. from pprint import pprint
  5. import codecs
  6. import re
  7.  
  8. DEBUG = True
  9.  
  10. def debug(str):
  11.   if DEBUG:
  12.     print("  "+str)
  13.  
  14. def get_coordinates(pagename):
  15.  
  16.   apiurl = "https://en.wikipedia.org/w/api.php?action=query&prop=coordinates&titles=%s&format=xml" % urllib2.quote(pagename.encode('utf-8'))
  17.  
  18.   f = urllib2.urlopen(apiurl).readlines()[0]
  19.  
  20.   m = re.match('.*<coordinates>.*?<co lat="(\-?\d+\.?\d*)" lon="(-?\d+\.?\d*)".*\/>',f)
  21.   if m is not None:
  22.     return m.groups()
  23.   else:
  24.     return None
  25.  
  26.  
  27.  
  28.  
  29.  
  30. def get_all_pages_recursive(wikicat):
  31.   debug("Recursing to " + wikicat)
  32.   cmlimit = 500 #for production
  33.   #cmlimit = 2 # for testing
  34.   full_url = "http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtype=page|subcat&cmtitle=%s&format=json&cmlimit=%d" % (urllib2.quote(wikicat.encode('utf-8')),cmlimit)
  35.  
  36.   f = urllib2.urlopen(full_url).readlines()[0]
  37.  
  38.   dc = json.JSONDecoder()
  39.   categoryContents = dc.decode(f)
  40.  
  41.   pages = []
  42.  
  43.   for member in categoryContents['query']['categorymembers']:
  44.     if member['ns'] == 0:
  45.       if member['title'][0:7] == 'List of':
  46.         #debug(member['title'] + ' is a list. Ignoring...')
  47.         pass
  48.       else:
  49.         pages.append(member['title'])
  50.     elif member['ns'] == 14:
  51.       if member['title'][0:28] == 'Category:Populated places in':
  52.         pages.extend(get_all_pages_recursive(member['title']))
  53.       else:
  54.         #debug(member['title'] + ' is not a cat of populated places. Ignoring...')
  55.         pass
  56.  
  57.   return pages
  58.  
  59.  
  60. if __name__ == "__main__":
  61.   c = u"Category:Populated places in South Africa by province"
  62.   pages = get_all_pages_recursive(c)
  63.  
  64.   maxlen = max(map(len, pages))
  65.  
  66.   for f in pages:
  67.     loc = get_coordinates(f)
  68.     #loc = None
  69.     print("%s is at %s" % (f.rjust(maxlen), loc))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top