Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
- import urllib2
- import json
- from pprint import pprint
- import codecs
- import re
# Master switch for diagnostic output produced by debug().
DEBUG = True


def debug(str):
    """Print a diagnostic message, prefixed by one space, when DEBUG is set.

    Args:
        str: The message text. The name shadows the builtin, but it is kept
            so that existing keyword callers continue to work.
    """
    if not DEBUG:
        return
    message = " " + str
    print(message)
def get_coordinates(pagename):
    """Look up the coordinates recorded for an English Wikipedia page.

    Sends a ``prop=coordinates`` query (XML format) to the MediaWiki API for
    ``pagename`` and extracts the first ``<co>`` element nested under a
    ``<coordinates>`` element.

    Args:
        pagename: Unicode page title, e.g. ``u"Cape Town"``.

    Returns:
        A ``(lat, lon)`` tuple of strings taken verbatim from the response
        attributes, or ``None`` when the response contains no coordinates
        or is not well-formed XML.

    Raises:
        urllib2.URLError: if the HTTP request itself fails.
    """
    # Function-scope import: nothing else in the file needs an XML parser.
    import xml.etree.ElementTree as ET

    apiurl = (
        "https://en.wikipedia.org/w/api.php"
        "?action=query&prop=coordinates&titles=%s&format=xml"
        % urllib2.quote(pagename.encode('utf-8'))
    )

    response = urllib2.urlopen(apiurl)
    try:
        # Read the whole body: the payload is not guaranteed to sit on the
        # first line of the response.
        body = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    try:
        root = ET.fromstring(body)
    except ET.ParseError:
        # Stay best-effort: an unparsable reply is treated like "no match".
        return None

    co = root.find('.//coordinates/co')
    if co is None:
        return None

    lat = co.get('lat')
    lon = co.get('lon')
    if lat is None or lon is None:
        return None
    return (lat, lon)
def _fetch_category_members(wikicat, cmlimit):
    """Yield every member record of ``wikicat``, following API continuation."""
    base_url = (
        "https://en.wikipedia.org/w/api.php"
        "?action=query&list=categorymembers&cmtype=page|subcat"
        "&cmtitle=%s&format=json&cmlimit=%d"
        % (urllib2.quote(wikicat.encode('utf-8')), cmlimit)
    )
    continuation = {}
    while True:
        # Echo back whatever continuation parameters the previous batch
        # returned (none on the first request).
        extra = "".join(
            "&%s=%s" % (key, urllib2.quote((u"%s" % value).encode('utf-8')))
            for key, value in sorted(continuation.items())
        )
        response = urllib2.urlopen(base_url + extra)
        try:
            # Read the whole body rather than just its first line.
            payload = json.loads(response.read().decode('utf-8'))
        finally:
            response.close()

        for member in payload['query']['categorymembers']:
            yield member

        # Accept both the current ("continue") and the legacy
        # ("query-continue") continuation formats.
        continuation = (
            payload.get('continue')
            or payload.get('query-continue', {}).get('categorymembers')
        )
        if not continuation:
            return


def get_all_pages_recursive(wikicat, _visited=None):
    """Collect article titles from a category and its matching subcategories.

    Walks ``wikicat`` on English Wikipedia. Main-namespace members (ns 0)
    are collected unless their title starts with ``"List of"``. Subcategories
    (ns 14) are descended into only when their title starts with
    ``"Category:Populated places in"``. All result batches of a category are
    fetched, so categories larger than one API page are not truncated.

    Args:
        wikicat: Unicode category title including the ``Category:`` prefix.
        _visited: Internal set of category titles already walked; callers
            should leave it unset. It prevents endless recursion on category
            cycles and avoids walking the same subcategory twice.

    Returns:
        A list of page titles in the order the API returned them. A page
        that belongs to several walked categories appears once per category.

    Raises:
        urllib2.URLError: if an HTTP request fails.
        KeyError: if a response lacks the ``query.categorymembers`` data.
    """
    if _visited is None:
        _visited = set()
    _visited.add(wikicat)

    debug("Recursing to " + wikicat)
    # Page size for production use; a tiny value such as 2 is handy when
    # exercising the continuation logic.
    cmlimit = 500

    pages = []
    for member in _fetch_category_members(wikicat, cmlimit):
        title = member['title']
        namespace = member['ns']
        if namespace == 0:
            # Skip "List of ..." articles; they are indexes, not places.
            if not title.startswith('List of'):
                pages.append(title)
        elif namespace == 14:
            if (title.startswith('Category:Populated places in')
                    and title not in _visited):
                pages.extend(get_all_pages_recursive(title, _visited))
    return pages
if __name__ == "__main__":
    # Script entry point: list every page found under the root category
    # together with the coordinates reported for it.
    c = u"Category:Populated places in South Africa by province"
    pages = get_all_pages_recursive(c)
    # max() raises ValueError on an empty sequence, so fall back to a zero
    # width when the category walk found nothing.
    maxlen = max(map(len, pages)) if pages else 0
    for page in pages:
        loc = get_coordinates(page)
        # Right-align titles so the "is at" column lines up.
        print("%s is at %s" % (page.rjust(maxlen), loc))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement