Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from urlparse import urlparse
- from threading import Thread
- import httplib, sys
- from Queue import Queue
- import json
- import urllib2
- import pycurl
# Filled by get_all_urls() with one (id, sourceNote, topics) tuple per indicator.
# NOTE(review): populated but never read anywhere in this file — confirm intent.
desc_topic = []
# Number of worker threads consuming the URL queue.
concurrent = 20
def do_rest_call(url):
    """GET *url* and return the raw response body as a string.

    Raises Exception for a 404/400 answer (as the original did) and
    re-raises any other HTTPError. The original silently swallowed
    non-404/400 errors and fell through to an implicit ``return None``,
    which made ``json.loads`` crash in every caller.
    """
    try:
        return urllib2.urlopen(url).read()
    except urllib2.HTTPError as ex:
        if ex.code in (404, 400):
            # Preserve the original contract: these codes raise Exception.
            raise Exception("REST call failed with %d: %s" % (ex.code, url))
        # Propagate unexpected HTTP errors instead of returning None.
        raise
def get_all_countries():
    """Return the list of ISO-2 country codes known to the World Bank API."""
    url = "http://api.worldbank.org/countries?format=json"
    data = json.loads(do_rest_call(url))
    # Payload shape is [metadata, records]; the country records are element 1.
    return [element["iso2Code"] for element in data[1]]
def get_string(list_topic):
    """Concatenate the str() form of every element of *list_topic*.

    Returns "" for an empty input.
    """
    # str.join builds the result in one pass instead of quadratic `+=`.
    return "".join(str(element) for element in list_topic)
- country = get_all_countries()
def get_all_urls():
    """Return one indicator URL per World Bank indicator.

    Side effect: appends an (id, sourceNote, topics) tuple to the
    module-level ``desc_topic`` list for every indicator seen.
    """
    urls = []
    search_url = "http://api.worldbank.org/indicators?format=json&per_page=20000"
    data = json.loads(do_rest_call(search_url))
    # Payload shape is [metadata, records].
    for element in data[1]:
        urls.append("http://api.worldbank.org/countries/all/indicators/" + str(element["id"]))
        # No `global` statement needed: appending mutates the module-level
        # list, it never rebinds the name (the original declared it anyway,
        # and inside the loop at that).
        desc_topic.append((element["id"], element["sourceNote"], element["topics"]))
    return urls
def doWork():
    """Worker-thread loop: take an indicator URL from the global queue,
    parse its JSON body, and POST one document per matching country to
    the local Elasticsearch instance.

    Runs forever (started as a daemon thread); reads module globals
    ``q`` (work queue) and ``country`` (ISO-2 code list).
    """
    while True:
        url = q.get()
        try:
            body = getBody(url)
            field_description = ""
            field_topic = ""
            info = []
            try:
                data = json.loads(body)[1]
                key = data[0]["indicator"]["id"]
                name = data[0]["indicator"]["value"]
                for element in data:
                    if element["country"]["id"] in country:
                        info.append("worldbank/" + element["country"]["id"] + "/" + key)
            except (ValueError, KeyError, IndexError, TypeError):
                # Malformed or empty payload: mark the URL as skippable.
                # The original bare `except:` also swallowed programming
                # errors and KeyboardInterrupt.
                key = ""
                name = ""
            if key != "" and name != "" and info != []:
                for element in info:
                    put_element = {"id": element,
                                   "name": name,
                                   "description": field_description,
                                   "topic": field_topic,
                                   "source": "WORLDBANK",
                                   "country": ""
                                   }
                    # NOTE(review): the paste lost its indentation; posting
                    # once per element matches the per-element put_element —
                    # confirm against the original script.
                    elastic_url = "localhost:9200/timeseries/external/" + key.replace("/", ".") + "?pretty"
                    c = pycurl.Curl()
                    c.setopt(pycurl.URL, elastic_url)
                    c.setopt(pycurl.POSTFIELDS, json.dumps(put_element))
                    try:
                        c.perform()
                    finally:
                        c.close()  # was leaked: one live handle per POST
        finally:
            # Guarantee the queue drains even when this item fails,
            # otherwise q.join() in the driver blocks forever.
            q.task_done()
def getBody(ourl):
    """GET *ourl* with ``format=json&per_page=20000`` appended and return
    the raw response body."""
    url = urlparse(ourl)
    conn = httplib.HTTPConnection(url.netloc)
    try:
        conn.request("GET", url.path + "?format=json&per_page=20000")
        return conn.getresponse().read()
    finally:
        # The original never closed the connection — with 20 worker
        # threads that leaks one socket per request.
        conn.close()
# (A `global q` statement at module scope is a no-op — this assignment
# already creates the module-level name the workers read.)
# Bounded queue: producers block once 2x the worker count is pending.
q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True  # do not keep the interpreter alive on exit
    t.start()
try:
    for url in get_all_urls():
        q.put(url.strip())
    # Block until every queued URL has been marked done by a worker.
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement