Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Fri Sep 19 12:49:47 2014
- @author: vikramk3
- """
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import xml.etree.ElementTree as ET
- import pprint
- import re
- import codecs
- import json
- """
- Your task is to wrangle the data and transform the shape of the data
- into the model we mentioned earlier. The output should be a list of dictionaries
- that look like this:
- {
- "id": "2406124091",
- "type": "node",
- "visible":"true",
- "created": {
- "version":"2",
- "changeset":"17206049",
- "timestamp":"2013-08-03T16:43:42Z",
- "user":"linuxUser16",
- "uid":"1219059"
- },
- "pos": [41.9757030, -87.6921867],
- "address": {
- "housenumber": "5157",
- "postcode": "60625",
- "street": "North Lincoln Ave"
- },
- "amenity": "restaurant",
- "cuisine": "mexican",
- "name": "La Cabana De Don Luis",
- "phone": "1 (773)-271-5176"
- }
- You have to complete the function 'shape_element'.
- We have provided a function that will parse the map file, and call the function with the element
- as an argument. You should return a dictionary, containing the shaped data for that element.
- We have also provided a way to save the data in a file, so that you could use
- mongoimport later on to import the shaped data into MongoDB. You could also do some cleaning
- before doing that, like in the previous exercise, but for this exercise you just have to
- shape the structure.
- In particular the following things should be done:
- - you should process only 2 types of top level tags: "node" and "way"
- - all attributes of "node" and "way" should be turned into regular key/value pairs, except:
- - attributes in the CREATED array should be added under a key "created"
- - attributes for latitude and longitude should be added to a "pos" array,
- for use in geospatial indexing. Make sure the values inside the "pos" array are floats
- and not strings.
- - if second level tag "k" value contains problematic characters, it should be ignored
- - if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- - if second level tag "k" value does not start with "addr:", but contains ":", you can process it
- same as any other tag.
- - if there is a second ":" that separates the type/direction of a street,
- the tag should be ignored, for example:
- <tag k="addr:housenumber" v="5158"/>
- <tag k="addr:street" v="North Lincoln Avenue"/>
- <tag k="addr:street:name" v="Lincoln"/>
- <tag k="addr:street:prefix" v="North"/>
- <tag k="addr:street:type" v="Avenue"/>
- <tag k="amenity" v="pharmacy"/>
- should be turned into:
- {...
- "address": {
- "housenumber": "5158",
- "street": "North Lincoln Avenue"
- }
- "amenity": "pharmacy",
- ...
- }
- - for "way" specifically:
- <nd ref="305896090"/>
- <nd ref="1719825889"/>
- should be turned into
- "node_ref": ["305896090", "1719825889"]
- """
# Patterns describing the shape of a tag key ("k" attribute).
# `lower` and `lower_colon` are not used by the functions shown in this file;
# `problemchars` is used by shape_element to reject keys.
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that are grouped under the "created" key of the output.
CREATED = ["version", "changeset", "timestamp", "user", "uid"]

# The expected "proper" names for street types.
expected = [
    "Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square",
    "Lane", "Road", "Trail", "Parkway", "Commons", "Circle", "Cove",
    "Highway",
]

# Markers of non-street information (suite / building / unit number) that
# were found inside "addr:street" values of the Austin, TX OSM data.
# update_name strips them from the street; they belong in "addr:unit".
non_st_elements = ["Ste.", "Ste", "STE", "Suite", "#", "Bldg", "Bldg.", "Bld"]

# Street-type abbreviations observed in the Austin, TX OSM data and their
# full-length replacements.  The key order is significant: update_name stops
# at the first key that matches.
mapping = {
    "St": "Street",
    "st ": "Street",
    "St.": "Street",
    "Ave": "Avenue",
    "Avene": "Avenue",
    "Rd.": "Road",
    "Rd": "Road",
    "RD": "Road",
    "Dr.": "Drive",
    "Dr": "Drive",
    "Cir": "Circle",
    "Blvd": "Boulevard",
    "Blvd.": "Boulevard",
    "Blvd,": "Boulevard",
    "Blvd.,": "Boulevard",
    "Cv": "Cove",
    "Hwy": "Highway",
    "Ct": "Court",
    # "Ctr" abbreviates Center ("Ct" is Court), and "CR" abbreviates
    # County Road; both were previously expanded to "Court".
    "Ctr": "Center",
    "CR": "County Road",
    "Ln": "Lane",
    "Pkwy": "Parkway",
}

# Abbreviations of the non-street elements and their full-length
# replacements.  The cleaned-up text is stored in "addr:unit".
mapping2 = {
    "Ste.": "Suite",
    "Ste": "Suite",
    "Bldg": "Building",
    "Bldg.": "Building",
    "Bld": "Building",
}
def update_name(name, mapping, non_street_elements=None):
    """Clean an "addr:street" value and split off non-street information.

    The steps run in this order:

    1. The first abbreviation in ``mapping`` (in iteration order) that occurs
       as a whole whitespace-delimited token is replaced by its full name.
       Only one abbreviation key is ever applied.
    2. For each marker in ``non_street_elements``, the first occurrence of the
       marker together with its number (e.g. " Ste 100") is removed from the
       street and collected.
    3. A leading run of three or more digits (plus trailing whitespace) is
       removed and reported as the house number.
    4. A trailing "<token> <token> TX <5-digit zip>" section is removed.

    Args:
        name: Raw street string.
        mapping: Dict of abbreviation -> full street-type name.
        non_street_elements: Optional iterable of unit markers.  When omitted,
            the module-level ``non_st_elements`` list is used.

    Returns:
        A dict with three string values: "st_name" (the cleaned street),
        "suites_bldg" (concatenated unit text exactly as matched, may be
        empty) and "house_num" (the matched house number text including any
        trailing whitespace, may be empty).
    """
    if non_street_elements is None:
        non_street_elements = non_st_elements

    # Keys are escaped so that "." in "St." is a literal dot, and anchored on
    # both sides so "St" cannot match inside "Stone".  Replacing through the
    # same pattern keeps unrelated substrings untouched.
    for abbreviation, full_name in mapping.items():
        token = re.compile(r'(?<!\S)' + re.escape(abbreviation) + r'(?!\S)')
        if token.search(name):
            name = token.sub(lambda _match: full_name, name)
            break

    # Suite / building / "#" markers and their numbers belong in "addr:unit".
    unit_parts = []
    for marker in non_street_elements:
        found = re.search(r'\s*' + re.escape(marker) + r'\S*\s*\S*\d+', name)
        if found:
            unit_parts.append(found.group())
            # Cut out exactly the matched span.
            name = name[:found.start()] + name[found.end():]

    # A house number at the start belongs in "addr:housenumber".
    house_number = ""
    leading_number = re.match(r'\d\d\d+\s*', name)
    if leading_number:
        house_number = leading_number.group()
        name = name[leading_number.end():]

    # State and zip code belong in "addr:state" / "addr:postcode".
    # The literal is "TX"; the former "\TX" is an invalid escape on
    # Python 3.7+ and made every call raise re.error.
    state_zip = re.search(r'\s\S*\s\S*\sTX\S*\s*\d\d\d\d\d', name)
    if state_zip:
        name = name[:state_zip.start()] + name[state_zip.end():]

    return {
        "st_name": name,
        "suites_bldg": "".join(unit_parts),
        "house_num": house_number,
    }
def shape_element(element):
    """Shape one OSM "node" or "way" element into a JSON-ready dictionary.

    Second-level <tag> children are sorted into an address dictionary
    ("addr:*" keys) and plain top-level features.  The "addr:street" value is
    cleaned with update_name(); a house number or suite/building text found
    inside the street is moved to "housenumber" / "unit".

    Args:
        element: An xml.etree.ElementTree element.

    Returns:
        None when the element is neither a "node" nor a "way".  Otherwise a
        dict that always contains the element's non-address tags and, when an
        "addr:street" tag is present, also "created", "pos", "type", the
        remaining element attributes and (if any <nd> children exist)
        "node_refs".  "address" is included only when a street and a house
        number are known and the two house number sources do not conflict.
        The dict can be empty.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {}
    address = {}
    other_features = {}
    has_street = False
    explicit_house_number = None   # value of an "addr:housenumber" tag
    street_house_number = ''       # house number found inside "addr:street"
    street_unit = ''               # suite/building text found inside "addr:street"

    for tag in element.iter("tag"):
        key = tag.get("k")
        value = tag.get("v")
        # Skip incomplete tags and keys containing problematic characters.
        if key is None or value is None or problemchars.search(key):
            continue
        if key == "addr:street":
            cleaned = update_name(value, mapping)
            address["street"] = cleaned["st_name"]
            street_unit = cleaned["suites_bldg"]
            street_house_number = cleaned["house_num"].strip()
            has_street = True
        elif key.startswith("addr:"):
            # A second ":" (e.g. "addr:street:type") means the tag is ignored.
            if key.count(":") > 1:
                continue
            if key == "addr:housenumber":
                explicit_house_number = value
            address[key[len("addr:"):]] = value
        else:
            # Everything else ("amenity", "cuisine", "name", ...).
            other_features[key] = value

    # All data handled here is in Texas, so a missing state defaults to TX.
    if "state" not in address:
        address["state"] = "TX"

    # Reconcile the explicit house number with the one found in the street.
    house_number_conflict = False
    if street_house_number:
        if explicit_house_number is None:
            address["housenumber"] = street_house_number
        elif explicit_house_number.strip() != street_house_number:
            house_number_conflict = True
    has_house_number = "housenumber" in address

    # Expand unit abbreviations, longest key first so that "Bldg." is not
    # turned into "Building." by the shorter "Bldg" key.
    for abbreviation in sorted(mapping2, key=len, reverse=True):
        street_unit = street_unit.replace(abbreviation, mapping2[abbreviation])
    street_unit = street_unit.strip()

    # Keep an explicit "addr:unit" and append the street-derived text to it
    # instead of overwriting it.
    if street_unit:
        if "unit" not in address:
            address["unit"] = street_unit
        elif street_unit not in address["unit"]:
            address["unit"] = address["unit"] + " " + street_unit

    # The address is only trusted with a street, a house number and no
    # conflicting house numbers.
    if has_street and has_house_number and not house_number_conflict:
        node["address"] = address

    # NOTE(review): elements without "addr:street" end up with only their tag
    # features (no id/type/created) -- confirm this is the intended output.
    node.update(other_features)

    if has_street:
        created = {}
        for name, value in element.attrib.items():
            if name in CREATED:
                created[name] = value
            elif name not in ("lat", "lon"):
                node[name] = value
        node["created"] = created

        # "pos" is always [latitude, longitude], independent of the order in
        # which the attributes appear; it stays empty when either is missing.
        latitude = element.get("lat")
        longitude = element.get("lon")
        if latitude is not None and longitude is not None:
            node["pos"] = [float(latitude), float(longitude)]
        else:
            node["pos"] = []

        node["type"] = element.tag

        node_refs = [nd.get("ref") for nd in element.iter("nd")]
        if node_refs:
            node["node_refs"] = node_refs

    return node
def process_map(file_in, pretty = False):
    """Convert an OSM XML file into a file of JSON documents, one per element.

    Every "node" and "way" element is passed to shape_element(); each
    non-empty result is written to "<file_in>.json" and collected.

    Args:
        file_in: Path of the OSM XML file to parse.
        pretty: When true, each document is written with an indent of 2.

    Returns:
        The list of all shaped, non-empty documents.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            # iterparse reports the end of child elements before their
            # parent.  Children (<tag>, <nd>) must stay untouched until the
            # parent has been shaped, so only node/way are handled here.
            if element.tag not in ("node", "way"):
                continue
            el = shape_element(element)
            if el:
                data.append(el)
                fo.write(json.dumps(el, indent=indent) + "\n")
            # The element is finished whether or not it was emitted;
            # release its children and attributes to limit memory use.
            element.clear()
    return data
def test():
    """Run the conversion on the author's local Austin, TX extract."""
    osm_path = 'C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm.txt'
    # The returned documents are not inspected here; the JSON file written by
    # process_map is the result of interest.
    process_map(osm_path, True)


if __name__ == "__main__":
    test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement