data.py

# -*- coding: utf-8 -*-
"""
Created on Fri Sep 19 12:49:47 2014

@author: vikramk3
"""

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. You could also do some cleaning
before doing that, like in the previous exercise, but for this exercise you just have to
shape the structure.

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings.
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_ref": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

# These are the expected "proper names for the street types
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Circle", "Cove", "Highway"]

# After studying the Austin, Tx OSM data,, these non-street elements were present in the street address
# I will use this list to remove these non-street elements from the street address
#These non-street elements belong in the "addr:unit" element. I based this on my evaluation of the Austin, Texas OSM data
non_st_elements=["Ste.","Ste","STE", "Suite", "#", "Bldg", "Bldg.", "Bld"]

# I updated this dictionary to include all the abbreviations I observed in the Austin, Tx OSM data
mapping = { "St": "Street", "st ":"Street",
            "St.": "Street", "Ave": "Avenue","Avene":"Avenue", "Rd.":"Road", "Rd":"Road","RD":"Road", "Dr.":"Drive", "Dr":"Drive", "Cir":"Circle", "Blvd":"Boulevard", "Blvd.":"Boulevard", "Blvd,":"Boulevard","Blvd.,":"Boulevard", "Cv":"Cove", "Hwy":"Highway", "Ct":"Court","Ctr":"Court", "CR":"Court", "Ln":"Lane", "Pkwy":"Parkway"
            }

#I created this second dictionary to correct the abbreviations of the non-street elements.
#These audited elements will be placed in "addr:unit"
mapping2 ={"Ste.":"Suite","Ste":"Suite","Bldg":"Building","Bldg.":"Building","Bld":"Building"}


def update_name(name, mapping):
    # In this for loop, I first created a regular expression to find all the abbreviations in the Austin,TX OSM data
    #Secondly, I replaced these abbreviations with the proper full length suffixes
    for st_element in mapping.keys():
        if re.search(r'\s*' + st_element + r'\s*(?!\S)', name):
            name = name.replace(st_element, mapping[st_element])
            break

    #In this section, I initialize four variables used to collect all the non-street elements
    #For example, suite_bldg_data collects any suite or buliding info during each loop
    #Then the suite_bldg_data collected from each loop is aggregated into all_suite_bldg_data
    #This is also the case for house_num_info and it's aggregator all_house_num_data
    suite_bldg_data=""
    house_num_data=""
    all_suite_bldg_data=""
    all_house_num_data=""

     # In this for loop, I first created a regular expression to find all suite, building and "#" elements along with their numbers
     #Secondly, I deleted these non-street expressions and their numbers (ex. Ste. 212) from "addr:street"
     #Based on my study of the Austin,TX OSM data, this information belongs in "addr:unit"
    for each_element in non_st_elements:
        all_suites=re.search(r'\s*' + each_element + r'\S*\s*\S*\d+', name)
        if all_suites:
            suite_bldg_data=all_suites.group()
            all_suite_bldg_data+= suite_bldg_data
            name = name.replace(suite_bldg_data,"")

    #I also found some house numbers in the "addr:street" field. I deleted it from "addr:street"
    #Based on my study of the Austin,TX OSM data, this information belongs in "addr:housenumber"
    house_numbers=re.match(r'\d\d\d+\s*',name)
    if house_numbers:
        house_num_data=house_numbers.group()
        all_house_num_data += house_num_data
        name = name.replace(house_num_data,"")

    #I also found that some "addr:street" had the state and zip code. I deleted this from the "addr:street"
    #This data belongs in "addr:state" and "addr:postcode"
    state_zip=re.search(r'\s\S*\s\S*\s\TX\S*\s*\d\d\d\d\d',name)
    if state_zip:
        state_zip_info=state_zip.group()
        name = name.replace(state_zip_info,"")

    output={"st_name":name, "suites_bldg":all_suite_bldg_data, "house_num":all_house_num_data}

    return output


def shape_element(element):

    #Here, I initalized four variables:
    #node is the primary output of this procedure
    #all_address is used to temporarily hold the "addr:" field.
    #all_address data is transferred to node if a street address exists,
    # a housenumber exists, and two confilcting house numbers do not exist
    node = {}
    all_address={}
    all_node_ref=[]
    all_other_features={}

    #Here I initialize six variables:
    #state_value, street_value, house_unit_value, and house_num_value are used to verify if these values exist for each element
    #house_num_error is used to determine if conflicting house numbers exist
    #house_num_info and suite_info are used to collect house number and suite/bldg info from "addr:street"
    state_value=0
    street_value=0
    house_num_value=0
    house_num_error=0
    house_num_info=''
    suite_info=''
    house_unit_value=0

    if element.tag == "node" or element.tag == "way" :

        #Here I run through the tag children in each element and extract k_value and v_value
        for tag in element.iter("tag"):
            v_value=tag.get("v")
            k_value=tag.get("k")
            search_res=re.search(problemchars,k_value)
            #I check if there are no problem characters in k_value
            if search_res==None:
                #Here, for "addr:street", I audit the street type and extract suite/bldg and housenumbers
                if k_value=="addr:street":
                    k_value=k_value.replace("addr:","")
                    add_update=update_name(v_value,mapping)
                    all_address[k_value]=add_update["st_name"]
                    suite_info=add_update["suites_bldg"]
                    house_num_info=add_update["house_num"]
                    #here is acknowldege that we have a street value
                    street_value=1

                #For all other "addr:" elments excluding addr elements with two colons, I extract k_value and v_value
                elif ("addr:" in k_value) and (k_value != "addr:street")  and (len(k_value.split(":"))<=2):
                    if k_value=="addr:state":
                        #acknowledge that we have a state value
                        state_value=1
                    if k_value=="addr:unit":
                        house_unit_value==1
                    if k_value=="addr:housenumber":
                        #acknowledge we have a housenumber value
                        house_num_value=1
                        one_house_num=v_value
                    k_value=k_value.replace("addr:","")
                    all_address[k_value]=v_value

                else:
                    #here I extract all other k_values such as "amenity", "cuisine', "name",etc..
                    all_other_features[k_value]=v_value

        #Here, I added the state value of TX if it did not exist. All gps data is in TX
        if state_value==0:
            all_address["state"]="TX"

        #If a housenumber did not exist and there was a housenumber from "addr:street"
        #it was added to "adddr:housenumber"
        if (house_num_value==0) and house_num_info:
            all_address["housenumber"]=house_num_info
            house_num_value=1
        #if two conflicting housenumbers exist, house_num_error=1
        elif (house_num_value==1) and house_num_info:
            if  one_house_num not in house_num_info:
                house_num_error=1

        #auditing non-street elements to replace abbreviations
        for each_suite in mapping2.keys():
            if each_suite in suite_info:
                suite_info=suite_info.replace(each_suite,mapping2[each_suite])

        #adding suite/bldg info in "addr:unit"
        if (house_unit_value==0) and suite_info:
            all_address["unit"]=suite_info
        elif (house_unit_value==1) and (suite_info not in all_address["unit"]):
            all_address["unit"]=all_address["unit"] + suite_info

        #Only if an element has a street value, a house number, and no house number error,
        #will the data from all_address be transferred to the output, "node"
        if (street_value==1) and (house_num_value==1) and (house_num_error==0):
            node["address"]=all_address
            for features in all_other_features.keys():
                node[features]=all_other_features[features]

        #if we have a street name, then we can place additional info in the output, "node"
        if street_value==1:
            node["created"]={}
            node["pos"]=[]
            for name, value in element.attrib.items():
                if name in CREATED:
                    node["created"][name]=value
                elif name=="lat":
                    node["pos"].append(float(element.get("lon")))
                elif name=="lon":
                    node["pos"].append(float(element.get("lat")))
                else:
                    node[name]=value

            node["type"]=element.tag

            for tag2 in element.iter("nd"):
                n_value=tag2.get("ref")
                all_node_ref.append(n_value)

            if all_node_ref !=[]:
                node["node_refs"]=all_node_ref

        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # I added element.clear() to avoid any memory issues during iterative parsing
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
                element.clear()
    return data

def test():

    data = process_map('C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm.txt', True)
    #pprint.pprint(data)

if __name__ == "__main__":
    test()