# audit.py

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 24 22:32:06 2014

@author: vikramk3
"""

"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Path of the OSM extract to audit.
OSMFILE = "C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm.txt"

# Matches the trailing run of non-whitespace characters of a street name
# (starting at a word boundary), i.e. the candidate street type.
street_type_re = re.compile(r'\b\S+\.?$', flags=re.IGNORECASE)

# Street types that are already spelled out properly and need no correction.
expected = [
    "Street",
    "Avenue",
    "Boulevard",
    "Drive",
    "Court",
    "Place",
    "Square",
    "Lane",
    "Road",
    "Trail",
    "Parkway",
    "Commons",
    "Circle",
    "Cove",
    "Highway",
]

# Markers of suite/building/unit information that showed up inside street
# addresses in the Austin, TX OSM data. They are used to strip that
# information from "addr:street"; per the author's evaluation of the data it
# belongs in "addr:unit" instead.
non_st_elements = [
    "Ste.",
    "Ste",
    "STE",
    "Suite",
    "#",
    "Bldg",
    "Bldg.",
    "Bld",
]

# Abbreviations (and misspellings) observed in the Austin, TX OSM data, mapped
# to the full street type they should be replaced with.
# NOTE(review): "Ctr" usually abbreviates "Center" and "CR" "County Road";
# confirm against the data that "Court" is really the intended target.
mapping = {
    "St": "Street",
    "st ": "Street",
    "St.": "Street",
    "Ave": "Avenue",
    "Avene": "Avenue",
    "Rd.": "Road",
    "Rd": "Road",
    "RD": "Road",
    "Dr.": "Drive",
    "Dr": "Drive",
    "Cir": "Circle",
    "Blvd": "Boulevard",
    "Blvd.": "Boulevard",
    "Blvd,": "Boulevard",
    "Blvd.,": "Boulevard",
    "Cv": "Cove",
    "Hwy": "Highway",
    "Ct": "Court",
    "Ctr": "Court",
    "CR": "Court",
    "Ln": "Lane",
    "Pkwy": "Parkway",
}
# The Austin, TX OSM data has a wide variety of "addr:street" values, so it is very difficult to validate every one.
# I did my best to focus on auditing the suffixes and on deleting non-street elements from "addr:street".

def audit_street_type(street_types, street_name):
    """Record street_name under its street type if that type is unexpected.

    The street type is whatever ``street_type_re`` matches in the name.
    Names with no match, or whose type is already in ``expected``, are
    ignored. Otherwise the name is added to ``street_types[<type>]``, so
    ``street_types`` must create a set for missing keys.
    """
    found = street_type_re.search(street_name)
    if not found:
        return

    suffix = found.group()
    if suffix in expected:
        return

    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when the element's 'k' attribute is "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect street names whose street type is not in the expected list.

    The file is parsed incrementally. Every <tag> child of a <node> or <way>
    whose key is "addr:street" is passed to audit_street_type.

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        A defaultdict(set) mapping each unexpected street type to the set of
        street names that end with it.
    """
    street_types = defaultdict(set)

    # The "with" block guarantees the file is closed even if parsing fails.
    # Binary mode hands the raw bytes to the XML parser so it can decode them
    # itself.
    with open(osmfile, "rb") as osm_file:
        # Work on "end" events: on a "start" event only the element's own
        # attributes are guaranteed to be available, its children may not
        # have been parsed yet, so <tag> children could be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "tag":
                # A <tag> ends before its parent does; keep its attributes
                # intact until the parent's own "end" event has been handled.
                continue

            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

            # The element is complete and has been processed; drop its
            # attributes and children to keep memory use down.
            elem.clear()

    return street_types


# A leading house number: three or more digits plus any whitespace after them.
_HOUSE_NUMBER_RE = re.compile(r'\d\d\d+\s*')

# A state/ZIP fragment: whitespace, two (possibly empty) whitespace-separated
# tokens, "TX" and a five-digit number. The "T" used to be written as "\T",
# an unknown escape that Python 3's re module rejects with re.error.
_STATE_ZIP_RE = re.compile(r'\s\S*\s\S*\sTX\S*\s*\d\d\d\d\d')


def update_name(name, mapping, non_street_elements=None):
    """Return a cleaned-up version of an "addr:street" value.

    The clean-up runs in four steps:

    1. Every whitespace-delimited token that is a key of ``mapping`` is
       replaced with the mapped full street type. Only whole tokens are
       replaced, so "Drake Dr" becomes "Drake Drive" and the "Dr" inside
       "Drake" is left alone.
    2. Suite/building/"#" markers together with their numbers
       (e.g. " Ste. 212") are removed; that data belongs in "addr:unit".
    3. A leading house number of three or more digits is removed; it belongs
       in "addr:housenumber". Later occurrences of the same number are kept.
    4. The first state/ZIP fragment ("<token> <token> TX 78701") is removed;
       it belongs in "addr:state" / "addr:postcode".

    Args:
        name: the street name to clean.
        mapping: dict of abbreviation -> full street type.
        non_street_elements: iterable of unit markers to strip. Defaults to
            the module-level ``non_st_elements`` list.

    Returns:
        The cleaned street name.
    """
    if non_street_elements is None:
        non_street_elements = non_st_elements

    # Step 1: expand abbreviations in a single pass. Keys are escaped so that
    # characters such as "." are matched literally, and longer keys are tried
    # first so "St." wins over "St". The lookarounds require whitespace (or
    # the string boundary) on both sides of the abbreviation.
    if mapping:
        abbreviations = sorted(mapping, key=len, reverse=True)
        abbreviation_re = re.compile(
            r'(?<!\S)(?:'
            + '|'.join(re.escape(abbreviation) for abbreviation in abbreviations)
            + r')(?!\S)'
        )
        # A function replacement avoids backslash processing of the mapped text.
        name = abbreviation_re.sub(lambda match: mapping[match.group()], name)

    # Step 2: drop unit markers and the number that follows them.
    for element in non_street_elements:
        unit_re = re.compile(r'\s*' + re.escape(element) + r'\S*\s*\S*\d+')
        name = unit_re.sub('', name)

    # Step 3: cut the house number off the front only (slicing, not
    # str.replace, so an identical number later in the name survives).
    house_number = _HOUSE_NUMBER_RE.match(name)
    if house_number:
        name = name[house_number.end():]

    # Step 4: drop the state and ZIP code fragment.
    name = _STATE_ZIP_RE.sub('', name, count=1)

    return name


def test():
    """Audit OSMFILE and print the findings.

    First pretty-prints the unexpected street types with the street names
    they were found in, then prints every such name next to its cleaned-up
    version as "<original> => <fixed>".
    """
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))

    # Only the names are needed here, not the street-type keys.
    for names in st_types.values():
        for name in names:
            better_name = update_name(name, mapping)
            # A single pre-formatted argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s => %s" % (name, better_name))


# Run the audit report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()