Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Wed Sep 24 22:32:06 2014
- @author: vikramk3
- """
- """
- Your task in this exercise has two steps:
- - audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
- the unexpected street types to the appropriate ones in the expected list.
- You have to add mappings only for the actual problems you find in this OSMFILE,
- not a generalized solution, since that may and will depend on the particular area you are auditing.
- - write the update_name function, to actually fix the street name.
- The function takes a string with street name as an argument and should return the fixed name
- We have provided a simple test so that you see what exactly is expected
- """
- import xml.etree.cElementTree as ET
- from collections import defaultdict
- import re
- import pprint
# Location of the Austin, TX OSM extract that is audited.
OSMFILE = "C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm.txt"

# Captures the last whitespace-delimited token of a street name (its "type").
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are already spelled out properly and need no fixing.
expected = [
    "Street",
    "Avenue",
    "Boulevard",
    "Drive",
    "Court",
    "Place",
    "Square",
    "Lane",
    "Road",
    "Trail",
    "Parkway",
    "Commons",
    "Circle",
    "Cove",
    "Highway",
]

# Unit designators observed inside "addr:street" values of the Austin, TX
# data. They belong in "addr:unit", so update_name() strips them (together
# with the number that follows) from the street name.
non_st_elements = [
    "Ste.",
    "Ste",
    "STE",
    "Suite",
    "#",
    "Bldg",
    "Bldg.",
    "Bld",
]

# Abbreviations observed in the Austin, TX data and their full spelling.
# NOTE(review): "Ctr" and "CR" are mapped to "Court" here; they often stand
# for "Center" and "County Road" -- confirm against the data.
mapping = {
    "St": "Street",
    "st ": "Street",
    "St.": "Street",
    "Ave": "Avenue",
    "Avene": "Avenue",
    "Rd.": "Road",
    "Rd": "Road",
    "RD": "Road",
    "Dr.": "Drive",
    "Dr": "Drive",
    "Cir": "Circle",
    "Blvd": "Boulevard",
    "Blvd.": "Boulevard",
    "Blvd,": "Boulevard",
    "Blvd.,": "Boulevard",
    "Cv": "Cove",
    "Hwy": "Highway",
    "Ct": "Court",
    "Ctr": "Court",
    "CR": "Court",
    "Ln": "Lane",
    "Pkwy": "Parkway",
}
# The Austin, TX OSM data has a wide variety of "addr:street" values, so it is very difficult to validate every one.
# I focused on auditing the street-type suffixes and on deleting non-street elements from "addr:street".
def audit_street_type(street_types, street_name):
    """Record street_name under its street type when that type is unexpected.

    The street type is the trailing token matched by ``street_type_re``.
    Names with no match, or whose type is already listed in ``expected``,
    are ignored.

    Args:
        street_types: Mapping of street type -> set of street names; it is
            updated in place (a ``defaultdict(set)`` in this file).
        street_name: The raw "addr:street" value to inspect.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)
def is_street_name(elem):
    """Tell whether an element's ``k`` attribute is exactly "addr:street".

    Raises:
        KeyError: If the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"
def audit(osmfile):
    """Collect the unexpected street types found in an OSM XML file.

    Every "addr:street" tag of each <node> and <way> is passed to
    audit_street_type().

    Args:
        osmfile: Path of the OSM XML file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each street type that is not in
        ``expected`` to the set of street names that end with it.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the
    # file; the ``with`` block guarantees the handle is closed.
    with open(osmfile, "rb") as osm_file:
        # "end" events are required: on a "start" event the children of an
        # element may not have been parsed yet, so <tag> children could be
        # missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
            # Free memory during iterative parsing. Only top-level elements
            # are cleared, so a child <tag> still has its attributes when
            # its parent is processed.
            if elem.tag in ("node", "way", "relation"):
                elem.clear()
    return street_types
def _expand_abbreviations(name, mapping):
    """Replace every whitespace-delimited token that is a key of mapping."""
    # Keys are stripped so that entries such as "st " still match a token.
    lookup = {key.strip(): full for key, full in mapping.items() if key.strip()}
    # Whole-token substitution: a key never rewrites part of a longer word
    # (e.g. the "St" in "Stassney") and "." in a key stays a literal dot.
    return re.sub(r'\S+', lambda m: lookup.get(m.group(), m.group()), name)


def _remove_unit_designators(name):
    """Drop suite/building/"#" markers together with the number after them."""
    for marker in non_st_elements:
        found = re.search(r'\s*' + re.escape(marker) + r'\S*\s*\S*\d+', name)
        if found:
            # Cut out exactly the matched span instead of every equal substring.
            name = name[:found.start()] + name[found.end():]
    return name


def update_name(name, mapping):
    """Return a cleaned-up version of an "addr:street" value.

    Steps, applied in order:
      1. Abbreviated tokens listed in ``mapping`` are spelled out.
      2. Unit designators from ``non_st_elements`` and their numbers
         (e.g. "Ste. 212") are removed; they belong in "addr:unit".
      3. A leading number of three or more digits is removed; it belongs in
         "addr:housenumber".
      4. A trailing "<token> <token> TX <5 digits>" part is removed; it
         belongs in "addr:state" / "addr:postcode".

    Args:
        name: The raw street name.
        mapping: Dict of abbreviation -> full street type.

    Returns:
        The corrected street name.
    """
    name = _expand_abbreviations(name, mapping)
    name = _remove_unit_designators(name)

    house_number = re.match(r'\d\d\d+\s*', name)
    if house_number:
        name = name[house_number.end():]

    # The original pattern spelled "TX" as "\TX"; "\T" is an invalid escape
    # that raises re.error on Python 3.7+, so the plain letters are used.
    # NOTE(review): the pattern also consumes the two tokens before "TX"
    # (presumably the city name) -- confirm against the data.
    state_zip = re.search(r'\s\S*\s\S*\sTX\S*\s*\d\d\d\d\d', name)
    if state_zip:
        name = name[:state_zip.start()] + name[state_zip.end():]
    return name
def test():
    """Audit OSMFILE and print each problematic street name with its fix.

    Pretty-prints the street types collected by audit(), then prints one
    ``original => cleaned`` line for every street name found, using
    update_name() with the module-level ``mapping``.
    """
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))
    # .values() and the single-argument print form behave the same on
    # Python 2 and Python 3 (iteritems() and the print statement do not).
    for ways in st_types.values():
        for name in ways:
            better_name = update_name(name, mapping)
            print("%s => %s" % (name, better_name))
# Run the audit demonstration only when the file is executed as a script.
if __name__ == "__main__":
    test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement