Advertisement
vikramk3

audit.py

Oct 15th, 2014
522
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.13 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Wed Sep 24 22:32:06 2014
  4.  
  5. @author: vikramk3
  6. """
  7.  
  8. """
  9. Your task in this exercise has two steps:
  10.  
  11. - audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
  12.    the unexpected street types to the appropriate ones in the expected list.
  13.    You have to add mappings only for the actual problems you find in this OSMFILE,
  14.    not a generalized solution, since that may and will depend on the particular area you are auditing.
  15. - write the update_name function, to actually fix the street name.
  16.    The function takes a string with street name as an argument and should return the fixed name
  17.    We have provided a simple test so that you see what exactly is expected
  18. """
  19. import xml.etree.cElementTree as ET
  20. from collections import defaultdict
  21. import re
  22. import pprint
  23.  
  24. OSMFILE = "C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm.txt"
  25. street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
  26.  
  27. # These are the expected "proper names for the street types
  28. expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
  29.             "Trail", "Parkway", "Commons", "Circle", "Cove", "Highway"]
  30.  
  31. # After studying the Austin, Tx OSM data,, these non-street elements were present in the street address
  32. # I will use this list to remove these non-street elements from the street address
  33. #These non-street elements belong in the "addr:unit" element. I based this on my evaluation of the Austin, Texas OSM data            
  34. non_st_elements=["Ste.","Ste","STE", "Suite", "#", "Bldg", "Bldg.", "Bld"]
  35.  
  36. # I updated this dictionary to include all the abbreviations I observed in the Austin, Tx OSM data
  37. mapping = { "St": "Street", "st ":"Street",
  38.             "St.": "Street", "Ave": "Avenue","Avene":"Avenue", "Rd.":"Road", "Rd":"Road","RD":"Road", "Dr.":"Drive", "Dr":"Drive", "Cir":"Circle", "Blvd":"Boulevard", "Blvd.":"Boulevard", "Blvd,":"Boulevard","Blvd.,":"Boulevard", "Cv":"Cove", "Hwy":"Highway", "Ct":"Court","Ctr":"Court", "CR":"Court", "Ln":"Lane", "Pkwy":"Parkway"
  39.             }
  40. #The Austin,TX OSM data has a wide variety of "addr:street" elements. It is very difficult to validate every element.
  41. #I did my best to focus on auditing the suffixes as well as delete non-street elements from "addr:street"
  42.  
  43. def audit_street_type(street_types, street_name):
  44.     m = street_type_re.search(street_name)
  45.     if m:
  46.         street_type = m.group()
  47.         if street_type not in expected:
  48.             street_types[street_type].add(street_name)
  49.  
  50.  
  51. def is_street_name(elem):
  52.     return (elem.attrib['k'] == "addr:street")
  53.    
  54.  
  55. def audit(osmfile):
  56.     #I added elem.clear() after each iteration to prevent any memory problems with iterative parsing
  57.     osm_file = open(osmfile, "r")
  58.     street_types = defaultdict(set)
  59.     for event, elem in ET.iterparse(osm_file, events=("start",)):
  60.  
  61.         if elem.tag == "node" or elem.tag == "way":
  62.             for tag in elem.iter("tag"):
  63.                 if is_street_name(tag):
  64.                     audit_street_type(street_types, tag.attrib['v'])
  65.  
  66.         elem.clear()
  67.  
  68.     return street_types
  69.  
  70.  
  71. def update_name(name, mapping):
  72.     # In this for loop, I first created a regular expression to find all the abbreviations in the Austin,TX OSM data
  73.     #Secondly, I replaced these abbreviations with the proper full length suffixes
  74.     for st_element in mapping.keys():
  75.         if re.search(r'\s*' + st_element + r'\s*(?!\S)', name):
  76.             name = name.replace(st_element, mapping[st_element])
  77.            
  78.      # In this for loop, I first created a regular expression to find all suite, building and "#" elements along with their numbers
  79.      #Secondly, I deleted these non-street expressions and their numbers (ex. Ste. 212) from "addr:street"
  80.      #Based on my study of the Austin,TX OSM data, this information belongs in "addr:unit"
  81.     for each_element in non_st_elements:
  82.         all_suites=re.search(r'\s*' + each_element + r'\S*\s*\S*\d+', name)
  83.         if all_suites:
  84.             suite_bldg=all_suites.group()
  85.             name = name.replace(suite_bldg,"")
  86.            
  87.     #I also found some house numbers in the "addr:street" field. I deleted it from "addr:street"
  88.     #Based on my study of the Austin,TX OSM data, this information belongs in "addr:housenumber"
  89.         house_numbers=re.match(r'\d\d\d+\s*',name)
  90.         if house_numbers:
  91.             temp1=house_numbers.group()
  92.             name = name.replace(temp1,"")
  93.            
  94.     #I also found that some "addr:street" had the state and zip code. I deleted this from the "addr:street"
  95.     #This data belongs in "addr:state" and "addr:postcode"
  96.         state_zip=re.search(r'\s\S*\s\S*\s\TX\S*\s*\d\d\d\d\d',name)
  97.         if state_zip:
  98.             temp2=state_zip.group()
  99.             name = name.replace(temp2,"")
  100.  
  101.     return name
  102.  
  103.  
  104. def test():
  105.     st_types = audit(OSMFILE)
  106.     pprint.pprint(dict(st_types))
  107.  
  108.     for st_type, ways in st_types.iteritems():
  109.         for name in ways:
  110.             better_name = update_name(name, mapping)
  111.             print name, "=>", better_name
  112.  
  113.  
  114.  
  115. if __name__ == '__main__':
  116.     test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement