vikramk3

data.py

Oct 15th, 2014
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 19 12:49:47 2014

@author: vikramk3
"""

import xml.etree.ElementTree as ET
import pprint
import re
import codecs
import json
  15. """
  16. Your task is to wrangle the data and transform the shape of the data
  17. into the model we mentioned earlier. The output should be a list of dictionaries
  18. that look like this:
  19.  
  20. {
  21. "id": "2406124091",
  22. "type: "node",
  23. "visible":"true",
  24. "created": {
  25.          "version":"2",
  26.          "changeset":"17206049",
  27.          "timestamp":"2013-08-03T16:43:42Z",
  28.          "user":"linuxUser16",
  29.          "uid":"1219059"
  30.        },
  31. "pos": [41.9757030, -87.6921867],
  32. "address": {
  33.          "housenumber": "5157",
  34.          "postcode": "60625",
  35.          "street": "North Lincoln Ave"
  36.        },
  37. "amenity": "restaurant",
  38. "cuisine": "mexican",
  39. "name": "La Cabana De Don Luis",
  40. "phone": "1 (773)-271-5176"
  41. }
  42.  
  43. You have to complete the function 'shape_element'.
  44. We have provided a function that will parse the map file, and call the function with the element
  45. as an argument. You should return a dictionary, containing the shaped data for that element.
  46. We have also provided a way to save the data in a file, so that you could use
  47. mongoimport later on to import the shaped data into MongoDB. You could also do some cleaning
  48. before doing that, like in the previous exercise, but for this exercise you just have to
  49. shape the structure.
  50.  
  51. In particular the following things should be done:
  52. - you should process only 2 types of top level tags: "node" and "way"
  53. - all attributes of "node" and "way" should be turned into regular key/value pairs, except:
  54.    - attributes in the CREATED array should be added under a key "created"
  55.    - attributes for latitude and longitude should be added to a "pos" array,
  56.      for use in geospacial indexing. Make sure the values inside "pos" array are floats
  57.      and not strings.
  58. - if second level tag "k" value contains problematic characters, it should be ignored
  59. - if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
  60. - if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  61.  same as any other tag.
  62. - if there is a second ":" that separates the type/direction of a street,
  63.  the tag should be ignored, for example:
  64.  
  65. <tag k="addr:housenumber" v="5158"/>
  66. <tag k="addr:street" v="North Lincoln Avenue"/>
  67. <tag k="addr:street:name" v="Lincoln"/>
  68. <tag k="addr:street:prefix" v="North"/>
  69. <tag k="addr:street:type" v="Avenue"/>
  70. <tag k="amenity" v="pharmacy"/>
  71.  
  72.  should be turned into:
  73.  
  74. {...
  75. "address": {
  76.    "housenumber": 5158,
  77.    "street": "North Lincoln Avenue"
  78. }
  79. "amenity": "pharmacy",
  80. ...
  81. }
  82.  
  83. - for "way" specifically:
  84.  
  85.  <nd ref="305896090"/>
  86.  <nd ref="1719825889"/>
  87.  
  88. should be turned into
  89. "node_ref": ["305896090", "1719825889"]
  90. """


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
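# lower matches keys made only of lowercase letters and underscores (e.g. "amenity"),
# lower_colon matches single-colon keys such as "addr:street", and problemchars
# flags keys containing whitespace or punctuation that would be awkward as MongoDB
# field names.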

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

# These are the expected "proper" names for the street types
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Circle", "Cove", "Highway"]

# After studying the Austin, TX OSM data, these non-street elements were present in the street address.
# I will use this list to remove these non-street elements from the street address.
# These non-street elements belong in the "addr:unit" element. I based this on my evaluation of the Austin, Texas OSM data.
non_st_elements = ["Ste.", "Ste", "STE", "Suite", "#", "Bldg", "Bldg.", "Bld"]

# I updated this dictionary to include all the abbreviations I observed in the Austin, TX OSM data
mapping = { "St": "Street", "st ": "Street", "St.": "Street",
            "Ave": "Avenue", "Avene": "Avenue",
            "Rd.": "Road", "Rd": "Road", "RD": "Road",
            "Dr.": "Drive", "Dr": "Drive",
            "Cir": "Circle",
            "Blvd": "Boulevard", "Blvd.": "Boulevard", "Blvd,": "Boulevard", "Blvd.,": "Boulevard",
            "Cv": "Cove", "Hwy": "Highway",
            "Ct": "Court", "Ctr": "Court", "CR": "Court",
            "Ln": "Lane", "Pkwy": "Parkway"
            }

# I created this second dictionary to correct the abbreviations of the non-street elements.
# These audited elements will be placed in "addr:unit".
mapping2 = {"Ste.": "Suite", "Ste": "Suite", "Bldg": "Building", "Bldg.": "Building", "Bld": "Building"}


def update_name(name, mapping):
    # In this for loop, I first created a regular expression to find all the abbreviations in the Austin, TX OSM data.
    # Secondly, I replaced these abbreviations with the proper full-length suffixes.
    # The mapping key is escaped so literal dots in abbreviations like "St." are not treated as regex wildcards.
    for st_element in mapping.keys():
        if re.search(r'\s*' + re.escape(st_element) + r'\s*(?!\S)', name):
            name = name.replace(st_element, mapping[st_element])
            break

    # In this section, I initialize four variables used to collect all the non-street elements.
    # For example, suite_bldg_data collects any suite or building info during each loop.
    # Then the suite_bldg_data collected from each loop is aggregated into all_suite_bldg_data.
    # This is also the case for house_num_data and its aggregator all_house_num_data.
    suite_bldg_data=""
    house_num_data=""
    all_suite_bldg_data=""
    all_house_num_data=""

    # In this for loop, I first created a regular expression to find all suite, building and "#" elements along with their numbers.
    # Secondly, I deleted these non-street expressions and their numbers (ex. Ste. 212) from "addr:street".
    # Based on my study of the Austin, TX OSM data, this information belongs in "addr:unit".
    for each_element in non_st_elements:
        all_suites=re.search(r'\s*' + re.escape(each_element) + r'\S*\s*\S*\d+', name)
        if all_suites:
            suite_bldg_data=all_suites.group()
            all_suite_bldg_data+= suite_bldg_data
            name = name.replace(suite_bldg_data,"")

    # I also found some house numbers in the "addr:street" field. I deleted them from "addr:street".
    # Based on my study of the Austin, TX OSM data, this information belongs in "addr:housenumber".
    house_numbers=re.match(r'\d\d\d+\s*',name)
    if house_numbers:
        house_num_data=house_numbers.group()
        all_house_num_data += house_num_data
        name = name.replace(house_num_data,"")

    # I also found that some "addr:street" values had the state and zip code. I deleted this from "addr:street".
    # This data belongs in "addr:state" and "addr:postcode".
    state_zip=re.search(r'\s\S*\s\S*\sTX\S*\s*\d\d\d\d\d',name)
    if state_zip:
        state_zip_info=state_zip.group()
        name = name.replace(state_zip_info,"")

    output={"st_name":name, "suites_bldg":all_suite_bldg_data, "house_num":all_house_num_data}

    return output


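# Rough usage sketch of update_name (the street string below is a made-up example,
# not taken from the Austin OSM file):
#
#   update_name("N Lamar Blvd Ste. 200", mapping)
#   # -> {"st_name": "N Lamar Boulevard",
#   #     "suites_bldg": " Ste. 200",
#   #     "house_num": ""}
#
# "Blvd" is expanded through `mapping`, the suite fragment is pulled out so it can
# later be stored under "addr:unit", and a leading house number (3+ digits) would
# be returned under "house_num".
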
def shape_element(element):

    # Here, I initialized four variables:
    # node is the primary output of this procedure.
    # all_address is used to temporarily hold the "addr:" fields.
    # all_address data is transferred to node if a street address exists,
    # a housenumber exists, and two conflicting house numbers do not exist.
    node = {}
    all_address={}
    all_node_ref=[]
    all_other_features={}

    # Here I initialize seven variables:
    # state_value, street_value, house_unit_value, and house_num_value are used to verify if these values exist for each element.
    # house_num_error is used to determine if conflicting house numbers exist.
    # house_num_info and suite_info are used to collect house number and suite/bldg info from "addr:street".
    state_value=0
    street_value=0
    house_num_value=0
    house_num_error=0
    house_num_info=''
    suite_info=''
    house_unit_value=0

    if element.tag == "node" or element.tag == "way":

        # Here I run through the tag children in each element and extract k_value and v_value
        for tag in element.iter("tag"):
            v_value=tag.get("v")
            k_value=tag.get("k")
            search_res=re.search(problemchars,k_value)
            # I check if there are no problem characters in k_value
            if search_res==None:
                # Here, for "addr:street", I audit the street type and extract suite/bldg and housenumbers
                if k_value=="addr:street":
                    k_value=k_value.replace("addr:","")
                    add_update=update_name(v_value,mapping)
                    all_address[k_value]=add_update["st_name"]
                    suite_info=add_update["suites_bldg"]
                    house_num_info=add_update["house_num"]
                    # here I acknowledge that we have a street value
                    street_value=1

                # For all other "addr:" elements, excluding addr elements with two colons, I extract k_value and v_value
                elif ("addr:" in k_value) and (k_value != "addr:street") and (len(k_value.split(":"))<=2):
                    if k_value=="addr:state":
                        # acknowledge that we have a state value
                        state_value=1
                    if k_value=="addr:unit":
                        # acknowledge that we have a unit value
                        house_unit_value=1
                    if k_value=="addr:housenumber":
                        # acknowledge we have a housenumber value
                        house_num_value=1
                        one_house_num=v_value
                    k_value=k_value.replace("addr:","")
                    all_address[k_value]=v_value

                else:
                    # here I extract all other k_values such as "amenity", "cuisine", "name", etc.
                    all_other_features[k_value]=v_value

        # Here, I added the state value of TX if it did not exist. All GPS data is in TX.
        if state_value==0:
            all_address["state"]="TX"

        # If a housenumber did not exist and there was a housenumber from "addr:street",
        # it was added to "addr:housenumber".
        if (house_num_value==0) and house_num_info:
            all_address["housenumber"]=house_num_info
            house_num_value=1
        # if two conflicting housenumbers exist, house_num_error=1
        elif (house_num_value==1) and house_num_info:
            if one_house_num not in house_num_info:
                house_num_error=1

        # auditing non-street elements to replace abbreviations
        for each_suite in mapping2.keys():
            if each_suite in suite_info:
                suite_info=suite_info.replace(each_suite,mapping2[each_suite])

        # adding suite/bldg info in "addr:unit"
        if (house_unit_value==0) and suite_info:
            all_address["unit"]=suite_info
        elif (house_unit_value==1) and (suite_info not in all_address["unit"]):
            all_address["unit"]=all_address["unit"] + suite_info

        # Only if an element has a street value, a house number, and no house number error,
        # will the data from all_address be transferred to the output, "node".
        if (street_value==1) and (house_num_value==1) and (house_num_error==0):
            node["address"]=all_address
            for features in all_other_features.keys():
                node[features]=all_other_features[features]

        # if we have a street name, then we can place additional info in the output, "node"
        if street_value==1:
            node["created"]={}
            node["pos"]=[]
            for name, value in element.attrib.items():
                if name in CREATED:
                    node["created"][name]=value
                elif name=="lat" or name=="lon":
                    # latitude and longitude are handled together below so that
                    # "pos" is always [lat, lon] regardless of attribute order
                    continue
                else:
                    node[name]=value

            if element.get("lat") is not None and element.get("lon") is not None:
                node["pos"]=[float(element.get("lat")), float(element.get("lon"))]

            node["type"]=element.tag

            for tag2 in element.iter("nd"):
                n_value=tag2.get("ref")
                all_node_ref.append(n_value)

            if all_node_ref !=[]:
                node["node_refs"]=all_node_ref

        return node
    else:
        return None
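
# Rough usage sketch of shape_element (the XML node below is made up for illustration,
# not taken from the Austin OSM file):
#
#   sample = ET.fromstring(
#       '<node id="1" lat="30.2672" lon="-97.7431" version="2" changeset="10" '
#       'timestamp="2014-01-01T00:00:00Z" user="demo" uid="42">'
#       '<tag k="addr:housenumber" v="501"/>'
#       '<tag k="addr:street" v="Congress Ave"/>'
#       '</node>')
#   shape_element(sample)
#   # -> {"id": "1", "type": "node", "pos": [30.2672, -97.7431],
#   #     "created": {"version": "2", "changeset": "10",
#   #                 "timestamp": "2014-01-01T00:00:00Z", "user": "demo", "uid": "42"},
#   #     "address": {"housenumber": "501", "street": "Congress Avenue", "state": "TX"}}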


def process_map(file_in, pretty = False):
    # I added element.clear() to avoid any memory issues during iterative parsing
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
            if element.tag == "node" or element.tag == "way":
                # clear every processed "node"/"way" element (not only the ones that
                # produced output) so their child tags do not accumulate in memory
                element.clear()
    return data

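# The JSON written by process_map is newline-delimited (one document per line),
# which is the format mongoimport reads by default. For the Austin file used in
# test() below, the output would be austin_texas.osm.txt.json, and a typical
# import (the database and collection names here are only placeholders) might
# look like:
#
#   mongoimport --db osm --collection austin --file austin_texas.osm.txt.json
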
def test():

    data = process_map('C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm.txt', True)
    #pprint.pprint(data)

if __name__ == "__main__":
    test()