Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Wed Sep 24 21:56:27 2014
- @author: vikramk3
- """
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import xml.etree.ElementTree as ET
- import pprint
- import re
- """
- Your task is to explore the data a bit more.
- Before you process the data and add it into MongoDB, you should
- check the "k" value for each "<tag>" and see if they can be valid keys in MongoDB,
- as well as see if there are any other potential problems.
- We have provided you with 3 regular expressions to check for certain patterns
- in the tags. As we saw in the quiz earlier, we would like to change the data model
- and expand the "addr:street" type of keys to a dictionary like this:
- {"address": {"street": "Some value"}}
- So, we have to see if we have such tags, and if we have any tags with problematic characters.
- Please complete the function 'key_type'.
- """
# Keys built from lowercase ASCII letters and underscores (the empty string
# also matches because of the `*` quantifier), e.g. "highway".
lower = re.compile(r'^([a-z]|_)*$')
# Two lowercase/underscore parts separated by exactly one colon,
# e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# At least one character from the listed set of problematic characters.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    The key is checked against the module-level patterns in a fixed order
    and the first match wins: ``lower``, then ``lower_colon``, then
    ``problemchars``; anything else is counted as ``"other"``.

    Args:
        element: An ElementTree element. Elements whose ``tag`` is not
            ``"tag"`` are ignored.
        keys: Dict with integer counters under the keys ``"lower"``,
            ``"lower_colon"``, ``"problemchars"`` and ``"other"``. It is
            updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    k_value = element.get("k")
    if k_value is None:
        # A <tag> without a "k" attribute fits none of the patterns; count
        # it as "other" instead of letting re.search() raise TypeError.
        category = "other"
    elif lower.search(k_value):
        category = "lower"
    elif lower_colon.search(k_value):
        category = "lower_colon"
    elif problemchars.search(k_value):
        category = "problemchars"
    else:
        category = "other"

    keys[category] += 1
    return keys
def process_map(filename):
    """Tally the key categories of the elements parsed from *filename*.

    The file is parsed incrementally with ``ET.iterparse`` and every
    element it yields is handed to ``key_type``.

    Args:
        filename: Path of the XML file to scan.

    Returns:
        Dict with the counters ``"lower"``, ``"lower_colon"``,
        ``"problemchars"`` and ``"other"``.
    """
    keys = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, node in ET.iterparse(filename):
        keys = key_type(node, keys)
        # Clear each element once it has been counted to avoid memory
        # problems with the iterative parsing.
        node.clear()
    return keys
def test(filename='C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm'):
    """Run ``process_map`` on *filename* and pretty-print the counters.

    Args:
        filename: Path of the OSM XML file to audit. Defaults to the
            original author's local copy of the Austin extract, so calling
            ``test()`` with no argument behaves as before.
    """
    keys = process_map(filename)
    pprint.pprint(keys)


# Run the audit only when the file is executed as a script, not on import.
if __name__ == "__main__":
    test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement