# tags.py

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 24 21:56:27 2014

@author: vikramk3
"""

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into MongoDB, you should
check the "k" value for each "<tag>" and see if they can be valid keys in MongoDB,
as well as see if there are any other potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data model
and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with problematic characters.
Please complete the function 'key_type'.
"""


# Key made up only of lowercase ASCII letters and underscores, e.g. "highway".
lower = re.compile(r'^([a-z]|_)*$')
# Two lowercase/underscore segments joined by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character treated as problematic inside a key (punctuation, whitespace).
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    Args:
        element: An ElementTree element. Anything whose tag is not "tag"
            is ignored.
        keys: Dict of running counts with the entries "lower",
            "lower_colon", "problemchars" and "other". It is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.

    The categories are tried in order and the first hit wins:
        "lower"        - the whole key matches ``lower``
        "lower_colon"  - the whole key matches ``lower_colon``
        "problemchars" - the key contains a character from ``problemchars``
        "other"        - none of the above, or the tag has no "k" attribute
    """
    if element.tag != "tag":
        return keys

    k_value = element.get("k")

    if k_value is None:
        # A <tag> without a "k" attribute has no key to inspect; count it as
        # "other" instead of letting the regex call raise TypeError.
        category = "other"
    elif lower.fullmatch(k_value):
        # fullmatch rather than search: "$" also matches just before a
        # trailing newline, which would let "name\n" pass as a clean key.
        category = "lower"
    elif lower_colon.fullmatch(k_value):
        category = "lower_colon"
    elif problemchars.search(k_value):
        category = "problemchars"
    else:
        category = "other"

    keys[category] += 1
    return keys


def process_map(filename):
    """Stream through an XML file and tally the key category of every element.

    Each element produced by ``ET.iterparse`` is handed to ``key_type``
    together with the running counts, then cleared before the next one is
    read.

    Args:
        filename: Source passed straight to ``ET.iterparse``.

    Returns:
        Dict with the counts for "lower", "lower_colon", "problemchars"
        and "other".
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for parsed in ET.iterparse(filename):
        node = parsed[1]
        counts = key_type(node, counts)
        # Drop the element's children, text and attributes once it has been
        # counted so the incrementally built tree does not keep them around.
        node.clear()

    return counts


def test(filename='C:\\Users\\vikramk3\\Documents\\Courses\\Data_Wrangling\\austin_texas.osm'):
    """Run the key audit on one file and pretty-print the resulting counts.

    Args:
        filename: Path handed to ``process_map``. The default is the path
            that used to be hard-coded here, so calling ``test()`` with no
            argument behaves as before; pass another path to audit a
            different file.
    """
    keys = process_map(filename)
    pprint.pprint(keys)


# Run the audit only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    test()