Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.
- In the first exercise we want you to audit the datatypes that can be found in some particular fields in the dataset.
- The possible types of values can be:
- - 'NoneType' if the value is a string "NULL" or an empty string ""
- - 'list', if the value starts with "{"
- - 'int', if the value can be cast to int
- - 'float', if the value can be cast to float, but is not an int
- - 'str', for all other values
- The audit_file function should return a dictionary containing fieldnames and the datatypes that can be found in the field.
- All the data initially is a string, so you have to do some checks on the values first.
- """
- import codecs
- import csv
- import json
- import pprint
# Path of the CSV file that test() hands to audit_file().
CITIES = 'cities.csv'

# Column names whose value types audit_file() is asked to collect.
FIELDS = [
    "name",
    "timeZone_label",
    "utcOffset",
    "homepage",
    "governmentType_label",
    "isPartOf_label",
    "areaCode",
    "populationTotal",
    "elevation",
    "maximumElevation",
    "minimumElevation",
    "populationDensity",
    "wgs84_pos#lat",
    "wgs84_pos#long",
    "areaLand",
    "areaMetro",
    "areaUrban",
]
- # Record type objects rather than type names:
- #   type(1.1)  instead of 'float'
- #   type(1)    instead of 'int'
- #   type([])   instead of 'list'
- #   type(None) instead of 'NoneType'
def is_int(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Args:
        s: The value to probe, normally a raw string read from the CSV.

    Returns:
        True when ``int(s)`` does not raise; False when it raises
        ``ValueError`` (e.g. ``"abc"``, ``""``, ``"3.14"``) or ``TypeError``
        (e.g. ``None`` or a list).
    """
    try:
        int(s)
    except (ValueError, TypeError):
        # TypeError covers non-string inputs such as None, which int()
        # rejects with a different exception than malformed text.
        return False
    return True
def is_float(s):
    """Return True if ``float(s)`` succeeds, False otherwise.

    Integer-looking strings such as ``"42"`` also return True, because
    ``float()`` accepts them; callers that need to tell the two apart
    should check for int first.

    Args:
        s: The value to probe, normally a raw string read from the CSV.

    Returns:
        True when ``float(s)`` does not raise; False when it raises
        ``ValueError`` (e.g. ``"abc"``, ``""``) or ``TypeError``
        (e.g. ``None`` or a list).
    """
    try:
        float(s)
    except (ValueError, TypeError):
        # TypeError covers non-string inputs such as None, which float()
        # rejects with a different exception than malformed text.
        return False
    return True
def _value_type(value):
    """Map one raw CSV cell to the type object that describes it.

    The checks run in this order, and the first match wins:
    ``type(None)`` for ``None``, ``"NULL"`` or ``""``; ``list`` for text
    starting with ``"{"``; ``int`` if ``int()`` accepts it; ``float`` if
    ``float()`` accepts it; ``str`` for everything else.
    """
    if value is None or value in ("NULL", ""):
        return type(None)
    if value.startswith("{"):
        return list
    try:
        int(value)
    except ValueError:
        pass
    else:
        return int
    try:
        float(value)
    except ValueError:
        return str
    return float


def audit_file(filename, fields):
    """Collect the set of value types found in selected columns of a CSV file.

    The first row of the file is taken as the header. Every later row is
    parsed with the ``csv`` module, so quoted cells that contain commas or
    escaped quotes are handled, and each requested cell is classified by
    ``_value_type``.

    Args:
        filename: Path of the CSV file to audit.
        fields: Iterable of column names to audit.

    Returns:
        A dict mapping each name in ``fields`` to a set of type objects
        (``type(None)``, ``list``, ``int``, ``float``, ``str``) seen in that
        column. A file with no header row yields an empty set per field.

    Raises:
        ValueError: If any requested field is absent from the header row.
    """
    fieldtypes = {field: set() for field in fields}

    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        if header is None:
            # Empty file: there is nothing to audit.
            return fieldtypes

        missing = [field for field in fields if field not in header]
        if missing:
            raise ValueError(
                "fields not found in CSV header: %s" % ", ".join(missing)
            )

        # NOTE(review): every row below the header is audited. If the data
        # file carries extra non-data rows there (cannot be told from this
        # module), their cells will be recorded as str -- confirm against
        # the actual file and skip them here if needed.
        for row in reader:
            for field in fields:
                # Rows shorter than the header give None for the missing
                # cells; _value_type treats that like an empty value.
                fieldtypes[field].add(_value_type(row[field]))

    return fieldtypes
def test():
    """Audit the CITIES file and check two columns against expected type sets.

    Pretty-prints the full audit result, then asserts the types reported
    for the "areaLand" and "areaMetro" columns.
    """
    audited = audit_file(CITIES, FIELDS)
    pprint.pprint(audited)

    none_type = type(None)
    assert audited["areaLand"] == {none_type, list, float}
    assert audited['areaMetro'] == {float, none_type}
# Run the self-check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement