Advertisement
vikramk3

Untitled

Aug 17th, 2014
228
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.37 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. In this problem set you work with cities infobox data, audit it, come up with a cleaning idea and then clean it up.
  5. In the first exercise we want you to audit the datatypes that can be found in some particular fields in the dataset.
  6. The possible types of values can be:
  7. - 'NoneType' if the value is a string "NULL" or an empty string ""
  8. - 'list', if the value starts with "{"
  9. - 'int', if the value can be cast to int
  10. - 'float', if the value can be cast to float, but is not an int
  11. - 'str', for all other values
  12.  
  13. The audit_file function should return a dictionary containing fieldnames and the datatypes that can be found in the field.
  14. All the data initially is a string, so you have to do some checks on the values first.
  15.  
  16. """
  17. import codecs
  18. import csv
  19. import json
  20. import pprint
  21.  
  22. CITIES = 'cities.csv'
  23.  
  24. FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label", "isPartOf_label", "areaCode", "populationTotal",
  25.           "elevation", "maximumElevation", "minimumElevation", "populationDensity", "wgs84_pos#lat", "wgs84_pos#long",
  26.           "areaLand", "areaMetro", "areaUrban"]
  27.          
  28. # type(1.1) instead of 'float'
  29. # type(1) instead of 'int'
  30. # type([]) instead of 'list'
  31. # type(None) instead of 'NoneType'
  32.          
  33. def is_int(s):
  34.     try:
  35.         int(s)
  36.         return True
  37.     except ValueError:
  38.         return False
  39.        
  40. def is_float(s):
  41.     try:
  42.         float(s)
  43.         return True
  44.     except ValueError:
  45.         return False
  46.  
  47. def audit_file(filename, fields):
  48.     with open(filename,'rb') as f:
  49.         fieldtypes={}
  50.         index_fields={}
  51.         for field in f:
  52.             all_fields = field.split('","')
  53.             all_fields[0]=all_fields[0].strip('"')
  54.             if (all_fields[0]=='URI') and (all_fields[1]=='rdf-schema#label'):
  55.                 for each_field in fields:
  56.                     print each_field
  57.                     index_fields[each_field]=all_fields.index(each_field)
  58.                     fieldtypes[each_field]=[]
  59.             else:
  60.                 for each_field in fields:
  61.                     print each_field
  62.                     cities_index=index_fields[each_field]
  63.                     if (all_fields[cities_index]=='NULL') or (all_fields[cities_index]==''):
  64.                         if type(None) not in fieldtypes[each_field]:
  65.                             fieldtypes[each_field].append(type(None))
  66.                     elif all_fields[cities_index].startswith('{'):
  67.                         if type([]) not in fieldtypes[each_field]:
  68.                             fieldtypes[each_field].append(type([]))
  69.                     elif is_int(all_fields[cities_index]):
  70.                         if type(1) not in fieldtypes[each_field]:
  71.                             fieldtypes[each_field].append(type(1))
  72.                     elif is_float(all_fields[cities_index]):
  73.                         if type(1.1) not in fieldtypes[each_field]:
  74.                             fieldtypes[each_field].append(type(1.1))
  75.                    
  76.  
  77.     # YOUR CODE HERE
  78.  
  79.  
  80.     return fieldtypes
  81.  
  82.  
  83. def test():
  84.     fieldtypes = audit_file(CITIES, FIELDS)
  85.  
  86.     pprint.pprint(fieldtypes)
  87.  
  88.     assert fieldtypes["areaLand"] == set([type(None),type([]),type(1.1)])
  89.     assert fieldtypes['areaMetro'] == set([type(1.1), type(None)])
  90.    
  91. if __name__ == "__main__":
  92.     test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement