Advertisement
Guest User

Untitled

a guest
Apr 25th, 2015
252
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.26 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. In this problem set you work with another type of infobox data, audit it, clean it,
  5. come up with a data model, insert it into a MongoDB and then run some queries against your database.
  6. The set contains data about Arachnid class.
  7. Your task in this exercise is to parse the file, process only the fields that are listed in the
  8. FIELDS dictionary as keys, and return a list of dictionaries of cleaned values.
  9.  
  10. The following things should be done:
  11. - keys of the dictionary changed according to the mapping in FIELDS dictionary
  12. - trim out the redundant description in parentheses from the 'rdf-schema#label' field, like "(spider)"
  13. - if 'name' is "NULL" or contains non-alphanumeric characters, set it to the same value as 'label'.
  14. - if a value of a field is "NULL", convert it to None
  15. - if there is a value in 'synonym', it should be converted to an array (list)
  16.  by stripping the "{}" characters and splitting the string on "|". The rest of the cleanup is up to you,
  17.  e.g. removing "*" prefixes. If there is only a single synonym, the value should still be formatted
  18.  in a list.
  19. - strip leading and ending whitespace from all fields, if there is any
  20. - the output structure should be as follows:
  21. { 'label': 'Argiope',
  22.  'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
  23.  'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
  24.  'name': 'Argiope',
  25.  'synonym': ["One", "Two"],
  26.  'classification': {
  27.                    'family': 'Orb-weaver spider',
  28.                    'class': 'Arachnid',
  29.                    'phylum': 'Arthropod',
  30.                    'order': 'Spider',
  31.                    'kingdom': 'Animal',
  32.                    'genus': None
  33.                    }
  34. }
  35.  * Note that the value associated with the classification key is a dictionary with
  36.    taxonomic labels.
  37. """
  38. import codecs
  39. import csv
  40. import json
  41. import pprint
  42. import re
  43.  
  44. DATAFILE = 'arachnid.csv'
  45. FIELDS ={'rdf-schema#label': 'label',
  46.          'URI': 'uri',
  47.          'rdf-schema#comment': 'description',
  48.          'synonym': 'synonym',
  49.          'name': 'name',
  50.          'family_label': 'family',
  51.          'class_label': 'class',
  52.          'phylum_label': 'phylum',
  53.          'order_label': 'order',
  54.          'kingdom_label': 'kingdom',
  55.          'genus_label': 'genus'}
  56.  
  57.  
  58. def process_file(filename, fields):
  59.  
  60.     process_fields = fields.keys()
  61.     data = []
  62.     with open(filename, "r") as f:
  63.         reader = csv.DictReader(f)
  64.         #print process_fields
  65.         for i in range(3):
  66.             l = reader.next()
  67.  
  68.         for line in reader:
  69.             helperdict={}        
  70.             for col in process_fields:
  71.                 if col=="name":
  72.                     if line[col]=="NULL" or line[col].isalnum()==False:
  73.                         helperdict[col]=line['synonym']
  74.                     else:
  75.                         helperdict[col]=line[col].strip()
  76.                 if col=="synonym" and line[col]!="":
  77.                     if line[col].count('|')==1:
  78.                         clean=line[col].replace("{","")
  79.                         clean2=clean.replace("}","")
  80.                         hlist=clean2.split("|")
  81.                         hlist2=[item.strip() for item in hlist]
  82.                         helperdict[col]=hlist2
  83.                     else:
  84.                         helperlist=[]
  85.                         helperlist.append(line[col].strip())
  86.                         helperdict[col]=helperlist
  87.                 else:
  88.                     if line[col]=="NULL":
  89.                          helperdict[col]=None
  90.                     else:
  91.                          helperdict[col]=line[col].strip()
  92.  
  93.                 #print type(line[col])
  94.                 #print helperdict
  95.                 #print line[col]
  96.             data.append(helperdict)
  97.  
  98.            
  99.             pass
  100.     print data
  101.     return data
  102.  
  103.  
  104. def parse_array(v):
  105.     if (v[0] == "{") and (v[-1] == "}"):
  106.         v = v.lstrip("{")
  107.         v = v.rstrip("}")
  108.         v_array = v.split("|")
  109.         v_array = [i.strip() for i in v_array]
  110.         return v_array
  111.     return [v]
  112.  
  113.  
  114. def test():
  115.     data = process_file(DATAFILE, FIELDS)
  116.     print "Your first entry:"
  117.     pprint.pprint(data[0])
  118.     first_entry = {
  119.         "synonym": None,
  120.         "name": "Argiope",
  121.         "classification": {
  122.             "kingdom": "Animal",
  123.             "family": "Orb-weaver spider",
  124.             "order": "Spider",
  125.             "phylum": "Arthropod",
  126.             "genus": None,
  127.             "class": "Arachnid"
  128.         },
  129.         "uri": "http://dbpedia.org/resource/Argiope_(spider)",
  130.         "label": "Argiope",
  131.         "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
  132.     }
  133.  
  134.     assert len(data) == 76
  135.     assert data[0] == first_entry
  136.     assert data[17]["name"] == "Ogdenia"
  137.     assert data[48]["label"] == "Hydrachnidiae"
  138.     assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]
  139.  
  140. if __name__ == "__main__":
  141.     test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement