# Abhisek92
#
# File Manipulation
#
# Jan 15th, 2018
# 273
# 0
# Never
# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
# Python 8.74 KB | None | 0 0
import copy
import io
import os

  4. os.chdir("D:\\Module_5-Lab\\Python\\KP\\") #Change this according to the location of the File
  5.  
  6. #Properly Store the File in Memory
  7. def import_data(fileURI):
  8.     fileMap=[]
  9.     fileObject=open(fileURI, "r")
  10.     header_list=((fileObject.readline().decode("utf-8-sig").encode("utf-8")).strip()).split(';')
  11.     contents=fileObject.readlines()
  12.     for j in contents:
  13.         j=j.decode("utf-8-sig").encode("utf-8").strip()
  14.     for line in contents:
  15.         element_list=line.strip().split(';')
  16.         if len(element_list)==len(header_list):
  17.             record={}
  18.             for i in range(len(header_list)):
  19.                 record[header_list[i]]=element_list[i].strip()
  20.             fileMap.append(record)
  21.         else:
  22.             print "Data Format Error!"
  23.             return None
  24.             break
  25.     return fileMap
  26.  
  27. #Remove Derived Attributes
  28. def remove_attributes(filemap, attribute_set):
  29.     if isinstance(filemap, list) and isinstance(attribute_set, set):
  30.         for record in filemap:
  31.             if isinstance(record, dict):
  32.                 if attribute_set.issubset(set(record.keys())):
  33.                     for key in attribute_set:
  34.                         del record[key]
  35.                 else:
  36.                     print "Input Error / Data Format Error"
  37.                     return None
  38.             else:
  39.                 print "Data Format Error"
  40.                 return None
  41.     else:
  42.         print "Input Error"
  43.     return filemap
  44.  
  45. #Organize Clubbed Attributes
  46. def rectify(fileMap):
  47.     filemap=copy.deepcopy(fileMap)
  48.     if isinstance(filemap, list):
  49.         for record in filemap:
  50.             if isinstance(record, dict):
  51.                 if 'AUTHORSHIP' in record.keys():
  52.                     authorship=record['AUTHORSHIP']
  53.                     if isinstance(authorship, str):
  54.                         author_list=(authorship.strip("()")).split(',')
  55.                         author_list = [a.strip().upper() for a in author_list]
  56.                         for index in range(len(author_list)):
  57.                             researcher = author_list[index]
  58.                             if r'&' in researcher:
  59.                                 new_authors = researcher.split('&')
  60.                                 new_authors = [auth.strip().upper() for auth in new_authors]
  61.                                 del author_list[index]
  62.                                 author_list[index:index] = new_authors
  63.                         del record['AUTHORSHIP']
  64.                         year=author_list[-1]
  65.                         record['AUTHORS']=author_list[:(len(author_list)-1)]
  66.                         record['YEAR']=year
  67.                     else:
  68.                         print "Rectify: Data Format Error"
  69.                         return None
  70.                 else:
  71.                     print "Rectify: Wrong Data"
  72.             else:
  73.                 "Rectify: Wrong Data"
  74.         return filemap
  75.     else:
  76.         print "Rectify: Input Error"
  77.         return None
  78.  
  79.  
  80. def count_field(filemap, attr):
  81.     field_set=set()
  82.     for i in filemap:
  83.         field_set.add(i[attr])
  84.     return len(field_set)
  85.  
  86.  
  87. def freq_dist(parent, child, filemap):
  88.     list_child = dict() #Family Wise Species List
  89.     parent_child_count=dict() #Frequency of Species per Family
  90.     for i in filemap:
  91.         if i[child] != '' and i[parent] != '':
  92.             if i[parent].upper() in list_child.keys():
  93.                 list_child[i[parent].upper()].add(i[child].upper())
  94.             else:
  95.                 list_child[i[parent].upper()]={i[child].upper()}
  96.     for j in list_child.keys():
  97.         parent_child_count[j]=len(list_child[j])
  98.     return list_child, parent_child_count
  99.  
  100.  
  101. def list_max_freq(parent, child):
  102.     parent_child_count = freq_dist(parent, child, filemap)[1]
  103.     max_freq=max(parent_child_count.values()) #family_species_count.values())
  104.     xparent=set()
  105.     for key in parent_child_count.keys():
  106.         if parent_child_count[key]==max_freq:
  107.             xparent.add(key.upper())
  108.     return (xparent, max_freq)
  109.  
  110. #print freq_dist("FAMILY_NAME", "SPECIES_NAME", filemap)[1]
  111.  
  112. def discovered_by(discovery, author):
  113.     unique_discoveries=set()
  114.     for record in filemap:
  115.         if author.upper() in [element.upper() for element in record['AUTHORS']]:
  116.             if record[discovery] != '':
  117.                 unique_discoveries.add(record[discovery].upper())
  118.     return unique_discoveries
  119.  
  120.  
  121. def get_all_author(filemap):
  122.     scientists = set() #Set of All Authors
  123.     for record in filemap:
  124.         scientists = scientists.union(set(record["AUTHORS"]))
  125.     return scientists
  126.  
  127. def duration(filemap):
  128.     author_active=dict()
  129.     for scientist in get_all_author(filemap):
  130.         unique_years=set()
  131.         for record in filemap:
  132.             if scientist.upper() in (author.upper() for author in record['AUTHORS']):
  133.                 unique_years.add(int(record['YEAR']))
  134.         author_active[scientist.upper()]=(min(unique_years),max(unique_years))
  135.     return author_active
  136.  
  137.  
  138. def active_community(author, filemap):
  139.     author_active = duration(filemap)
  140.     ref_years=author_active[author.upper()]
  141.     active_scientists=set()
  142.     for key in duration(filemap).keys():
  143.         active_years=author_active[key]
  144.         if (active_years[0]>=ref_years[0]) and (active_years[1]<=ref_years[1]):
  145.             active_scientists.add(key)
  146.     active_scientists.remove(author.upper())
  147.     return active_scientists
  148.  
  149. def get_discovery_table(filemap):
  150.     discovery_table=dict() #Scientists with their Corresponding Discoveries
  151.     for scientist in get_all_author(filemap):
  152.         discovered=set()
  153.         for record in filemap:
  154.             if scientist.upper() in map(str.upper, record['AUTHORS']):
  155.                 name=record['SPECIES_NAME'].strip().upper()+" "+record['SUBSPECIES_NAME'].strip().upper()
  156.                 discovered.add(name)
  157.         discovery_table[scientist]=discovered
  158.     return discovery_table
  159.  
  160.  
  161. def most_recent_discovery(filemap):
  162.     years=set()
  163.     for r in filemap:
  164.         if r['YEAR']!='':
  165.             years.add(int(r['YEAR']))
  166.  
  167.     #Year of Most Recent Discovery
  168.     most_recent=max(years)
  169.  
  170.     #Most Recent Discoveries
  171.     recent_discoveries=set()
  172.     for r in filemap:
  173.         if int(r['YEAR'])==most_recent:
  174.             recent_discoveries.add((r["SPECIES_NAME"]).upper())
  175.     return recent_discoveries, most_recent
  176.  
  177.  
  178. def find_co_authors(author, filemap):
  179.     author_set = {author.upper()}
  180.     co_authors = set()
  181.     for record in filemap:
  182.         if (author.upper() in record["AUTHORS"]) and (len(record["AUTHORS"]) > 1):
  183.             co_authors = co_authors.union(set(record["AUTHORS"]))
  184.     return co_authors - author_set
  185.  
  186.  
  187. #Read the File and Organize it
  188. filemap=rectify(remove_attributes(import_data("orthoptera.txt"),{'SPECIES'}))
  189.  
  190. #Count Unique Species
  191. print "No. of Unique Species: ", count_field(filemap, 'SPECIES_NAME')
  192.  
  193. #Families with most Species
  194. print "Families with most Species: ",list(list_max_freq("FAMILY_NAME", "SPECIES_NAME")[0]),".\tCount: ",list_max_freq("FAMILY_NAME", "SPECIES_NAME")[1]
  195.  
  196. #No. of Unique Species discovered by Linnaeus
  197. print "No. of Unique Species discovered by Linnaeus: ", len(discovered_by("SPECIES_NAME", "Linnaeus")), "\t", list(discovered_by("SPECIES_NAME", "Linnaeus"))
  198.  
  199. #No. of Unique Subspecies discovered by Linnaeus
  200. print "No. of Unique Subspecies discovered by Linnaeus: ", list(discovered_by("SUBSPECIES_NAME", "Linnaeus")), "\tCount: ", len(discovered_by("SUBSPECIES_NAME", "Linnaeus"))
  201.  
  202. #Active Years of Linnaeus
  203. print "Active Years of Linnaeus:",duration(filemap)["Linnaeus".upper()][0]," - ",duration(filemap)["Linnaeus".upper()][1],". Duration: ",duration(filemap)["Linnaeus".upper()][1]-duration(filemap)["Linnaeus".upper()][0]," years."
  204.  
  205. #Active Scientists during active years of Linnaeus
  206. if not active_community('Linnaeus', filemap):
  207.     print "Active Scientists during active years of Linnaeus: ", active_community('Linnaeus', filemap)
  208. else:
  209.     print "No Scientists were active during Active Years of Linnaeus"
  210.  
  211. #Co-authors of Linnaeus
  212. if find_co_authors('Linnaeus', filemap):
  213.     print "Co-authors of Linnaeus: ", list(find_co_authors('Linnaeus'.upper(), filemap))
  214. else:
  215.     print "Linnaeus has no Co-author"
  216.  
  217. #Co-authors of Willemse
  218. if find_co_authors('F. Willemse'.upper(), filemap):
  219.     print "Co-authors of F. Willemse: ", list(find_co_authors('F. Willemse'.upper(), filemap))
  220. else:
  221.     print "F. Willemse has no Co-author"
  222.  
  223. #Discoveries made by Willemse
  224. print "Discoveries made by Willemse"," :",list(get_discovery_table(filemap)["Willemse".upper()]),". Count: ",len(get_discovery_table(filemap)["Willemse".upper()])
  225.  
  226. #Most Recent Discovery
  227. print "Most Recent(",most_recent_discovery(filemap)[1],") Discoveries: ",list(most_recent_discovery(filemap)[0])
# Add Comment
# Please, Sign In to add comment