Abhisek92

File Manipulation Assignment

Jan 12th, 2018
274
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.11 KB | None | 0 0
  1. import os
  2. import copy
  3.  
  4. os.chdir("D:\\Module_5-Lab\\Python\\KP\\") #Change this according to the location of the File
  5.  
  6. #Properly Store the File in Memory
  7. def import_data(fileURI):
  8.     fileMap=[]
  9.     fileObject=open(fileURI,"r")
  10.     header_list=((fileObject.readline().decode("utf-8-sig").encode("utf-8")).strip()).split(';')
  11.     contents=fileObject.readlines()
  12.     for j in contents:
  13.         j=j.decode("utf-8-sig").encode("utf-8").strip()
  14.     for line in contents:
  15.         element_list=line.strip().split(';')
  16.         if len(element_list)==len(header_list):
  17.             record={}
  18.             for i in range(len(header_list)):
  19.                 record[header_list[i]]=element_list[i].strip()
  20.             fileMap.append(record)
  21.         else:
  22.             print "Data Format Error!"
  23.             return None
  24.             break
  25.     return fileMap
  26.  
  27. #Remove Derived Attributes
  28. def remove_attributes(filemap, attribute_set):
  29.     if isinstance(filemap, list) and isinstance(attribute_set, set):
  30.         for record in filemap:
  31.             if isinstance(record, dict):
  32.                 if attribute_set.issubset(set(record.keys())):
  33.                     for key in attribute_set:
  34.                         del record[key]
  35.                 else:
  36.                     print "Input Error / Data Format Error"
  37.                     return None
  38.             else:
  39.                 print "Data Format Error"
  40.                 return None
  41.     else:
  42.         print "Input Error"
  43.     return filemap
  44.  
  45. #Organize Clubbed Attributes
  46. def rectify(fileMap):
  47.     filemap=copy.deepcopy(fileMap)
  48.     if isinstance(filemap, list):
  49.         for record in filemap:
  50.             if isinstance(record, dict):
  51.                 if 'AUTHORSHIP' in record.keys():
  52.                     authorship=record['AUTHORSHIP']
  53.                     if isinstance(authorship, str):
  54.                         author_list=(authorship.strip("()")).split(',')
  55.                         for element in author_list:
  56.                             element.strip()
  57.                         del record['AUTHORSHIP']
  58.                         year=author_list[-1]
  59.                         record['AUTHORS']=author_list[:(len(author_list)-1)]
  60.                         record['YEAR']=year
  61.                     else:
  62.                         print "Rectify: Data Format Error"
  63.                         return None
  64.                 else:
  65.                     print "Rectify: Wrong Data"
  66.             else:
  67.                 "Rectify: Wrong Data"
  68.         return filemap
  69.     else:
  70.         print "Rectify: Input Error"
  71.         return None
  72.  
  73.  
  74. #Read the File and Organize it
  75. filemap=rectify(remove_attributes(import_data("orthoptera.txt"),{'SPECIES'}))
  76.  
  77. #Count Unique Species
  78. species_set=set()
  79. for i in filemap:
  80.     species_set.add(i['SPECIES_NAME'])
  81. print "Unique Species Count: ",len(species_set)
  82.  
  83. family_species=dict() #Family Wise Species List
  84. family_species_count=dict() #Frequency of Species per Family
  85. for i in filemap:
  86.     if i['FAMILY_NAME'] in family_species.keys():
  87.         family_species[i['FAMILY_NAME']].append(i['SPECIES_NAME'])
  88.     else:
  89.         family_species[i['FAMILY_NAME']]=[i['SPECIES_NAME']]
  90. for j in family_species.keys():
  91.     family_species_count[j]=len(set(family_species[j]))
  92.  
  93. #Family with most Species
  94. max_sfreq=max(family_species_count.values())
  95. xfamily=set()
  96. for key in family_species_count.keys():
  97.     if family_species_count[key]==max_sfreq:
  98.         xfamily.add(key.upper())
  99. print  "Family with most Species: ",list(xfamily),".\tCount: ",max_sfreq
  100.  
  101. genus_subspecies=dict() #Genus Wise Subspecies List
  102. genus_subspecies_count=dict() #Frequency of Subspecies per Genus
  103. for i in filemap:
  104.     if i['GENUS_NAME'].upper() in genus_subspecies.keys():
  105.         genus_subspecies[i['GENUS_NAME'].upper()].add(i['SUBSPECIES_NAME'].upper())
  106.     else:
  107.         subspecies=set()
  108.         subspecies.add(i['SUBSPECIES_NAME'].upper())
  109.         genus_subspecies[i['GENUS_NAME'].upper()]=subspecies
  110. for j in genus_subspecies.keys():
  111.     genus_subspecies_count[j]=len(set(genus_subspecies[j]))
  112.  
  113. #Genus with most Subspecies
  114. max_sgfreq=max(genus_subspecies_count.values())
  115. sgset=set()
  116. for key in genus_subspecies_count.keys():
  117.     if genus_subspecies_count[key]==max_sgfreq:
  118.         sgset.add(key)
  119. print  "Genus with most Subspecies: ",list(sgset),".\tCount: ",max_sgfreq
  120.  
  121. #No. of Unique Species discovered by Linnaeus
  122. unique_species=set()
  123. for record in filemap:
  124.     if 'Linnaeus'.upper() in map(str.upper, record['AUTHORS']):
  125.         unique_species.add(record['SPECIES_NAME'])
  126. print "No. of Unique Species discovered by Linnaeus is: ",len(unique_species)
  127.  
  128. #No. of Unique Subspecies discovered by Linnaeus
  129. unique_subspecies=set()
  130. for record in filemap:
  131.     if 'Linnaeus'.upper() in map(str.upper, record['AUTHORS']):
  132.         if record['SUBSPECIES_NAME']!='':
  133.             unique_subspecies.add(record['SUBSPECIES_NAME'])
  134. print "No. of Unique Subspecies discovered by Linnaeus is: ",len(unique_subspecies)
  135.  
  136.  
  137. scientists=set()#Set of All Authors
  138. for record in filemap:
  139.     scientists=scientists.union(set(record["AUTHORS"]))
  140.  
  141. #Active Years of Linnaeus
  142. author_active=dict()
  143. for scientist in scientists:
  144.     unique_years=set()
  145.     for record in filemap:
  146.         if scientist.upper() in (author.upper() for author in record['AUTHORS']):
  147.             unique_years.add(int(record['YEAR']))
  148.     author_active[scientist.upper()]=(min(unique_years),max(unique_years))
  149. print "Active Years of Linnaeus:",author_active["Linnaeus".upper()][0]," - ",author_active["Linnaeus".upper()][1],". Duration: ",author_active["Linnaeus".upper()][1]-author_active["Linnaeus".upper()][0]," years."
  150.  
  151. #Active Scientists during active years of Linnaeus
  152. author="Linnaeus"
  153. ref_years=author_active[author.upper()]
  154. active_scientists=set()
  155. for key in author_active.keys():
  156.     active_years=author_active[key]
  157.     if (active_years[0]>=ref_years[0]) and (active_years[1]<=ref_years[1]):
  158.         active_scientists.add(key)
  159. active_scientists.remove(author.upper())
  160. if len(active_scientists)!=0:
  161.     print "Active Scientists during active years of Linnaeus: ",list(active_scientists),"\tCount: ",len(active_scientists)
  162. else:
  163.     print "No Scientists active during active years of Linnaeus"
  164.  
  165.  
  166. discovery_table=dict() #Scientists with their Corresponding Discoveries
  167. for scientist in scientists:
  168.     discovered=set()
  169.     for record in filemap:
  170.         if scientist.upper() in map(str.upper, record['AUTHORS']):
  171.             name=record['SPECIES_NAME'].strip().upper()+" "+record['SUBSPECIES_NAME'].strip().upper()
  172.             discovered.add(name)
  173.     discovery_table[scientist]=discovered
  174.  
  175. #Discoveries made by Willemse
  176. authorx="Willemse"
  177. print "Discoveries made by ",authorx," :",list(discovery_table[authorx]),". Count: ",len(discovery_table[authorx])
  178.  
  179. years=set()
  180. for r in filemap:
  181.     if r['YEAR']!='':
  182.         years.add(int(r['YEAR']))
  183.  
  184. #Year of Most Recent Discovery
  185. most_recent=max(years)
  186.  
  187. #Most Recent Discoveries
  188. recent_discoveries=set()
  189. for r in filemap:
  190.     if int(r['YEAR'])==most_recent:
  191.         recent_discoveries.add((r["SPECIES_NAME"]).upper())
  192. print "Most Recent(",most_recent,") Discoveries: ",list(recent_discoveries)
Add Comment
Please, Sign In to add comment