poi_id code
a guest | Nov 20th, 2017 | Python | 6.53 KB

#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)


### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

print "TASK 1: Select Features"
#Number of Employees in the Dataset
print 'Number of people in the Enron dataset: {0}'.format(len(data_dict))


#Number of POIs in the Dataset
pois = [x for x, y in data_dict.items() if y['poi']]
print 'Number of POIs: {0}'.format(len(pois))


##Feature Example for Skilling
#print str(data_dict["SKILLING JEFFREY K"])

##Features in the Enron Dataset
print 'Number of features for each person in the Enron dataset: {0}'.format(len(data_dict.values()[0]))
print '      '


#Features
features_list = ['poi','salary'] # You will need to use more features

email_features = ['to_messages', 'email_address', 'from_poi_to_this_person',
                  'from_messages',
                  'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi']
financial_features = ['salary', 'deferral_payments', 'total_payments',
                      'loan_advances', 'bonus', 'restricted_stock_deferred',
                      'deferred_income', 'total_stock_value', 'expenses',
                      'exercised_stock_options',
                      'other', 'long_term_incentive', 'restricted_stock',
                      'director_fees']
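
# A hedged sketch of a fuller candidate feature list built from the two lists
# above (excluding the non-numeric 'email_address' and the duplicate 'poi').
# 'expanded_features_list' is an illustrative name, not part of the original
# script, and is not wired into the rest of the pipeline.
expanded_features_list = ['poi'] + financial_features + \
    [f for f in email_features if f not in ('poi', 'email_address')]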


###Missing Values in features
print "Missing Values in each Feature"
def nan_values(data_dict):
    counts = dict.fromkeys(data_dict.itervalues().next().keys(), 0)
    for i in data_dict:
        employee = data_dict[i]
        for j in employee:
            if employee[j] == 'NaN':
                counts[j] += 1
    return counts

nan_counts = nan_values(data_dict)
print nan_counts
print '     '


######################### Task 2: Remove outliers##############################
print "TASK 2: Remove Outliers"
#Identifying Outliers

### read in data dictionary, convert to numpy array
print 'TOTAL'
data = featureFormat(data_dict, ['salary', 'bonus'])

for point in data:
    salary = point[0]
    bonus = point[1]
    plt.scatter(salary, bonus)

plt.xlabel("salary")
plt.ylabel("bonus")
plt.show()

#Eugene Lockhart Outlier
print 'EUGENE E LOCKHART'
print str(data_dict["LOCKHART EUGENE E"])
print '     '

#Travel Agency in the Park Outlier
print 'Travel Agency In The Park'
print str(data_dict['THE TRAVEL AGENCY IN THE PARK'])


#removing all 3 outliers
outliers = ['LOCKHART EUGENE E', 'TOTAL', 'THE TRAVEL AGENCY IN THE PARK']
for outlier in outliers:
    data_dict.pop(outlier, 0) #Lesson 8.17
print '  '

#Update of employee count after removal of OUTLIERS
print "Number of Enron employees after removing outliers:", len(data_dict.keys())
print '        '


########### Task 3: Create new feature(s)################
print 'TASK 3: Create new feature(s)'

#Function to compute ratio of two initial features:
def ratio(numerator, denominator):
    if (numerator == 'NaN') or (denominator == 'NaN') or (denominator == 0):
        fraction = 0
    else:
        fraction = float(numerator)/float(denominator)
    return fraction

#Create 3 New Features
def bonus_to_salary_ratio(data):
    for key in data:
        bonus = data[key]['bonus']
        salary = data[key]['salary']
        bonus_to_salary = ratio(bonus, salary)
        data[key]['bonus_to_salary_ratio'] = bonus_to_salary

def from_this_person_from_poi_ratio(data):
    for key in data:
        from_this_person_to_poi = data[key]['from_this_person_to_poi']
        from_messages = data[key]['from_messages']
        fraction_to_poi = ratio(from_this_person_to_poi, from_messages)
        data[key]['fraction_to_poi'] = fraction_to_poi

def from_poi_to_this_person_ratio(data):
    for key in data:
        from_poi_to_this_person = data[key]['from_poi_to_this_person']
        to_messages = data[key]['to_messages']
        fraction_from_poi = ratio(from_poi_to_this_person, to_messages)
        data[key]['fraction_from_poi'] = fraction_from_poi


### Store to my_dataset for easy export below.
my_dataset = data_dict
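
# Apply the three feature functions above to every record (an assumed step --
# the script defines them without calling them anywhere; the new fields are
# only used downstream if their names are also appended to features_list).
bonus_to_salary_ratio(my_dataset)
from_this_person_from_poi_ratio(my_dataset)
from_poi_to_this_person_ratio(my_dataset)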



### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
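
# A minimal, illustrative sketch of scoring candidate features with SelectKBest
# (f_classif is sklearn's default ANOVA scoring function). With the current
# features_list this only scores 'salary', but the same call works once the
# list is expanded; this block is an assumption about the next step, not part
# of the original script.
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k='all')
selector.fit(features, labels)
feature_scores = zip(features_list[1:], selector.scores_)
print sorted(feature_scores, key=lambda pair: pair[1], reverse=True)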


### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
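
# A hedged sketch of the multi-stage Pipeline the comment above refers to,
# wiring together the already-imported MinMaxScaler, SelectKBest and
# DecisionTreeClassifier. The step names and 'pipeline_clf' are illustrative
# assumptions, not from the original; to submit it instead of GaussianNB,
# one would set clf = pipeline_clf.
pipeline_clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('kbest', SelectKBest(k='all')),
    ('tree', DecisionTreeClassifier(random_state=42)),
])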

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
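
# A rough local check of precision and recall on the holdout split. The graded
# numbers come from tester.py's stratified-shuffle-split evaluation, so this is
# only a sanity check, not the project's evaluation method.
from sklearn.metrics import precision_score, recall_score
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print 'precision: {0}'.format(precision_score(labels_test, pred))
print 'recall: {0}'.format(recall_score(labels_test, pred))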

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)