Advertisement
Guest User

Python code

a guest
Nov 20th, 2017
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.68 KB | None | 0 0
  1. #!/usr/bin/python
  2.  
  3. import sys
  4. import pickle
  5. sys.path.append("../tools/")
  6. import pandas as pd
  7. import csv
  8. import matplotlib.pyplot as plt
  9. import numpy as np
  10.  
  11. from feature_format import featureFormat, targetFeatureSplit
  12. from tester import dump_classifier_and_data
  13.  
  ### Load the dictionary containing the dataset
  ### NOTE(review): pickle.load can execute arbitrary code while loading; only
  ### run this against the trusted, project-supplied final_project_dataset.pkl.
  with open("final_project_dataset.pkl", "r") as data_file:
      data_dict = pickle.load(data_file)


  ### Task 1: Select what features you'll use.
  ### features_list is a list of strings, each of which is a feature name.
  ### The first feature must be "poi".

  # Single-argument print(...) produces the same output under Python 2's print
  # statement, so the report text below is unchanged.
  print("TASK 1: Select Featuers")
  # Number of records (people) in the dataset
  print('Number of people in the Enron dataset: {0}'.format(len(data_dict)))


  # Names of the records whose 'poi' flag is truthy
  pois = [name for name, record in data_dict.items() if record['poi']]
  print('Number of POI\'s: {0}'.format(len(pois)))


  ## Feature example for Skilling
  #print str(data_dict["SKILLING JEFFREY K"])

  ## Number of features per person, measured on the first record returned by
  ## the dictionary (no throwaway list is indexed for it).
  first_record = next(iter(data_dict.values()))
  print('Number of features for each person in the Enron dataset: {0}'.format(len(first_record)))
  print('      ')


  # Features handed to featureFormat further down; 'poi' must stay first.
  features_list = ['poi','salary'] # You will need to use more features

  email_features = ['to_messages', 'email_address', 'from_poi_to_this_person',
                    'from_messages',
                    'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi']
  financial_features = ['salary', 'deferral_payments', 'total_payments',
                        'loan_advances', 'bonus', 'restricted_stock_deferred',
                        'deferred_income', 'total_stock_value', 'expenses',
                        'exercised_stock_options',
                        'other', 'long_term_incentive', 'restricted_stock',
                        'director_fees']


  ### Missing values in features
  print("Missing Values in each Feature")
  def nan_values(data_dict):
      """Count, per feature, how many records hold the string 'NaN'.

      Args:
          data_dict: mapping of record name -> dict of feature name -> value.

      Returns:
          A dict mapping every feature name seen in any record to the number
          of records whose value for that feature equals the string 'NaN'.
          Features that are never 'NaN' are reported with a count of 0, and an
          empty data_dict yields an empty dict.
      """
      counts = {}
      for employee in data_dict.values():
          for feature, value in employee.items():
              # Register each feature on first sight so that fully populated
              # features still appear in the result with a count of 0, and so
              # that records with extra features cannot raise KeyError.
              counts.setdefault(feature, 0)
              if value == 'NaN':
                  counts[feature] += 1
      return counts
  67.  
  # Per-feature count of 'NaN' entries (the name is historical: these are the
  # missing-value counts, not the valid ones).
  valid_values = nan_values(data_dict)
  print(valid_values)
  print('     ')

  # data_dict has not been modified since it was loaded at the top of the
  # script, so the pickle file is not read a second time here.


  ######################### Task 2: Remove outliers##############################
  print("TASK 2: Remove Outliers")
  #Identifying Outliers

  ### read salary and bonus from the data dictionary, convert to numpy array
  print('TOTAL')
  # Request exactly the two plotted features so that column 0 is salary and
  # column 1 is bonus; with the full financial feature list column 1 would be
  # 'deferral_payments' and the plot would be mislabelled.
  data = featureFormat(data_dict, ['salary', 'bonus'])

  # pyplot is imported as `plt`; the bare name `matplotlib` is not bound by
  # `import matplotlib.pyplot as plt`, so it must not be used here.
  for point in data:
      salary = point[0]
      bonus = point[1]
      plt.scatter( salary, bonus )

  plt.xlabel("salary")
  plt.ylabel("bonus")
  plt.show()

  #Eugene Lockhart Outlier
  print('EUGENE E LOCKHART')
  print(str(data_dict["LOCKHART EUGENE E"]))
  print('     ')

  #Travel Agency in the Park Outlier
  print('Travel Agency In The Park')
  print(str(data_dict['THE TRAVEL AGENCY IN THE PARK']))


  #removing all 3 outliers
  outliers = ['LOCKHART EUGENE E','TOTAL', 'THE TRAVEL AGENCY IN THE PARK']
  for outlier in outliers:
      # The default value keeps pop() from raising if a key is already absent.
      data_dict.pop(outlier, 0) #Lesson 8.17
  print('  ')

  #Update of employee count after removal of OUTLIERS
  print("Number of Enron employees after removing outliers: {0}".format(len(data_dict)))
  print('        ')



  ########### Task 3: Create new feature(s)################
  print('TASK 3: Create new feature(s)')
  118.  
  119. #Function to compute ratio of two initial features:
  def ratio(numerator, denominator):
      """Return numerator / denominator as a float, or 0.0 when undefined.

      The quotient is treated as undefined when either argument is the string
      'NaN' or when the denominator equals 0. Both paths return a float, so
      callers always receive the same type.
      """
      if numerator == 'NaN' or denominator == 'NaN' or denominator == 0:
          return 0.0
      return float(numerator) / float(denominator)
  126.  
  127. #Create 3 New Features
  def bonus_to_salary_ratio(dict):
      """Add a 'bonus_to_salary_ratio' entry to every record, in place.

      `dict` maps a record name to that record's feature dictionary; each
      record is read for its 'bonus' and 'salary' entries and receives
      ratio(bonus, salary) under the new key. Nothing is returned.
      """
      # The parameter name shadows the built-in `dict`; it is left as is so
      # that existing keyword callers keep working.
      for record in dict.values():
          record['bonus_to_salary_ratio'] = ratio(record['bonus'],
                                                  record['salary'])
  134.  
  def from_this_person_from_poi_ratio(dict):
      """Add a 'fraction_to_poi' entry to every record, in place.

      `dict` maps a record name to that record's feature dictionary. The new
      value is ratio(from_this_person_to_poi, from_messages), read from the
      record's own entries. Nothing is returned.
      """
      for record in dict.values():
          sent_to_poi = record['from_this_person_to_poi']
          sent_total = record['from_messages']
          record['fraction_to_poi'] = ratio(sent_to_poi, sent_total)
  141.  
  def from_poi_to_this_person_ratio(dict):
      """Add a 'fraction_from_poi' entry to every record, in place.

      `dict` maps a record name to that record's feature dictionary. The new
      value is ratio(from_poi_to_this_person, to_messages), read from the
      record's own entries. Nothing is returned.
      """
      for record in dict.values():
          received_from_poi = record['from_poi_to_this_person']
          received_total = record['to_messages']
          record['fraction_from_poi'] = ratio(received_from_poi, received_total)
  148.  
  149.        
  ### Apply the Task 3 feature builders. Each call adds one ratio entry to
  ### every record of data_dict in place; without these calls the functions
  ### are never run and the exported dataset lacks the new features.
  bonus_to_salary_ratio(data_dict)
  from_this_person_from_poi_ratio(data_dict)
  from_poi_to_this_person_ratio(data_dict)

  ### Store to my_dataset for easy export below.
  my_dataset = data_dict



  ### Extract features and labels from dataset for local testing
  data = featureFormat(my_dataset, features_list, sort_keys = True)
  labels, features = targetFeatureSplit(data)


  ### Task 4: Try a varity of classifiers
  ### Please name your classifier clf for easy export below.
  ### Note that if you want to do PCA or other multi-stage operations,
  ### you'll need to use Pipelines. For more info:
  ### http://scikit-learn.org/stable/modules/pipeline.html
  from sklearn.preprocessing import MinMaxScaler
  from sklearn.feature_selection import SelectKBest
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.svm import SVC
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.pipeline import Pipeline

  # Provided to give you a starting point. Try a variety of classifiers.
  from sklearn.naive_bayes import GaussianNB
  clf = GaussianNB()

  ### Task 5: Tune your classifier to achieve better than .3 precision and recall
  ### using our testing script. Check the tester.py script in the final project
  ### folder for details on the evaluation method, especially the test_classifier
  ### function. Because of the small size of the dataset, the script uses
  ### stratified shuffle split cross validation. For more info:
  ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

  # Example starting point. Try investigating other evaluation techniques!
  # NOTE(review): the split below is not used by anything later in this
  # script, and clf is exported without being fitted here.
  from sklearn.cross_validation import train_test_split
  features_train, features_test, labels_train, labels_test = \
      train_test_split(features, labels, test_size=0.3, random_state=42)

  ### Task 6: Dump your classifier, dataset, and features_list so anyone can
  ### check your results. You do not need to change anything below, but make sure
  ### that the version of poi_id.py that you submit can be run on its own and
  ### generates the necessary .pkl files for validating your results.

  dump_classifier_and_data(clf, my_dataset, features_list)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement