Advertisement
Guest User

Untitled

a guest
Nov 21st, 2017
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.04 KB | None | 0 0
  1. #!/usr/bin/python
  2.  
  3. import sys
  4. import pickle
  5. sys.path.append("../tools/")
  6.  
  7. from feature_format import featureFormat, targetFeatureSplit
  8. from tester import dump_classifier_and_data
  9. import matplotlib.pyplot as plt
  10.  
  11. from outliers import find_max
  12.  
  ### Task 1: Select what features you'll use.
  ### features_list is a list of strings, each of which is a feature name.
  ### The first feature must be "poi".
  #
  # Glossary for some of the financial features:
  #   restricted stock  - stock given by the employer to the employee that
  #                       cannot be transferred.
  #   stock deferrals   - delay delivery of shares to the employee until a
  #                       specified date, e.g. retirement.
  #   deferral payments - buying something without making payments until a
  #                       specified date.
  #   deferred income   - money that is received upfront but reported in
  #                       installments, e.g. an annual fee of 12,000 is
  #                       reported as 1,000 each month.
  features_list = [
      # label (must stay first)
      'poi',
      # raw features
      'salary',
      'from_this_person_to_poi',
      'from_poi_to_this_person',
      'total_stock_value',
      'deferral_payments',
      'restricted_stock_deferred',
      'deferred_income',
      'total_payments',
      'loan_advances',
      'bonus',
      'expenses',
      'exercised_stock_options',
      'other',
      'long_term_incentive',
      'restricted_stock',
      'director_fees',
      'to_messages',
      'from_messages',
      'shared_receipt_with_poi',
      # features engineered in Task 3 of this script
      'percent_exercised_stock',
      'percent_to_poi',
      'percent_from_poi',
  ]
  34.  
  ### Load the dictionary containing the dataset
  # Pickle streams are binary data, so the file is opened in "rb" mode: text
  # mode can corrupt the stream on platforms that translate newlines, and it
  # does not work at all on Python 3.
  # NOTE: pickle.load can execute arbitrary code while unpickling -- only
  # point this at a dataset file you trust.
  with open("final_project_dataset.pkl", "rb") as data_file:
      data_dict = pickle.load(data_file)
  38.  
  ### Task 2: Remove outliers

  # Many outliers in this dataset are important and should be kept.
  # find_max revealed that the 'TOTAL' key holds the max value for the
  # financial features, so that one entry is dropped.  A default is passed
  # to pop() so a dataset without the key does not raise KeyError.
  data_dict.pop('TOTAL', None)

  # Print whatever find_max reports for the 'bonus' feature
  # (presumably the extreme value and the person it belongs to -- see
  # outliers.find_max for the exact return shape).
  bonus_extreme = find_max(data_dict, 'bonus')
  print(bonus_extreme)
  48.  
  49.  
  ### Task 3: Create new feature(s)
  ### Store to my_dataset for easy export below.
  import math


  def _safe_ratio(record, numerator_key, denominator_key, default=0.0):
      """Return record[numerator_key] / record[denominator_key] as a float.

      Both values are converted with float().  `default` is returned instead
      of dividing when either value converts to nan (for example the string
      'NaN') or when the denominator is zero.  Dividing anyway would store a
      float nan in the dataset or raise ZeroDivisionError.
      """
      numerator = float(record[numerator_key])
      denominator = float(record[denominator_key])
      if math.isnan(numerator) or math.isnan(denominator) or denominator == 0:
          return default
      return numerator / denominator


  # (new feature, numerator feature, denominator feature)
  # Despite the 'percent_' prefix the stored values are plain ratios; they
  # are not multiplied by 100.
  _RATIO_FEATURES = (
      ('percent_exercised_stock', 'exercised_stock_options', 'total_stock_value'),
      ('percent_to_poi', 'from_this_person_to_poi', 'from_messages'),
      ('percent_from_poi', 'from_poi_to_this_person', 'to_messages'),
  )

  # Each person's record is updated in place with the three ratio features.
  # NOTE(review): undefined ratios are stored as 0.0; confirm that this is
  # the imputation wanted for people with missing stock or e-mail data.
  for record in data_dict.values():
      for new_key, numerator_key, denominator_key in _RATIO_FEATURES:
          record[new_key] = _safe_ratio(record, numerator_key, denominator_key)
  70.  
  71.  
  72.  
  my_dataset = data_dict

  ### Extract features and labels from dataset for local testing
  data = featureFormat(my_dataset, features_list, sort_keys=True)
  labels, features = targetFeatureSplit(data)


  # Visualize data: scatter the POI label against one chosen feature.
  # The column is looked up by name instead of a hard-coded index, so the
  # plot keeps working when features_list is reordered or extended.
  # Assumes featureFormat emits columns in features_list order, as the
  # previous hard-coded indices (0 = poi ... 22 = percent_from_poi) did.
  plot_feature = 'percent_from_poi'
  plot_column = features_list.index(plot_feature)

  # Collect both axes first so the figure is drawn with a single scatter
  # call rather than one call per row.
  poi_values = [point[0] for point in data]
  feature_values = [point[plot_column] for point in data]

  plt.scatter(poi_values, feature_values)
  plt.xlabel('poi')
  plt.ylabel(plot_feature)
  plt.show()
  108.  
  109.  
  ### Task 4: Try a variety of classifiers
  ### Please name your classifier clf for easy export below.
  ### Note that if you want to do PCA or other multi-stage operations,
  ### you'll need to use Pipelines. For more info:
  ### http://scikit-learn.org/stable/modules/pipeline.html

  # Starting point: Gaussian naive Bayes fitted on every extracted sample.
  # fit() returns the estimator itself, so construction and training can be
  # chained while still binding the trained model to `clf`.
  from sklearn.naive_bayes import GaussianNB
  clf = GaussianNB().fit(features, labels)
  120.  
  ### Task 5: Tune your classifier to achieve better than .3 precision and recall
  ### using our testing script. Check the tester.py script in the final project
  ### folder for details on the evaluation method, especially the test_classifier
  ### function. Because of the small size of the dataset, the script uses
  ### stratified shuffle split cross validation. For more info:
  ### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

  # Example starting point. Try investigating other evaluation techniques!
  # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
  # in 0.20; its replacement is sklearn.model_selection.  The fallback keeps
  # the script importable on older installations.
  try:
      from sklearn.model_selection import train_test_split
  except ImportError:
      from sklearn.cross_validation import train_test_split

  # Hold out 30% of the samples; the fixed random_state makes the split
  # reproducible between runs.
  features_train, features_test, labels_train, labels_test = train_test_split(
      features, labels, test_size=0.3, random_state=42)


  ### Task 6: Dump your classifier, dataset, and features_list so anyone can
  ### check your results. You do not need to change anything below, but make sure
  ### that the version of poi_id.py that you submit can be run on its own and
  ### generates the necessary .pkl files for validating your results.

  dump_classifier_and_data(clf, my_dataset, features_list)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement