TeresaAysan
DAND ML Final Project Tester Predictions 10000
Jan 21st, 2018
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from __future__ import division # Get float results from division; __future__ imports must come at the top of the file, before any other imports
from __future__ import print_function # Required to get rid of Spyder EOF syntax errors; must use the Python 3 print() form
# DAND ML Final Project Aysan - trimmed down to just test tester.py

import sys # Original

import pickle # Original
sys.path.append("../tools/") # Original

import os # Good practice to always put this at the start of my code
assert os.path.basename(__file__) != '__main__.py' # Good practice to always put this at the start of my code

from sklearn.naive_bayes import GaussianNB # Original for #4
from sklearn.cross_validation import train_test_split # Original for #5

# My imports - in addition to the division import above
import base64
import copy
import itertools
import json
import numpy as np
import pylab as pl
import random # Needed to create training data
import subprocess
import warnings
warnings.filterwarnings("ignore")

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker

from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D

from itertools import product
from numpy import array
from numpy.lib.recfunctions import append_fields

from sklearn import cross_validation # Lesson 14 Quiz 3
from sklearn.cross_validation import KFold # Lesson 14 Quiz 8
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import linear_model
from sklearn.linear_model import LinearRegression # So that I don't have to type linear_model.LinearRegression each time
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score # Accuracy: no. of data points labeled correctly divided by all data points
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn import tree

from tester import dump_classifier_and_data # Original
from tester import test_classifier # ????? may not need this in the final version... if not, delete it in main as well

from time import time
# A general dictionary of values I create and want to call later, so that I can print it to see what each variable holds
general_dict = {}

# Set up global features for use in functions that will change the features:
arrFeatures = []
arrnew_dataColor = []
data = []
data_dict = {}
dataColor = []
dataFinal = []
dataFinalFloat = []
dataFinalRatioPOI = []
dataRatio = []
dataRatioN = []
feature_testLen = []
features = {} # {} for a dict, [] for a list
features_list = []
features_test = []
features_train = []
labels = []
labels_test = []
labels_train = []
labelsFinal = []
my_dataset = {}
numFeatures = {}
varBegin = []


### Load the dictionary containing the dataset

def myDataset():
    global my_dataset
    with open("final_project_dataset.pkl", "r") as data_file: # Original
        data_dict = pickle.load(data_file) # Original

    # Delete the "TOTAL" spreadsheet artifact and the Kenneth Lay outlier
    del data_dict["TOTAL"]
    del data_dict["LAY KENNETH L"]

    # Define my_dataset
    my_dataset = data_dict

    # Remove rows if both "deferral_payments" and "bonus" are "NaN"
    #print("\nlen of my_dataset before remove rows: ", len(my_dataset)) # 144 - need to remove the rows where deferral_payments and bonus are both NaN
    #print("\nmy_dataset before conditionally removing rows:", my_dataset)

    # Note: under Python 2, .items() returns a list copy, so deleting from
    # my_dataset while looping over it here is safe.
    if isinstance(my_dataset, dict):
        for key, value in my_dataset.items():
            if isinstance(value, dict): # each person's record is a dict of features
                if value.get("deferral_payments") == "NaN" and value.get("bonus") == "NaN":
                    del my_dataset[key]
    #print("in myDataset - my_dataset after filtering: ", len(my_dataset), my_dataset)
    """
    in myDataset - my_dataset after filtering: 97 {'METTS MARK': {'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN',
    'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702,
    'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740,
    'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address':
    'mark.metts@enron.com', 'from_poi_to_this_person': 38}, 'BAXTER JOHN C': {'salary': 267102, 'to_messages': 'NaN', 'deferral_payments': 1295738,
    'total_payments': 5634343, 'exercised_stock_options': 6680544, 'bonus': 1200000, 'restricted_stock': 3942714, 'shared_receipt_with_poi': 'NaN',
    'restricted_stock_deferred': 'NaN', 'total_stock_value': 10623258, 'expenses': 11200, 'loan_advances': 'NaN', 'from_messages': 'NaN', 'other': 2660303,
    'from_this_person_to_poi': 'NaN', 'poi': False, 'director_fees': 'NaN', 'deferred_income': -1386055, 'long_term_incentive': 1586055,
    'email_address': 'NaN', 'from_poi_to_this_person': 'NaN'}, 'ELLIOTT STEVEN': {'salary': 170941, 'to_messages': 'NaN', 'deferral_payments': 'NaN',
    ...
    """

    general_dict.update({'my_dataset': my_dataset})

    return my_dataset

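# Quick sanity check for the double-"NaN" filter above (a sketch, not part of the
# graded code; assumes myDataset() has already been called):
#   leftover = sum(1 for v in my_dataset.values()
#                  if v.get("deferral_payments") == "NaN" and v.get("bonus") == "NaN")
#   print("rows with both fields NaN:", leftover) # expect 0
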
def featureFormat(dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys=False):

    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        keys = pickle.load(open(sort_keys, "rb"))
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                dictionary[key][feature]
            except KeyError:
                print("error: key ", feature, " not present")
                return
            value = dictionary[key][feature]
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        # Logic for deciding whether or not to add the data point.
        append = True
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)

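# Illustrative example of what featureFormat() returns (a sketch with made-up
# numbers, not real dataset values): for features ['poi', 'deferral_payments', 'bonus'],
# each person becomes one row of floats, with "NaN" mapped to 0 when remove_NaN=True:
#   featureFormat({'PERSON A': {'poi': False, 'deferral_payments': 'NaN', 'bonus': 600000}},
#                 ['poi', 'deferral_payments', 'bonus'])
#   -> array([[ 0., 0., 600000.]])
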
# Superscript #28 in my references document: code from Udacity: targetFeatureSplit()
def targetFeatureSplit(data):

    target = []
    features = []
    for item in data:
        target.append(item[0])
        features.append(item[1:])
    #print("features: ", features) # features: [array([ 2869717., 4175000.]), array([ 178980., 0.]), array([ 1295738., 1200000.]), array([ 260455., 400000.]),...
    return target, features


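# The two helpers above are meant to be used together, as my_test_classifier()
# does below. A minimal sketch (assuming my_dataset and features_list are defined):
#   data = featureFormat(my_dataset, features_list, sort_keys=True)
#   labels, features = targetFeatureSplit(data)
# labels is then the list of poi flags (0.0/1.0) and features the remaining columns.
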
def defineVariables3():
    # Classifiers to try in a loop when testing, rather than repeating code.
    testClassifiers = [GaussianNB(),
                       tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                                   max_features=None, max_leaf_nodes=None,
                                                   min_impurity_split=1e-07, min_samples_leaf=1,
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   presort=False, random_state=42, splitter='best'),
                       SVC(kernel="linear")]

    features_list = ['poi', 'deferral_payments', 'bonus']

    general_dict.update({'testClassifiers': testClassifiers, 'features_list': features_list})

    return general_dict


def my_test_classifier(clf, dataset, features_list, folds=1000): # ????? I may not need this. It is a copy of tester.py test_classifier
    data = featureFormat(dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    #print("\nin my_test_classifier - clf: ", clf) # in test_classifier - clf: GaussianNB(priors=None)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42) # Original

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
    RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("\nWarning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives + false_positives)
        recall = 1.0*true_positives/(true_positives + false_negatives)
        f1 = 2.0*true_positives/(2*true_positives + false_positives + false_negatives)
        # F2 is the F-beta score with beta=2, weighting recall more heavily than precision:
        # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
        f2 = (1 + 2.0*2.0)*precision*recall/(4*precision + recall)

        print("\n################## My results using my_test_classifier #####################")
        print("\nMy scores must be better than .3 precision and recall")
        print("\nclf: ", clf)
        print("PERF_FORMAT_STRING.format etc...: ", PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
        print("RESULTS_FORMAT_STRING.format...: ", RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")


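# Worked example of the metrics above (hypothetical counts, just to illustrate):
# with TP=600, FP=400, FN=400, TN=8600 over 10000 total predictions:
#   accuracy  = (600+8600)/10000        = 0.92
#   precision = 600/(600+400)           = 0.60
#   recall    = 600/(600+400)           = 0.60
#   f1        = 2*600/(2*600+400+400)   = 0.60
#   f2        = 5*0.6*0.6/(4*0.6+0.6)   = 0.60
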
# Choose my classifier
def task6Dump():
    #global dataFinal
    #print("\nin task6Dump - len and DataFinal: ", len(dataFinal), dataFinal)

    # Create my_dataset
    my_dataset = myDataset()

    clf = general_dict['testClassifiers'][1]

    features_list = general_dict['features_list']

    #print("\nin task 6. clf, features list, len my_dataset: ", clf, features_list, len(my_dataset))

    dump_classifier_and_data(clf, my_dataset, features_list) # Original
    ##### I need the above line as part of my final code


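# For reference (per the standard Udacity tester.py; verify against your copy):
# dump_classifier_and_data() pickles the classifier, dataset, and feature list to
# my_classifier.pkl, my_dataset.pkl, and my_feature_list.pkl so the grader's
# tester.py can reload and re-score them.
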
def main():
    defineVariables3()
    task6Dump()
    my_test_classifier(general_dict['testClassifiers'][1], general_dict['my_dataset'], general_dict['features_list'], folds=1000) # my code to test tester.py
    print("\n############################ from udacity tester.py #################################\n")
    test_classifier(general_dict['testClassifiers'][1], general_dict['my_dataset'], general_dict['features_list'], folds=1000) # from tester.py

if __name__ == "__main__": # Always put this in my programs
    main()