#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from __future__ import division       # Needed to get float results from division; must occur at the beginning of the file
from __future__ import print_function # Lets Python 2.7 use the Python 3 print() syntax and silences Spyder EOF syntax errors
# DAND ML Final Project Aysan - slimmed down to just test tester.py
import sys    # Original
import pickle # Original
sys.path.append("../tools/") # Original
import os # Good practice to always put this at the start of my code
assert os.path.basename(__file__) != '__main__.py' # Guard against the script being saved as __main__.py
from sklearn.naive_bayes import GaussianNB # Original for #4
from sklearn.cross_validation import train_test_split # Original for #5 (sklearn.cross_validation is the pre-0.18 module name)
# My imports - in addition to the division import above
import base64
import copy
import itertools
import json
import numpy as np
import pylab as pl
import random # Needed to create training data
import subprocess
import warnings
warnings.filterwarnings("ignore")
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
from itertools import product
from numpy import array
from numpy.lib.recfunctions import append_fields
from sklearn import cross_validation # Lesson 14 Quiz 3
from sklearn.cross_validation import KFold # Lesson 14 Quiz 8
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import linear_model
from sklearn.linear_model import LinearRegression # So that I don't have to type linear_model.LinearRegression every time
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score # Accuracy: no. of data points labeled correctly divided by all data points
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn import tree
from tester import dump_classifier_and_data # Original
from tester import test_classifier # May not be needed in the final version; if not, delete it in main() as well
from time import time
# A general dictionary for values I create and want to call later, so that I can print it to see what variables I have set
general_dict = {}
# Set up global features for use in functions that will change the features:
arrFeatures = []
arrnew_dataColor = []
data = []
data_dict = {}
dataColor = []
dataFinal = []
dataFinalFloat = []
dataFinalRatioPOI = []
dataRatio = []
dataRatioN = []
feature_testLen = []
features = {} # {} creates a dict, [] creates a list
features_list = []
features_test = []
features_train = []
labels = []
labels_test = []
labels_train = []
labelsFinal = []
my_dataset = {}
numFeatures = {}
varBegin = []
### Load the dictionary containing the dataset
def myDataset():
    global my_dataset
    with open("final_project_dataset.pkl", "rb") as data_file: # Original; "rb" is the safer mode for pickle files
        data_dict = pickle.load(data_file) # Original
    # Delete the "TOTAL" spreadsheet artifact and the outlier key & features
    del data_dict["TOTAL"]
    del data_dict["LAY KENNETH L"]
    # Define my_dataset
    my_dataset = data_dict
    # Remove rows where both "deferral_payments" and "bonus" are "NaN".
    # Iterate over a snapshot of the items so that deleting keys mid-loop is safe.
    #print("\nlen of my_dataset before remove rows: ", len(my_dataset)) # 144
    #print("\nmy_dataset before conditionally removing rows:", my_dataset)
    for key, value in list(my_dataset.items()):
        if isinstance(value, dict):
            if value.get("deferral_payments") == "NaN" and value.get("bonus") == "NaN":
                del my_dataset[key]
    #print("in myDataset - my_dataset after filtering: ", len(my_dataset), my_dataset)
    """
    in myDataset - my_dataset after filtering: 97 {'METTS MARK': {'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN',
    'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702,
    'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740,
    'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address':
    'mark.metts@enron.com', 'from_poi_to_this_person': 38}, 'BAXTER JOHN C': {'salary': 267102, 'to_messages': 'NaN', 'deferral_payments': 1295738,
    'total_payments': 5634343, 'exercised_stock_options': 6680544, 'bonus': 1200000, 'restricted_stock': 3942714, 'shared_receipt_with_poi': 'NaN',
    'restricted_stock_deferred': 'NaN', 'total_stock_value': 10623258, 'expenses': 11200, 'loan_advances': 'NaN', 'from_messages': 'NaN', 'other': 2660303,
    'from_this_person_to_poi': 'NaN', 'poi': False, 'director_fees': 'NaN', 'deferred_income': -1386055, 'long_term_incentive': 1586055,
    'email_address': 'NaN', 'from_poi_to_this_person': 'NaN'}, 'ELLIOTT STEVEN': {'salary': 170941, 'to_messages': 'NaN', 'deferral_payments': 'NaN',
    ...
    """
    general_dict.update({'my_dataset': my_dataset})
    return my_dataset
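
# A minimal alternative sketch (my own, not called anywhere) of the row filter that
# myDataset() applies above, written as a dict comprehension; the helper name
# _filter_nan_rows is hypothetical and not part of the original project code.
def _filter_nan_rows(dataset):
    """Return a copy of dataset without people whose deferral_payments and bonus are both 'NaN'."""
    return {name: row for name, row in dataset.items()
            if not (row.get("deferral_payments") == "NaN" and row.get("bonus") == "NaN")}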
def featureFormat(dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys=False):
    return_list = []
    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on the final project.
    if isinstance(sort_keys, str):
        keys = pickle.load(open(sort_keys, "rb")) # pickle is already imported at the top
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                dictionary[key][feature]
            except KeyError:
                print("error: key ", feature, " not present")
                return
            value = dictionary[key][feature]
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))
        # Logic for deciding whether or not to add the data point.
        append = True
        # Exclude the 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append(np.array(tmp_list))
    return np.array(return_list)
# Superscript #28 in my references document: code from Udacity: targetFeatureSplit()
def targetFeatureSplit(data):
    target = []
    features = []
    for item in data:
        target.append(item[0])
        features.append(item[1:])
    #print("features: ", features) # features: [array([ 2869717., 4175000.]), array([ 178980., 0.]), array([ 1295738., 1200000.]), ...]
    return target, features
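
# A minimal usage sketch showing how featureFormat() and targetFeatureSplit() fit
# together; _demo_feature_split is a hypothetical name of mine and is never called
# by main().
def _demo_feature_split():
    dataset = myDataset()
    # The first feature must be 'poi' so targetFeatureSplit() peels it off as the label.
    demo_features = ['poi', 'deferral_payments', 'bonus']
    demo_data = featureFormat(dataset, demo_features, sort_keys=True)
    demo_labels, demo_points = targetFeatureSplit(demo_data)
    print("rows:", len(demo_points), "POI labels:", int(sum(demo_labels)))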
def defineVariables3():
    # Classifiers to use in a loop when testing various options, rather than repeating code.
    testClassifiers = [GaussianNB(),
                       tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                                   max_features=None, max_leaf_nodes=None,
                                                   min_impurity_split=1e-07, min_samples_leaf=1,
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   presort=False, random_state=42, splitter='best'),
                       SVC(kernel="linear")]
    features_list = ['poi', 'deferral_payments', 'bonus']
    general_dict.update({'testClassifiers': testClassifiers, 'features_list': features_list})
    return general_dict
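
# A minimal sketch of how the testClassifiers list above is meant to be used: loop
# over the classifiers instead of repeating fit/score code. _demo_classifier_loop is
# my own illustrative name and is never called by main(); scoring on the training
# data here only checks that each classifier runs, it is not a real evaluation.
def _demo_classifier_loop():
    defineVariables3()
    dataset = myDataset()
    data = featureFormat(dataset, general_dict['features_list'], sort_keys=True)
    labels, points = targetFeatureSplit(data)
    for clf in general_dict['testClassifiers']:
        clf.fit(points, labels)
        print(clf.__class__.__name__, "training accuracy:", clf.score(points, labels))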
def my_test_classifier(clf, dataset, features_list, folds=1000): # A copy of tester.py's test_classifier, kept for comparison; may not be needed in the final version
    data = featureFormat(dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    #print("\nin my_test_classifier - clf: ", clf) # in test_classifier - clf: GaussianNB(priors=None)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42) # Original
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    PERF_FORMAT_STRING = ("\tAccuracy: {:>0.{display_precision}f}"
                          "\tPrecision: {:>0.{display_precision}f}"
                          "\tRecall: {:>0.{display_precision}f}"
                          "\tF1: {:>0.{display_precision}f}"
                          "\tF2: {:>0.{display_precision}f}")
    RESULTS_FORMAT_STRING = ("\tTotal predictions: {:4d}\tTrue positives: {:4d}"
                             "\tFalse positives: {:4d}\tFalse negatives: {:4d}"
                             "\tTrue negatives: {:4d}")
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])
        ### fit the classifier using the training set, and test on the test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("\nWarning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives + false_positives)
        recall = 1.0*true_positives/(true_positives + false_negatives)
        f1 = 2.0*true_positives/(2*true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0*2.0)*precision*recall/(4*precision + recall) # F-beta score with beta=2, which weights recall more heavily than precision
        print("\n################## My results using my_test_classifier #####################")
        print("\nMy scores must be better than .3 precision and recall")
        print("\nclf: ", clf)
        print("PERF_FORMAT_STRING.format etc...: ", PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
        print("RESULTS_FORMAT_STRING.format...: ", RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
# Choose my classifier
def task6Dump():
    #global dataFinal
    #print("\nin task6Dump - len and DataFinal: ", len(dataFinal), dataFinal)
    # Create my_dataset
    my_dataset = myDataset()
    clf = general_dict['testClassifiers'][1]
    features_list = general_dict['features_list']
    #print("\nin task 6. clf, features list, len my_dataset: ", clf, features_list, len(my_dataset))
    dump_classifier_and_data(clf, my_dataset, features_list) # Original
    ##### I need the above line as part of my final code
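
# For reference, a sketch of what dump_classifier_and_data() conventionally does in
# the Udacity tester.py: pickle the three objects so tester.py can reload them. The
# helper _demo_dump and the exact filenames are my assumptions about that
# convention, not something this script defines or calls.
def _demo_dump(clf, dataset, feature_list):
    for obj, fname in [(clf, "my_classifier.pkl"),
                       (dataset, "my_dataset.pkl"),
                       (feature_list, "my_feature_list.pkl")]:
        with open(fname, "wb") as out_file:
            pickle.dump(obj, out_file)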
def main():
    defineVariables3()
    task6Dump()
    my_test_classifier(general_dict['testClassifiers'][1], general_dict['my_dataset'], general_dict['features_list'], folds=1000) # my code to test tester.py
    print("\n############################ from udacity tester.py #################################\n")
    test_classifier(general_dict['testClassifiers'][1], general_dict['my_dataset'], general_dict['features_list'], folds=1000) # from tester.py

if __name__ == "__main__": # Always put this guard in my programs
    main()