#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from __future__ import division       # Needed to get float results from division; must occur at the beginning of the file
from __future__ import print_function # Lets Python 2.7 use the Python 3 print() syntax and silences Spyder EOF syntax errors
# DAND ML Final Project Aysan - slimmed down to just test tester.py
import sys    # Original
import pickle # Original
sys.path.append("../tools/") # Original
import os # Good practice to always put this at the start of my code
assert os.path.basename(__file__) != '__main__.py' # Guard against the script being saved as __main__.py
from sklearn.naive_bayes import GaussianNB # Original for #4
from sklearn.cross_validation import train_test_split # Original for #5 (sklearn.cross_validation is the pre-0.18 module name)
# My imports - in addition to the division import above
import base64
import copy
import itertools
import json
import numpy as np
import pylab as pl
import random # Needed to create training data
import subprocess
import warnings
warnings.filterwarnings("ignore")
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
from itertools import product
from numpy import array
from numpy.lib.recfunctions import append_fields
from sklearn import cross_validation # Lesson 14 Quiz 3
from sklearn.cross_validation import KFold # Lesson 14 Quiz 8
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import linear_model
from sklearn.linear_model import LinearRegression # So that I don't have to type linear_model.LinearRegression every time
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score # Accuracy: no. of data points labeled correctly divided by all data points
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn import tree
from tester import dump_classifier_and_data # Original
from tester import test_classifier # May not be needed in the final version; if not, delete it in main() as well
from time import time
# A general dictionary for values I create and want to call later, so that I can print it to see what variables I have set
general_dict = {}
# Set up global features for use in functions that will change the features:
arrFeatures = []
arrnew_dataColor = []
data = []
data_dict = {}
dataColor = []
dataFinal = []
dataFinalFloat = []
dataFinalRatioPOI = []
dataRatio = []
dataRatioN = []
feature_testLen = []
features = {} # {} creates a dict, [] creates a list
features_list = []
features_test = []
features_train = []
labels = []
labels_test = []
labels_train = []
labelsFinal = []
my_dataset = {}
numFeatures = {}
varBegin = []
### Load the dictionary containing the dataset
def myDataset():
    global my_dataset
    with open("final_project_dataset.pkl", "rb") as data_file: # Original; "rb" is the safer mode for pickle files
        data_dict = pickle.load(data_file) # Original
    # Delete the "TOTAL" spreadsheet artifact and the outlier key & features
    del data_dict["TOTAL"]
    del data_dict["LAY KENNETH L"]
    # Define my_dataset
    my_dataset = data_dict
    # Remove rows where both "deferral_payments" and "bonus" are "NaN".
    # Iterate over a snapshot of the items so that deleting keys mid-loop is safe.
    #print("\nlen of my_dataset before remove rows: ", len(my_dataset)) # 144
    #print("\nmy_dataset before conditionally removing rows:", my_dataset)
    for key, value in list(my_dataset.items()):
        if isinstance(value, dict):
            if value.get("deferral_payments") == "NaN" and value.get("bonus") == "NaN":
                del my_dataset[key]
    #print("in myDataset - my_dataset after filtering: ", len(my_dataset), my_dataset)
    """
    in myDataset - my_dataset after filtering: 97 {'METTS MARK': {'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN',
    'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702,
    'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740,
    'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address':
    'mark.metts@enron.com', 'from_poi_to_this_person': 38}, 'BAXTER JOHN C': {'salary': 267102, 'to_messages': 'NaN', 'deferral_payments': 1295738,
    'total_payments': 5634343, 'exercised_stock_options': 6680544, 'bonus': 1200000, 'restricted_stock': 3942714, 'shared_receipt_with_poi': 'NaN',
    'restricted_stock_deferred': 'NaN', 'total_stock_value': 10623258, 'expenses': 11200, 'loan_advances': 'NaN', 'from_messages': 'NaN', 'other': 2660303,
    'from_this_person_to_poi': 'NaN', 'poi': False, 'director_fees': 'NaN', 'deferred_income': -1386055, 'long_term_incentive': 1586055,
    'email_address': 'NaN', 'from_poi_to_this_person': 'NaN'}, 'ELLIOTT STEVEN': {'salary': 170941, 'to_messages': 'NaN', 'deferral_payments': 'NaN',
    ...
    """
    general_dict.update({'my_dataset': my_dataset})
    return my_dataset
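
# A minimal alternative sketch (my own, not called anywhere) of the row filter that
# myDataset() applies above, written as a dict comprehension; the helper name
# _filter_nan_rows is hypothetical and not part of the original project code.
def _filter_nan_rows(dataset):
    """Return a copy of dataset without people whose deferral_payments and bonus are both 'NaN'."""
    return {name: row for name, row in dataset.items()
            if not (row.get("deferral_payments") == "NaN" and row.get("bonus") == "NaN")}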
def featureFormat(dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys=False):
    return_list = []
    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on the final project.
    if isinstance(sort_keys, str):
        keys = pickle.load(open(sort_keys, "rb")) # pickle is already imported at the top
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                dictionary[key][feature]
            except KeyError:
                print("error: key ", feature, " not present")
                return
            value = dictionary[key][feature]
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))
        # Logic for deciding whether or not to add the data point.
        append = True
        # Exclude the 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append(np.array(tmp_list))
    return np.array(return_list)
# Superscript #28 in my references document: code from Udacity: targetFeatureSplit()
def targetFeatureSplit(data):
    target = []
    features = []
    for item in data:
        target.append(item[0])
        features.append(item[1:])
    #print("features: ", features) # features: [array([ 2869717., 4175000.]), array([ 178980., 0.]), array([ 1295738., 1200000.]), ...]
    return target, features
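
# A minimal usage sketch showing how featureFormat() and targetFeatureSplit() fit
# together; _demo_feature_split is a hypothetical name of mine and is never called
# by main().
def _demo_feature_split():
    dataset = myDataset()
    # The first feature must be 'poi' so targetFeatureSplit() peels it off as the label.
    demo_features = ['poi', 'deferral_payments', 'bonus']
    demo_data = featureFormat(dataset, demo_features, sort_keys=True)
    demo_labels, demo_points = targetFeatureSplit(demo_data)
    print("rows:", len(demo_points), "POI labels:", int(sum(demo_labels)))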
def defineVariables3():
    # Classifiers to use in a loop when testing various options, rather than repeating code.
    testClassifiers = [GaussianNB(),
                       tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                                   max_features=None, max_leaf_nodes=None,
                                                   min_impurity_split=1e-07, min_samples_leaf=1,
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   presort=False, random_state=42, splitter='best'),
                       SVC(kernel="linear")]
    features_list = ['poi', 'deferral_payments', 'bonus']
    general_dict.update({'testClassifiers': testClassifiers, 'features_list': features_list})
    return general_dict
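
# A minimal sketch of how the testClassifiers list above is meant to be used: loop
# over the classifiers instead of repeating fit/score code. _demo_classifier_loop is
# my own illustrative name and is never called by main(); scoring on the training
# data here only checks that each classifier runs, it is not a real evaluation.
def _demo_classifier_loop():
    defineVariables3()
    dataset = myDataset()
    data = featureFormat(dataset, general_dict['features_list'], sort_keys=True)
    labels, points = targetFeatureSplit(data)
    for clf in general_dict['testClassifiers']:
        clf.fit(points, labels)
        print(clf.__class__.__name__, "training accuracy:", clf.score(points, labels))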
def my_test_classifier(clf, dataset, features_list, folds=1000): # A copy of tester.py's test_classifier, kept for comparison; may not be needed in the final version
    data = featureFormat(dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    #print("\nin my_test_classifier - clf: ", clf) # in test_classifier - clf: GaussianNB(priors=None)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42) # Original
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    PERF_FORMAT_STRING = ("\tAccuracy: {:>0.{display_precision}f}"
                          "\tPrecision: {:>0.{display_precision}f}"
                          "\tRecall: {:>0.{display_precision}f}"
                          "\tF1: {:>0.{display_precision}f}"
                          "\tF2: {:>0.{display_precision}f}")
    RESULTS_FORMAT_STRING = ("\tTotal predictions: {:4d}\tTrue positives: {:4d}"
                             "\tFalse positives: {:4d}\tFalse negatives: {:4d}"
                             "\tTrue negatives: {:4d}")
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])
        ### fit the classifier using the training set, and test on the test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("\nWarning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives + false_positives)
        recall = 1.0*true_positives/(true_positives + false_negatives)
        f1 = 2.0*true_positives/(2*true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0*2.0)*precision*recall/(4*precision + recall) # F-beta score with beta=2, which weights recall more heavily than precision
        print("\n################## My results using my_test_classifier #####################")
        print("\nMy scores must be better than .3 precision and recall")
        print("\nclf: ", clf)
        print("PERF_FORMAT_STRING.format etc...: ", PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
        print("RESULTS_FORMAT_STRING.format...: ", RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
# Choose my classifier
def task6Dump():
    #global dataFinal
    #print("\nin task6Dump - len and DataFinal: ", len(dataFinal), dataFinal)
    # Create my_dataset
    my_dataset = myDataset()
    clf = general_dict['testClassifiers'][1]
    features_list = general_dict['features_list']
    #print("\nin task 6. clf, features list, len my_dataset: ", clf, features_list, len(my_dataset))
    dump_classifier_and_data(clf, my_dataset, features_list) # Original
    ##### I need the above line as part of my final code
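
# For reference, a sketch of what dump_classifier_and_data() conventionally does in
# the Udacity tester.py: pickle the three objects so tester.py can reload them. The
# helper _demo_dump and the exact filenames are my assumptions about that
# convention, not something this script defines or calls.
def _demo_dump(clf, dataset, feature_list):
    for obj, fname in [(clf, "my_classifier.pkl"),
                       (dataset, "my_dataset.pkl"),
                       (feature_list, "my_feature_list.pkl")]:
        with open(fname, "wb") as out_file:
            pickle.dump(obj, out_file)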
def main():
    defineVariables3()
    task6Dump()
    my_test_classifier(general_dict['testClassifiers'][1], general_dict['my_dataset'], general_dict['features_list'], folds=1000) # my code to test tester.py
    print("\n############################ from udacity tester.py #################################\n")
    test_classifier(general_dict['testClassifiers'][1], general_dict['my_dataset'], general_dict['features_list'], folds=1000) # from tester.py

if __name__ == "__main__": # Always put this guard in my programs
    main()