Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import pickle
- sys.path.append("../tools/")
- from feature_format import featureFormat, targetFeatureSplit
- from tester import dump_classifier_and_data
- from sklearn.preprocessing import MinMaxScaler, StandardScaler
- from sklearn.feature_selection import SelectKBest, f_classif
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.naive_bayes import GaussianNB
- from sklearn.pipeline import Pipeline
- from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit
- from sklearn.grid_search import GridSearchCV
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.decomposition import PCA
- from sklearn.svm import SVC
- from sklearn.metrics import accuracy_score
### Load the dictionary containing the Enron dataset (Python 2 pickle,
### hence text-mode "r").
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

# Remove known outliers: the spreadsheet "TOTAL" aggregate row plus two
# records excluded from the analysis. pop(..., 0) keeps this safe even
# if a key is already absent.
for outlier_key in ("TOTAL", "BANNANTINE JAMES M", "GRAY RODNEY"):
    data_dict.pop(outlier_key, 0)
- def test_classifier(clf, dataset, features_list, folds = 1000):
- data = featureFormat(dataset, features_list, sort_keys = True)
- labels, features = targetFeatureSplit(data)
- cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
- true_negatives = 0
- false_negatives = 0
- true_positives = 0
- false_positives = 0
- for train_idx, test_idx in cv:
- features_train = []
- features_test = []
- labels_train = []
- labels_test = []
- for ii in train_idx:
- features_train.append( features[ii] )
- labels_train.append( labels[ii] )
- for jj in test_idx:
- features_test.append( features[jj] )
- labels_test.append( labels[jj] )
- ### fit the classifier using training set, and test on test set
- clf.fit(features_train, labels_train)
- predictions = clf.predict(features_test)
- for prediction, truth in zip(predictions, labels_test):
- if prediction == 0 and truth == 0:
- true_negatives += 1
- elif prediction == 0 and truth == 1:
- false_negatives += 1
- elif prediction == 1 and truth == 0:
- false_positives += 1
- elif prediction == 1 and truth == 1:
- true_positives += 1
- else:
- print "Warning: Found a predicted label not == 0 or 1."
- print "All predictions should take value 0 or 1."
- print "Evaluating performance for processed predictions:"
- break
- try:
- total_predictions = true_negatives + false_negatives + false_positives + true_positives
- accuracy = 1.0*(true_positives + true_negatives)/total_predictions
- precision = 1.0*true_positives/(true_positives+false_positives)
- recall = 1.0*true_positives/(true_positives+false_negatives)
- f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
- f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
- print clf
- print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
- print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
- print ""
- except:
- print "Got a divide by zero when trying out:", clf
- print "Precision or recall may be undefined due to a lack of true positive predicitons."
### Feature selection: 'poi' must come first -- featureFormat treats it
### as the label; the rest are the financial features used by the model.
features_list = ['poi',
                 'salary',
                 'exercised_stock_options',
                 'bonus',
                 'total_stock_value',
                 'deferred_income'
                 ]

my_dataset = data_dict

### Extract features and labels from dataset for local testing.
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Final classifier: Gaussian Naive Bayes.
clf = GaussianNB()
# Bug fix: the original called clf.fit(features_train, labels_train),
# but those names exist only inside test_classifier's local scope, so
# this raised NameError at module level. Fit on the full extracted
# data instead (test_classifier refits clf on each fold regardless).
clf.fit(features, labels)
test_classifier(clf, my_dataset, features_list)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement