poi_id code
a guest | Nov 20th, 2017 | Python | 6.53 KB

#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)


### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

print "TASK 1: Select Features"
#Number of Employees in the Dataset
print 'Number of people in the Enron dataset: {0}'.format(len(data_dict))


#Number of POIs in the Dataset
pois = [x for x, y in data_dict.items() if y['poi']]
print 'Number of POIs: {0}'.format(len(pois))


##Feature Example for Skilling
#print str(data_dict["SKILLING JEFFREY K"])

##Features in the Enron Dataset
print 'Number of features for each person in the Enron dataset: {0}'.format(len(data_dict.values()[0]))
print '      '


#Features
features_list = ['poi','salary'] # You will need to use more features

email_features = ['to_messages', 'email_address', 'from_poi_to_this_person',
                  'from_messages',
                  'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi']
financial_features = ['salary', 'deferral_payments', 'total_payments',
                      'loan_advances', 'bonus', 'restricted_stock_deferred',
                      'deferred_income', 'total_stock_value', 'expenses',
                      'exercised_stock_options',
                      'other', 'long_term_incentive', 'restricted_stock',
                      'director_fees']
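
# A hedged sketch of a fuller candidate feature list built from the two lists
# above (excluding the non-numeric 'email_address' and the duplicate 'poi').
# 'expanded_features_list' is an illustrative name, not part of the original
# script, and is not wired into the rest of the pipeline.
expanded_features_list = ['poi'] + financial_features + \
    [f for f in email_features if f not in ('poi', 'email_address')]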


###Missing Values in features
print "Missing Values in each Feature"
def nan_values(data_dict):
    counts = dict.fromkeys(data_dict.itervalues().next().keys(), 0)
    for i in data_dict:
        employee = data_dict[i]
        for j in employee:
            if employee[j] == 'NaN':
                counts[j] += 1
    return counts

nan_counts = nan_values(data_dict)
print nan_counts
print '     '


######################### Task 2: Remove outliers##############################
print "TASK 2: Remove Outliers"
#Identifying Outliers

### read in data dictionary, convert to numpy array
print 'TOTAL'
data = featureFormat(data_dict, ['salary', 'bonus'])

for point in data:
    salary = point[0]
    bonus = point[1]
    plt.scatter(salary, bonus)

plt.xlabel("salary")
plt.ylabel("bonus")
plt.show()

#Eugene Lockhart Outlier
print 'EUGENE E LOCKHART'
print str(data_dict["LOCKHART EUGENE E"])
print '     '

#Travel Agency in the Park Outlier
print 'Travel Agency In The Park'
print str(data_dict['THE TRAVEL AGENCY IN THE PARK'])


#removing all 3 outliers
outliers = ['LOCKHART EUGENE E', 'TOTAL', 'THE TRAVEL AGENCY IN THE PARK']
for outlier in outliers:
    data_dict.pop(outlier, 0) #Lesson 8.17
print '  '

#Update of employee count after removal of OUTLIERS
print "Number of Enron employees after removing outliers:", len(data_dict.keys())
print '        '


########### Task 3: Create new feature(s)################
print 'TASK 3: Create new feature(s)'

#Function to compute ratio of two initial features:
def ratio(numerator, denominator):
    if (numerator == 'NaN') or (denominator == 'NaN') or (denominator == 0):
        fraction = 0
    else:
        fraction = float(numerator)/float(denominator)
    return fraction

#Create 3 New Features
def bonus_to_salary_ratio(data):
    for key in data:
        bonus = data[key]['bonus']
        salary = data[key]['salary']
        bonus_to_salary = ratio(bonus, salary)
        data[key]['bonus_to_salary_ratio'] = bonus_to_salary

def from_this_person_from_poi_ratio(data):
    for key in data:
        from_this_person_to_poi = data[key]['from_this_person_to_poi']
        from_messages = data[key]['from_messages']
        fraction_to_poi = ratio(from_this_person_to_poi, from_messages)
        data[key]['fraction_to_poi'] = fraction_to_poi

def from_poi_to_this_person_ratio(data):
    for key in data:
        from_poi_to_this_person = data[key]['from_poi_to_this_person']
        to_messages = data[key]['to_messages']
        fraction_from_poi = ratio(from_poi_to_this_person, to_messages)
        data[key]['fraction_from_poi'] = fraction_from_poi


### Store to my_dataset for easy export below.
my_dataset = data_dict
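
# Apply the three feature functions above to every record (an assumed step --
# the script defines them without calling them anywhere; the new fields are
# only used downstream if their names are also appended to features_list).
bonus_to_salary_ratio(my_dataset)
from_this_person_from_poi_ratio(my_dataset)
from_poi_to_this_person_ratio(my_dataset)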



### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
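
# A minimal, illustrative sketch of scoring candidate features with SelectKBest
# (f_classif is sklearn's default ANOVA scoring function). With the current
# features_list this only scores 'salary', but the same call works once the
# list is expanded; this block is an assumption about the next step, not part
# of the original script.
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k='all')
selector.fit(features, labels)
feature_scores = zip(features_list[1:], selector.scores_)
print sorted(feature_scores, key=lambda pair: pair[1], reverse=True)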


### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
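
# A hedged sketch of the multi-stage Pipeline the comment above refers to,
# wiring together the already-imported MinMaxScaler, SelectKBest and
# DecisionTreeClassifier. The step names and 'pipeline_clf' are illustrative
# assumptions, not from the original; to submit it instead of GaussianNB,
# one would set clf = pipeline_clf.
pipeline_clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('kbest', SelectKBest(k='all')),
    ('tree', DecisionTreeClassifier(random_state=42)),
])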

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
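
# A rough local check of precision and recall on the holdout split. The graded
# numbers come from tester.py's stratified-shuffle-split evaluation, so this is
# only a sanity check, not the project's evaluation method.
from sklearn.metrics import precision_score, recall_score
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print 'precision: {0}'.format(precision_score(labels_test, pred))
print 'recall: {0}'.format(recall_score(labels_test, pred))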

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)