Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import sys
- import pickle
- sys.path.append("../tools/")
- from feature_format import featureFormat, targetFeatureSplit
- from tester import dump_classifier_and_data
- import matplotlib.pyplot as plt
- from outliers import find_max
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
#
# Glossary for some of the financial features:
#   restricted stock  - stock given by the employer to the employee that
#                       cannot be transferred.
#   stock deferrals   - delay delivery of shares to the employee until a
#                       specified date, e.g. retirement.
#   deferral payments - buying something without making payments until a
#                       specified date.
#   deferred income   - money that is received upfront but reported in
#                       installments, e.g. an annual fee of 12,000 that is
#                       reported as 1,000 each month.
#
# The last three entries are the ratio features created under Task 3.
features_list = [
    'poi',
    'salary',
    'from_this_person_to_poi',
    'from_poi_to_this_person',
    'total_stock_value',
    'deferral_payments',
    'restricted_stock_deferred',
    'deferred_income',
    'total_payments',
    'loan_advances',
    'bonus',
    'expenses',
    'exercised_stock_options',
    'other',
    'long_term_incentive',
    'restricted_stock',
    'director_fees',
    'to_messages',
    'from_messages',
    'shared_receipt_with_poi',
    'percent_exercised_stock',
    'percent_to_poi',
    'percent_from_poi',
]
### Load the dictionary containing the dataset
# A pickle is a byte stream, so the file is opened in binary mode ("rb").
# Text mode ("r") applies newline translation on Windows and is rejected by
# pickle.load on Python 3; binary mode behaves correctly everywhere.
# NOTE(review): unpickling can execute arbitrary code -- only load a dataset
# file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
### Task 2: Remove outliers
# Many outliers in this dataset are important and should be kept.
# find_max revealed 'TOTAL' to have the max value for the financial
# features, so the 'TOTAL' key is removed. A default is passed to pop() so
# that a dataset without that key does not raise KeyError.
data_dict.pop('TOTAL', None)

# Find the max/min value and person for a feature (here: 'bonus').
# Written as a call so the line is valid on both Python 2 and Python 3;
# with a single argument the printed output is the same as the print
# statement's.
print(find_max(data_dict, 'bonus'))
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.


def _safe_ratio(numerator, denominator):
    """Return float(numerator) / float(denominator), or NaN if undefined.

    Both operands are converted with float(), so anything float() can parse
    (numbers, numeric strings, 'NaN') is accepted. A NaN operand propagates
    to a NaN result. A zero denominator also yields NaN instead of raising
    ZeroDivisionError, so one bad record cannot abort the whole script.
    """
    num = float(numerator)
    den = float(denominator)
    if den == 0.0:
        return float('nan')
    return num / den


# (new feature, numerator feature, denominator feature)
_RATIO_FEATURES = (
    ('percent_exercised_stock', 'exercised_stock_options', 'total_stock_value'),
    ('percent_to_poi', 'from_this_person_to_poi', 'from_messages'),
    ('percent_from_poi', 'from_poi_to_this_person', 'to_messages'),
)

# Only the per-person records are modified, never data_dict's own key set,
# so iterating over data_dict directly is safe.
# NOTE(review): undefined ratios are stored as float NaN; confirm that
# featureFormat and the classifier treat NaN values as intended.
for person in data_dict:
    record = data_dict[person]
    for new_key, num_key, den_key in _RATIO_FEATURES:
        record[new_key] = _safe_ratio(record[num_key], record[den_key])

my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Visualize data: scatter one feature against the POI label.
# Columns are looked up by name rather than by hard-coded position, so the
# plot keeps working if features_list is edited. This assumes each row of
# `data` follows the order of features_list, as the positional indexing it
# replaces did.
x_feature = 'poi'
y_feature = 'percent_from_poi'
x_col = features_list.index(x_feature)
y_col = features_list.index(y_feature)

for point in data:
    plt.scatter(point[x_col], point[y_col])

plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Starting point supplied with the project: a Gaussian naive Bayes model
# fitted on all extracted features and labels. fit() returns the estimator
# itself, so construction and fitting are chained.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(features, labels)
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Prefer the
# current location and fall back to the old one for older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the samples; the fixed random_state keeps the split
# reproducible between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make
### sure that the version of poi_id.py that you submit can be run on its own
### and generates the necessary .pkl files for validating your results.

# Hand the fitted classifier, the dataset (including the engineered
# features) and the feature names to the exporter from tester.py.
dump_classifier_and_data(
    clf,
    my_dataset,
    features_list
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement