Advertisement
Guest User

Untitled

a guest
Oct 20th, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.98 KB | None | 0 0
  1. #!/usr/bin/python
  2.  
  3. import sys
  4. import pickle
  5. import numpy
  6. import math
  7. import pandas as pd
  8. import matplotlib.pyplot as plt
  9.  
  10. sys.path.append("../tools/")
  11. from feature_format import featureFormat, targetFeatureSplit
  12. from tester import dump_classifier_and_data
  13.  
  14. ### Task 1: Select what features you'll use.
  15. ### features_list is a list of strings, each of which is a feature name.
  16. ### The first feature must be "poi".
  17.  
  18. ### Load the dictionary containing the dataset
  19. with open("final_project_dataset.pkl", "r") as data_file:
  20.     data_dict = pickle.load(data_file)
  21.  
  22.  
  23. ############################ Explore the data ############################
  24.  
  25. # Convert data into a pandas dataframe
  26. data_df = pd.DataFrame.from_dict(data_dict, orient='index')
  27.  
  28. # Drop email column as it is useless for this analysis
  29. data_df = data_df.drop('email_address', axis=1)
  30.  
  31.  
  32.  
  33. print "Total number of data points:", len(data_df)
  34.  
  35. print "Number of POIs in Dataset:", len(data_df[data_df.poi == 1])
  36.  
  37. print "Number of Non POIs in Dataset:", len(data_df[data_df.poi == 0])
  38.  
  39. print "Number of features used:", len(data_df.columns)
  40.  
  41.  
  42. # Detect NaNs in every feature available
  43.  
  44. # Convert columns data type to float to detect NaNs
  45. data_df[['salary','to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
  46. 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi','director_fees',\
  47. 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']] = \
  48. data_df[['salary','to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
  49. 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi','director_fees',\
  50. 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']].astype(float)
  51.  
  52. # print data_df.info()
  53.  
  54. # Getting to better undertsand the dataset
  55. feature_decription = data_df.describe()
  56. # print feature_decription
  57. # feature_decription.to_csv("asd.csv")
  58.  
  59. data_df[['salary','to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
  60. 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi','director_fees',\
  61. 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']] = \
  62. data_df[['salary','to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
  63. 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi','director_fees',\
  64. 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']].fillna(0)
  65.  
  66. # print data_df.info()
  67.  
  68. # print data_df.head()
  69.  
  70. ### Task 2: Remove outliers
  71.  
  72. # Plot the salary and bonus
  73. plt.scatter(data_df.salary, data_df.bonus)
  74. plt.title("Salary vs Bonus Before Outlier")
  75. # plt.show()
  76.  
  77. # Detect the outliers on salary
  78. # print data_df[data_df.salary > 25000000]
  79.  
  80. # Drop the Total row detected above
  81. data_df = data_df.drop('TOTAL')
  82.  
  83. # Dropping the THE TRAVEL AGENCY IN THE PARK
  84. data_df = data_df.drop('THE TRAVEL AGENCY IN THE PARK')
  85.  
  86. # Replot the salary and bonus without Total
  87. plt.scatter(data_df.salary, data_df.bonus)
  88. plt.title("Salary vs Bonus After Outlier")
  89. # plt.show()
  90.  
  91. q1 = data_df.quantile(0.25)
  92. q3 = data_df.quantile(0.75)
  93.  
  94. IQR = q3-q1
  95.  
  96. outliers = data_df[(data_df<(q1 - 1.5*IQR)) | (data_df>(q3 + 1.5*IQR))].count(axis=1)
  97.  
  98. print outliers.sort_values(ascending=False).head(10)
  99.  
  100. # Check list of POIs
  101. # print data_df[data_df.poi == 1].poi
  102.  
  103. # Drop outliers whom are no POIs
  104. # print len(data_df)
  105. data_df = data_df.drop(['FREVERT MARK A', 'LAVORATO JOHN J', 'BAXTER JOHN C'])
  106. # print len(data_df)
  107.  
  108. # Return the pandas dataframe ot dictionary to be capable of running it with the tester
  109. my_dataset = data_df.to_dict(orient='index')
  110.  
  111. # Initial features to be tested
  112. features_list = ['poi','salary','to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
  113. 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi','director_fees',\
  114. 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']
  115.  
  116. ### Extract features and labels from dataset for local testing
  117. data = featureFormat(my_dataset, features_list, sort_keys = True)
  118. labels, features = targetFeatureSplit(data)
  119.  
  120. from sklearn.naive_bayes import GaussianNB
  121. from sklearn.tree import DecisionTreeClassifier
  122. from sklearn.ensemble import RandomForestClassifier
  123. from sklearn.linear_model import LogisticRegression
  124.  
  125.  
  126.  
  127. # Gaussian Naive Bayes Classifier
  128. # clf = GaussianNB()
  129.  
  130. # Decision Tree Classifier
  131. # clf = DecisionTreeClassifier()
  132.  
  133. # Random Forest Classifier
  134. # clf = RandomForestClassifier()
  135.  
  136. # Logistic Regression
  137. # clf = LogisticRegression()
  138.  
  139.  
  140. ### Task 3: Create new feature(s)
  141.  
  142.  
  143. # Feature 1: Feature that gets the ratio of email to POI from the total number of messages
  144. for key, value in my_dataset.items():
  145.     if my_dataset[key]['from_messages'] == 0:
  146.         my_dataset[key]["to_poi_ratio"] = 0
  147.     else:
  148.         my_dataset[key]["to_poi_ratio"] =  float(my_dataset[key]['from_this_person_to_poi'])/float(my_dataset[key]['from_messages'])
  149.  
  150. # Feature 2: Feature that gets the ratio of email from POI from the total number of messages
  151. for key, value in my_dataset.items():
  152.     if my_dataset[key]['from_messages'] == 0:
  153.         my_dataset[key]["from_poi_ratio"] = 0
  154.     else:
  155.         my_dataset[key]["from_poi_ratio"] =  float(my_dataset[key]['from_poi_to_this_person'])/float(my_dataset[key]['to_messages'])
  156.  
  157.  
  158. # Feature 3: Feature that gets the ratio of shared email with a POI
  159. for key, value in my_dataset.items():
  160.     if my_dataset[key]["shared_receipt_with_poi"] == 0 or my_dataset[key]["to_messages"] == 0:
  161.         my_dataset[key]['ratio_cced_poi'] = 0
  162.     else:
  163.         my_dataset[key]['ratio_cced_poi'] = my_dataset[key]['shared_receipt_with_poi']/my_dataset[key]['to_messages']
  164.  
  165.  
  166. # Feature 4: Ratio of bonus from salary
  167. for key, value in my_dataset.items():
  168.     if my_dataset[key]['bonus'] == 0 or my_dataset[key]['salary'] == 0:
  169.         my_dataset[key]["ratio_bonus_salary"] = 0
  170.     else:
  171.         my_dataset[key]["ratio_bonus_salary"] =  float(my_dataset[key]['bonus'])/float(my_dataset[key]['salary'])
  172.  
  173.  
  174. # Feature 6: Ratio of bonus from total payments
  175. for key, value in my_dataset.items():
  176.     if my_dataset[key]['bonus'] == 0 or my_dataset[key]['total_payments'] == 0:
  177.         my_dataset[key]["ratio_bonus_payments"] = 0
  178.     else:
  179.         my_dataset[key]["ratio_bonus_payments"] = float(my_dataset[key]['bonus'])/float(my_dataset[key]['total_payments'])
  180.  
  181.  
  182. # New feature list after adding the 4 new features
  183. features_list = ['poi','salary', 'bonus', 'to_poi_ratio', 'from_poi_ratio', 'ratio_bonus_salary', 'ratio_bonus_payments','to_messages', 'deferral_payments', \
  184. 'total_payments', 'exercised_stock_options', 'restricted_stock', 'shared_receipt_with_poi','restricted_stock_deferred', 'total_stock_value', \
  185. 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi','director_fees',\
  186. 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']
  187.  
  188. ### Extract features and labels from dataset for local testing
  189. data = featureFormat(my_dataset, features_list, sort_keys = True)
  190. labels, features = targetFeatureSplit(data)
  191.  
  192. poi = data[:,0]
  193. salary = data[:,1]
  194. bonus = data[:,2]
  195. to_poi_ratio = data[:,3]
  196. from_poi_ratio = data[:,4]
  197. ratio_bonus_salary = data[:,5]
  198. ratio_bonus_payments = data[:,6]
  199.  
  200.  
  201. # Plotting some new and old features
  202. plt.scatter(salary, bonus, c = poi)
  203. plt.xlabel("Salary")
  204. plt.ylabel("Bonus")
  205. plt.title("Salary vs Bonus Labeled by POI")
  206. # plt.show()
  207.  
  208. plt.scatter(to_poi_ratio, from_poi_ratio, c = poi)
  209. plt.xlabel("Ratio of Email to a POI")
  210. plt.ylabel("Ratio of Email from a POI")
  211. plt.title("Ratio of Email From & To a POI")
  212. # plt.show()
  213.  
  214. plt.scatter(ratio_bonus_salary, ratio_bonus_payments, c = poi)
  215. plt.xlabel("Bonus Ratio from Salary")
  216. plt.ylabel("Bonus Ratio from Total Payments")
  217. plt.title("Bonus Ratio from Salary vs Total Payment")
  218. # plt.show()
  219.  
  220.  
  221. from sklearn.pipeline import Pipeline
  222. from sklearn.decomposition import PCA
  223. from sklearn.feature_selection import SelectKBest, chi2
  224. from sklearn.cross_validation import StratifiedShuffleSplit, train_test_split
  225. from sklearn.model_selection import GridSearchCV
  226.  
  227. ############################################################################
  228. pipe = Pipeline([
  229.     ('reduce_dim', SelectKBest()),
  230.     ('classify', DecisionTreeClassifier())
  231. ])
  232.  
  233.  
  234. N_FEATURES_OPTIONS = numpy.arange(1, len(features_list))
  235. C_OPTIONS = [10, 20]
  236.  
  237. param_grid = [
  238.     {
  239.         'reduce_dim': [SelectKBest(chi2)],
  240.         'reduce_dim__k': N_FEATURES_OPTIONS,
  241.         'classify__C': C_OPTIONS
  242.     },
  243. ]
  244.  
  245. clf = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
  246.  
  247. # estimators = [('feature_selection', SelectKBest(k=10)), ('classifier_type', GaussianNB())]
  248.  
  249. # estimators = [('feature_selection', SelectKBest(k=10)), ('classifier_type', DecisionTreeClassifier())]
  250.  
  251. # estimators = [('feature_selection', SelectKBest(k=10)), ('classifier_type', RandomForestClassifier())]
  252.  
  253. # estimators = [('feature_selection', SelectKBest(k=10)), ('classifier_type', LogisticRegression())]
  254.  
  255. # clf = Pipeline(estimators)
  256.  
  257. # # Run here but to be commented when model is running in tester
  258. # features_train, features_test, labels_train, labels_test = \
  259. #     train_test_split(features, labels, train_size=.45, stratify=labels)
  260.  
  261. # skbest = SelectKBest(k=10)
  262. # sk_transform = skbest.fit_transform(features_train, labels_train)
  263. # mask = skbest.get_support(True)
  264. # print skbest.scores_
  265.  
  266. # feature_list_reduced = ['poi']
  267. # for i in mask:
  268. #     print '%s score: %f' % (features_list[i + 1], skbest.scores_[i])
  269. #     feature_list_reduced.append(features_list[i + 1])
  270.  
  271.  
  272. # Updated list of features that will be used
  273. # features_list = ['poi', 'salary', 'bonus', 'ratio_bonus_salary', 'total_payments', 'exercised_stock_options', 'restricted_stock', \
  274. # 'shared_receipt_with_poi', 'total_stock_value', 'long_term_incentive', 'from_poi_to_this_person']
  275.  
  276. # # Decision Tree
  277. # parameters = {'min_samples_leaf':[2,3,4,5], 'min_samples_split':[2,3,4,5,6,7], 'max_depth':[2,4,5,6,7]}
  278.  
  279. # tree = DecisionTreeClassifier()
  280. # clf = GridSearchCV(tree, parameters)
  281. # clf.fit(features, labels)
  282. # clf = clf.best_estimator_
  283.  
  284. # ### Task 5: Tune your classifier to achieve better than .3 precision and recall
  285.  
  286. ### Task 6: Dump your classifier, dataset, and features_list so anyone can
  287. ### check your results. You do not need to change anything below, but make sure
  288. ### that the version of poi_id.py that you submit can be run on its own and
  289. ### generates the necessary .pkl files for validating your results.
  290.  
  291.  
  292. dump_classifier_and_data(clf, my_dataset, features_list)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement