#!/usr/bin/python

import sys
import numpy as np
import pandas as pd
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary', 'total_payments', 'loan_advances', 'bonus',
                 'total_stock_value', 'expenses',
                 'from_poi_to_this_person', 'from_this_person_to_poi',
                 ]

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
data_dict.pop('TOTAL', 0)  # spreadsheet aggregate row, not a person
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)  # not an Enron employee
data_dict.pop('LOCKHART EUGENE E', 0)  # empty record, every field is NaN

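# A quick programmatic check (a minimal sketch, not part of the original
# workflow): list any remaining records whose every non-boolean field is
# 'NaN', which is how an empty entry like LOCKHART EUGENE E can be spotted.
for name, record in data_dict.items():
    if all(v == 'NaN' for v in record.values() if not isinstance(v, bool)):
        print("empty record: %s" % name)
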
### Task 3: Create new feature(s)
df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')
df.fillna(0, inplace=True)
# total volume of email exchanged with POIs, in both directions
df['total_poi_email'] = df['from_poi_to_this_person'] + df['from_this_person_to_poi']
df['total_poi_email_percent'] = df['total_poi_email'] / (df['to_messages'] + df['from_messages'])
# the raw email counts are now folded into the new features, so drop them
df = df.drop(['from_poi_to_this_person', 'from_this_person_to_poi',
              'to_messages', 'from_messages'], axis=1)
# the ratio is NaN for people with no email data (0/0), so fill again
df.fillna(0, inplace=True)
new_features_list = list(df.columns.values)

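# Optional sanity check (a sketch, not in the original script): eyeball the
# distribution of the engineered features before modeling.
print(df[['total_poi_email', 'total_poi_email_percent']].describe())
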
# Ensure POI is the 1st column
new_features_list.remove('poi')
new_features_list.insert(0, 'poi')

# Remove unnecessary features
old_features = ['deferral_payments', 'deferred_income', 'director_fees', 'email_address',
                'exercised_stock_options', 'expenses', 'loan_advances', 'long_term_incentive',
                'other', 'restricted_stock', 'restricted_stock_deferred',
                'shared_receipt_with_poi']
for feat in old_features:
    new_features_list.remove(feat)

# create a dictionary from the dataframe
df_dict = df.to_dict('index')

### Store to my_dataset for easy export below.
my_dataset = df_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, new_features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# feature scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# keep the transformed matrix (fit_transform does not modify in place)
features = scaler.fit_transform(features)

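# Quick verification (a sketch, not in the original script): after MinMax
# scaling, every feature column should lie within [0, 1].
print("column minima: %s" % np.min(features, axis=0))
print("column maxima: %s" % np.max(features, axis=0))
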
# make train/test sets
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

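# Note: only ~18 of the ~143 remaining records are POIs, so the classes are
# heavily imbalanced. If your scikit-learn version supports it (0.17+), a
# stratified split keeps the POI ratio similar in both sets, e.g.:
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42,
#                      stratify=labels)
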
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

### Pipeline to improve workflow:
### SelectKBest, then a classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

clf = DecisionTreeClassifier(min_samples_split=10)
select = SelectKBest(k=2)

steps = [('feature_selection', select),
         ('decision_tree', clf)]

pipeline = Pipeline(steps)
# fit on the training split only, so the held-out rows stay unseen
pipeline.fit(features_train, labels_train)
prediction = pipeline.predict(features_test)
# classification_report expects (y_true, y_pred)
report = classification_report(labels_test, prediction)
print(report)

parameters = dict(feature_selection__k=[2, 3, 4, 5, 'all'],
                  decision_tree__min_samples_split=[2, 3, 4, 5, 10])

# Validation method - StratifiedShuffleSplit
from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(
    labels,
    n_iter=100,
    test_size=0.3,
    random_state=0
)

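# On scikit-learn >= 0.18 this class lives in sklearn.model_selection instead;
# it takes n_splits rather than n_iter, and the labels go to split() rather
# than the constructor (shown here only as a pointer, untested):
# from sklearn.model_selection import StratifiedShuffleSplit
# sss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=0)
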
cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1', cv=sss)

# the grid search is fit on the full set because sss was built from the full
# labels; the unbiased evaluation happens in test_classifier below
cv.fit(features, labels)
prediction = cv.predict(features_test)
report = classification_report(labels_test, prediction)
print(report)

print("Best score: %0.3f" % cv.best_score_)
print("Best parameters set:")
best_parameters = cv.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

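# A minimal sketch (assuming the step name 'feature_selection' used above):
# inspect which features the tuned SelectKBest actually kept.
kbest = cv.best_estimator_.named_steps['feature_selection']
selected = [f for f, keep in zip(new_features_list[1:], kbest.get_support()) if keep]
print("features kept by SelectKBest: %s" % selected)
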
from tester import test_classifier
test_classifier(pipeline, my_dataset, new_features_list)

print("\nLet's try other classifiers")

### Voting Classifier to combine multiple classifiers
from sklearn.ensemble import VotingClassifier

clf1 = DecisionTreeClassifier(min_samples_split=2)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('dt', clf1), ('rf', clf2), ('gnb', clf3)],
                        voting='soft')

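# voting='soft' averages the classifiers' predict_proba outputs, so every
# estimator must implement predict_proba (all three here do). Unequal weights
# are also possible, e.g. (an illustrative sketch, not tuned):
# eclf = VotingClassifier(estimators=[('dt', clf1), ('rf', clf2), ('gnb', clf3)],
#                         voting='soft', weights=[2, 1, 1])
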
params = {'dt__min_samples_split': [2, 3, 4, 5],
          'rf__n_estimators': [20, 30, 50],
          }

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=sss, scoring='f1')
grid = grid.fit(features, labels)

grid_prediction = grid.predict(features_test)
# classification_report expects (y_true, y_pred)
grid_report = classification_report(labels_test, grid_prediction)
print(grid_report)

print("Best score: %0.3f" % grid.best_score_)
print("Best parameters set:")
grid_best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t%s: %r" % (param_name, grid_best_parameters[param_name]))

# Classifiers tester
#test_classifier(eclf, my_dataset, new_features_list)

# export the SelectKBest + decision tree pipeline (the tuned version is
# available as cv.best_estimator_ if you prefer the grid-searched parameters)
clf = pipeline

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, new_features_list)
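
# Note: the course's tester.py typically writes my_classifier.pkl,
# my_dataset.pkl and my_feature_list.pkl next to the script (an assumption --
# check your local tester.py for the exact filenames).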