Advertisement
Guest User

Untitled

a guest
Feb 8th, 2016
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.58 KB | None | 0 0
  1. """ Amazon Access Challenge Starter Code
  2.  
  3. These files provide some starter code using
  4. the scikit-learn library. They give some examples of how
  5. to design a simple algorithm, including pre-processing,
  6. training a logistic regression classifier on the data, and
  7. assessing its performance through cross-validation, plus some
  8. pointers on where to go next.
  9.  
  10. Paul Duan <email@paulduan.com>
  11. """
  12.  
  13. from __future__ import division
  14.  
  15. import numpy as np
  16. from sklearn import (metrics, cross_validation, linear_model, preprocessing)
  17.  
  18. from io_helper import (load_data, save_results)
  19.  
  20. SEED = 42 # always use a seed for randomized procedures
  21.  
  22.  
  23. def load_data(filename, use_labels=True):
  24. """
  25. Load data from CSV files and return them as numpy arrays
  26. The use_labels parameter indicates whether one should
  27. read the first column (containing class labels). If false,
  28. return all 0s.
  29. """
  30.  
  31. # load column 1 to 8 (ignore last one)
  32. data = np.loadtxt(open("data/" + filename), delimiter=',',
  33. usecols=range(1, 9), skiprows=1)
  34. if use_labels:
  35. labels = np.loadtxt(open("data/" + filename), delimiter=',',
  36. usecols=[0], skiprows=1)
  37. else:
  38. labels = np.zeros(data.shape[0])
  39. return labels, data
  40.  
  41.  
  42. def save_results(predictions, filename):
  43. """Given a vector of predictions, save results in CSV format."""
  44. with open(filename, 'w') as f:
  45. f.write("id,ACTION\n")
  46. for i, pred in enumerate(predictions):
  47. f.write("%d,%f\n" % (i + 1, pred))
  48.  
  49.  
  50. def main():
  51. """
  52. Fit models and make predictions.
  53. We'll use one-hot encoding to transform our categorical features
  54. into binary features.
  55. y and X will be numpy array objects.
  56. """
  57. model = linear_model.LogisticRegression(C=3) # the classifier we'll use
  58.  
  59. # === load data in memory === #
  60. print "loading data"
  61. y, X = load_data('train.csv')
  62. y_test, X_test = load_data('test.csv', use_labels=False)
  63.  
  64. # === one-hot encoding === #
  65. # we want to encode the category IDs encountered both in
  66. # the training and the test set, so we fit the encoder on both
  67. encoder = preprocessing.OneHotEncoder()
  68. encoder.fit(np.vstack((X, X_test)))
  69. X = encoder.transform(X) # Returns a sparse matrix (see numpy.sparse)
  70. X_test = encoder.transform(X_test)
  71.  
  72. # if you want to create new features, you'll need to compute them
  73. # before the encoding, and append them to your dataset after
  74.  
  75. # === training & metrics === #
  76. mean_auc = 0.0
  77. n = 10 # repeat the CV procedure 10 times to get more precise results
  78. for i in range(n):
  79. # for each iteration, randomly hold out 20% of the data as CV set
  80. X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
  81. X, y, test_size=.20, random_state=i*SEED)
  82.  
  83. # if you want to perform feature selection / hyperparameter
  84. # optimization, this is where you want to do it
  85.  
  86. # train model and make predictions
  87. model.fit(X_train, y_train)
  88. preds = model.predict_proba(X_cv)[:, 1]
  89.  
  90. # compute AUC metric for this CV fold
  91. fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
  92. roc_auc = metrics.auc(fpr, tpr)
  93. print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
  94. mean_auc += roc_auc
  95.  
  96. print "Mean AUC: %f" % (mean_auc/n)
  97.  
  98. # === Predictions === #
  99. # When making predictions, retrain the model on the whole training set
  100. model.fit(X, y)
  101. preds = model.predict_proba(X_test)[:, 1]
  102. filename = raw_input("Enter name for submission file: ")
  103. save_results(preds, filename + ".csv")
  104.  
  105. if __name__ == '__main__':
  106. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement