Guest User

Untitled

a guest
Aug 6th, 2016
219
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.95 KB | None | 0 0
import math
import sys

import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier

  8. # check for valid sys.argv (must be 2 files)
  9. if len(sys.argv) < 3:
  10.     sys.stdout.write("You must set train and test files as arguments!")
  11.     sys.exit(0)
  12.  
  13. # read train data
  14. with open(sys.argv[1], 'r') as train_file:
  15.     train_df = pd.read_csv(train_file, sep='\t')
  16. features_cols = ['x' + str(i).zfill(3) for i in range(30)]
  17. X_train = np.array(train_df[features_cols]) #features
  18. y_train = np.array(train_df['y'])
  19.  
  20. # build a model
  21. n_samples = len(train_df.index)
  22. n_iter = np.ceil((10**6)/n_samples)
  23. clf = SGDClassifier(loss='log', penalty='none', fit_intercept=False, n_iter=n_iter)
  24.  
  25. #clf.fit(X, Y)
  26.  
  27. # TODO:
  28. # Параметры подбирай через GridSearch, scoring function — f1_weigted,
  29. # проверку делай через cross_val_score с cv=10.
  30.  
  31. param_grid = {
  32.     #'alpha': [0.001, 0.0001, 0.00001, 0.000001],
  33. }
  34. grid = GridSearchCV(clf, param_grid, cv=10, scoring='f1_weighted')
  35. print(grid)
  36. grid.fit(X_train, y_train)
  37.  
  38. best_params = grid.best_estimator_.get_params()
  39. print('best params:', best_params)
  40.  
  41. # read test data
  42. with open(sys.argv[2], 'r') as test_file:
  43.     test_df = pd.read_csv(test_file, sep='\t')
  44. X_test = test_df[features_cols]
  45. y_test = np.array(test_df['y'])
  46.  
  47. predict_Y = grid.predict(X_test)
  48. print('test predicted:', predict_Y)
  49. print('test real:', y_test)
  50. error = np.mean(predict_Y != y_test)
  51. print('test error:', error)
  52.  
  53. predict_train_Y = grid.predict(X_train)
  54. print('train predicted:', predict_train_Y)
  55. print('train real:', y_train)
  56. train_error = np.mean(predict_train_Y != y_train)
  57. print('train error:', train_error)
  58.  
  59. clf = SGDClassifier(**best_params)
  60. scores = cross_val_score(clf, X_train, y_train, cv=10)
  61. print('Best %s: %0.3f (+/- %0.2f)' % \
  62.     ('f1_weighted', scores.mean(), scores.std() / 2))
Advertisement
Add Comment
Please, Sign In to add comment