Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
"""Demo: cross-validated confound regression.

Estimates a linear "confound model" (confound -> features) on each
training fold only, then uses those train-fold weights to residualize
both the train and test folds before fitting/evaluating a linear SVM.
This avoids leaking test-set information into the confound correction.
"""
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# We'll use 10-fold cross-validation
n_folds = 10

# Create some data with 500 samples and 100 features
N = 500
K = 100
X = np.random.normal(0, 1, size=(N, K))

# Create a binary target variable with 500 samples.
# NOTE: repeats must be an integer — N / 2 yields a float (250.0), which
# raises TypeError on modern NumPy; use floor division instead.
y = np.repeat([0, 1], repeats=N // 2)

# And a (continuous) confound, again with 500 samples
c = np.random.normal(0, 1, size=(N, 1))

# Prepend an intercept column so the confound model has a bias term
c = np.c_[np.ones(c.shape), c]

# Let's define our cross-validation scheme
skf = StratifiedKFold(n_splits=n_folds)

# And our model
clf = SVC(kernel='linear')

for train_idx, test_idx in skf.split(X, y):
    X_train = X[train_idx, :]
    X_test = X[test_idx, :]
    y_train = y[train_idx]
    y_test = y[test_idx]
    c_train = c[train_idx, :]
    c_test = c[test_idx, :]
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")

    # Estimate the "confound model" on the train-set only:
    # least-squares weights mapping confound (+ intercept) to each feature.
    c_weights_train = np.linalg.lstsq(c_train, X_train, rcond=None)[0]
    print(f"Shape of weights: {c_weights_train.shape}")

    # Now, we can regress out c_train from X_train
    X_train_corr = X_train - c_train.dot(c_weights_train)
    print(f"Shape of X_train (corrected): {X_train_corr.shape}")

    # Now, let's *cross-validate* our confound regression procedure,
    # i.e., use c_weights_train for correcting X_test!
    X_test_corr = X_test - c_test.dot(c_weights_train)
    print(f"Shape of X_test (corrected): {X_test_corr.shape}")

    # Fit model on corrected data and predict the held-out fold
    clf.fit(X_train_corr, y_train)
    pred = clf.predict(X_test_corr)
Add Comment
Please sign in to add a comment.