Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
"""Demo: cross-validated confound regression.

Estimates a linear "confound model" (confound -> features) on each
training fold only, then uses those train-fold weights to residualize
both the train and test folds before fitting/evaluating a linear SVM.
This avoids leaking test-set information into the confound correction.
"""
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# We'll use 10-fold cross-validation
n_folds = 10

# Create some data with 500 samples and 100 features
N = 500
K = 100
X = np.random.normal(0, 1, size=(N, K))

# Create a binary target variable with 500 samples.
# NOTE: repeats must be an integer — N / 2 yields a float (250.0), which
# raises TypeError on modern NumPy; use floor division instead.
y = np.repeat([0, 1], repeats=N // 2)

# And a (continuous) confound, again with 500 samples
c = np.random.normal(0, 1, size=(N, 1))

# Prepend an intercept column so the confound model has a bias term
c = np.c_[np.ones(c.shape), c]

# Let's define our cross-validation scheme
skf = StratifiedKFold(n_splits=n_folds)

# And our model
clf = SVC(kernel='linear')

for train_idx, test_idx in skf.split(X, y):
    X_train = X[train_idx, :]
    X_test = X[test_idx, :]
    y_train = y[train_idx]
    y_test = y[test_idx]
    c_train = c[train_idx, :]
    c_test = c[test_idx, :]
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")

    # Estimate the "confound model" on the train-set only:
    # least-squares weights mapping confound (+ intercept) to each feature.
    c_weights_train = np.linalg.lstsq(c_train, X_train, rcond=None)[0]
    print(f"Shape of weights: {c_weights_train.shape}")

    # Now, we can regress out c_train from X_train
    X_train_corr = X_train - c_train.dot(c_weights_train)
    print(f"Shape of X_train (corrected): {X_train_corr.shape}")

    # Now, let's *cross-validate* our confound regression procedure,
    # i.e., use c_weights_train for correcting X_test!
    X_test_corr = X_test - c_test.dot(c_weights_train)
    print(f"Shape of X_test (corrected): {X_test_corr.shape}")

    # Fit model on corrected data and predict the held-out fold
    clf.fit(X_train_corr, y_train)
    pred = clf.predict(X_test_corr)
Add Comment
Please sign in to add a comment.