# Untitled

# Regression
# Project 2
from matplotlib.pyplot import figure, plot, subplot, title, xlabel, ylabel, show, clim
import matplotlib
from scipy.io import loadmat
import sklearn.linear_model as lm
from sklearn import model_selection
from toolbox_02450 import feature_selector_lr, bmplot
import numpy as np
from Project1 import dataInArray, normalizeSet, dataOutOfK_ALL
# Build the regression inputs (X) and target (y) from the arrays exported by
# Project1; attribute names are kept separately for plot labelling.

print(dataOutOfK_ALL)

matplotlib.rcParams.update({'font.size': 5})

# One label per column of X, used as axis labels in the plots further down.
attributeNames = [
    'LM: N. America', 'LM: S. America', 'LM: Europe', 'LM: Africa',
    'LM: Asia', 'LM: Oceania',
    'Zone: NE', 'Zone: SE', 'Zone: SW', 'Zone: NW',
    'Area',
    'Language: English', 'Language: Spanish', 'Language: French',
    'Language: German', 'Language: Slavic', 'Language: Other Indo-European',
    'Language: Chinese', 'Language: Arabic', 'Language: J/T/F/M',
    'Language: Others',
    'Bars', 'Stripes', 'Colours',
    'Red', 'Green', 'Blue', 'Gold', 'White', 'Black', 'Orange',
    'Mainhue: Red', 'Mainhue: Green', 'Mainhue: Blue', 'Mainhue: Gold',
    'Mainhue: White', 'Mainhue: Black', 'Mainhue: Orange/brown',
    '#Circles', '#Crosses', '#Saltires', '#Quarters', '#Sunstars',
    'Crescent', 'Triangles', 'Icons', 'Animates', 'Text',
    'TLC: Red', 'TLC: Green', 'TLC: Blue', 'TLC: Gold', 'TLC: White',
    'TLC: Black', 'TLC: Orange/brown',
    'BRC: Red', 'BRC: Green', 'BRC: Blue', 'BRC: Gold', 'BRC: White',
    'BRC: Black', 'BRC: Orange/brown',
]

# Target: column 4 of dataInArray, mean-centred (no scaling is applied here).
target_column = dataInArray[:, 4].astype(np.double)
y = target_column - np.mean(target_column)

# Inputs: N observations by M attributes.
X = dataOutOfK_ALL.astype(np.double)
N, M = X.shape
print(M, N, sep='\n')
## Cross-validation setup
# Shuffled K-fold partition used to evaluate every model below.
K = 10
CV = model_selection.KFold(n_splits=K, shuffle=True)

# Features[i, j] is set to 1 when attribute i is selected in fold j.
Features = np.zeros((M, K))

# One (K, 1) buffer of per-fold squared errors for each model/partition pair.
# The buffers are allocated uninitialised and filled inside the fold loop.
(
    Error_train,
    Error_test,
    Error_train_fs,
    Error_test_fs,
    Error_train_nofeatures,
    Error_test_nofeatures,
) = (np.empty((K, 1)) for _ in range(6))

# Outer cross-validation loop. For every fold, record the mean squared error
# on the training and test partitions of:
#   (a) a baseline that ignores the inputs and predicts the partition mean,
#   (b) linear regression on all attributes,
#   (c) linear regression on the attributes returned by feature_selector_lr.
internal_cross_validation = 10  # fold count handed to feature_selector_lr
textout = ''  # use 'verbose' to get progress output from feature_selector_lr

for k, (train_index, test_index) in enumerate(CV.split(X)):

    # extract training and test set for current CV fold
    X_train = np.copy(X[train_index, :])
    y_train = np.copy(y[train_index])
    X_test = np.copy(X[test_index, :])
    y_test = y[test_index]

    # NOTE(review): normalizeSet comes from Project1 and is not visible here.
    # Each partition is passed through it separately; if it standardises with
    # the statistics of the array it receives, the test data is scaled with
    # its own statistics rather than those of the training set -- confirm.
    X_train = normalizeSet(X_train, M)
    y_train = normalizeSet(y_train, M)
    X_test = normalizeSet(X_test, M)
    y_test = normalizeSet(y_test, M)

    # Compute squared error without using the input data at all
    Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

    # Compute squared error with all features selected (no feature selection)
    m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
    Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
    Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

    # Compute squared error with feature subset selection
    selected_features, features_record, loss_record = feature_selector_lr(
        X_train, y_train, internal_cross_validation, display=textout)

    Features[selected_features, k] = 1
    # .. alternatively you could use module sklearn.feature_selection
    if len(selected_features) == 0:
        print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
        # With no inputs the fitted model reduces to its intercept, i.e. the
        # training mean. Record that model's errors so this fold's slots do
        # not keep whatever np.empty left in them and distort the averages.
        Error_train_fs[k] = Error_train_nofeatures[k]
        Error_test_fs[k] = np.square(y_test - y_train.mean()).sum() / y_test.shape[0]
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        # Per-fold figure: selection loss per iteration (left) and the
        # feature-selection record per iteration (right).
        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

    print('Cross validation fold {0}/{1}'.format(k + 1, K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}\n'.format(selected_features.size))

# Code after the loop opens figure(k) and figure(k + 1); leave k one past the
# last fold index so those figures do not reuse a per-fold figure number.
k = K

# Report the fold-averaged errors and R^2 of both regression variants, then
# draw which attributes were selected in each cross-validation fold.
baseline_train = Error_train_nofeatures.sum()
baseline_test = Error_test_nofeatures.sum()

print('\n')
for heading, err_train, err_test in (
    ('Linear regression without feature selection:\n', Error_train, Error_test),
    ('Linear regression with feature selection:\n', Error_train_fs, Error_test_fs),
):
    print(heading)
    print('- Training error: {0}'.format(err_train.mean()))
    print('- Test error:     {0}'.format(err_test.mean()))
    # R^2 = 1 - (model error / error of the no-features baseline)
    print('- R^2 train:     {0}'.format((baseline_train - err_train.sum()) / baseline_train))
    print('- R^2 test:     {0}'.format((baseline_test - err_test.sum()) / baseline_test))

# Attribute-by-fold selection matrix (negated for the colour map).
figure(k)
subplot(1, 3, 2)
fold_ticks = range(1, Features.shape[1] + 1)
bmplot(attributeNames, fold_ticks, -Features)
clim(-1.5, 0)
xlabel('Crossvalidation fold')
ylabel('Attribute')

# Inspect selected feature coefficients effect on the entire dataset and
# plot the fitted model residual error as function of each attribute to
# inspect for systematic structure in the residual

f = 2  # cross-validation fold to inspect (1-based; column f - 1 of Features)
ff = Features[:, f - 1].nonzero()[0]  # indices of the attributes selected in that fold
if len(ff) == 0:
    print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
else:
    # Refit on the full dataset using only the attributes chosen in fold f.
    m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)

    y_est = m.predict(X[:, ff])
    residual = y - y_est

    figure(k + 1, figsize=(12, 6))
    # NOTE(review): title() is called before any subplot exists, so it is
    # attached to an implicitly created axes -- confirm it renders as intended.
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))

    # Two rows of panels; subplot() needs an integer column count, and
    # np.ceil returns a float, hence the explicit int().
    n_cols = int(np.ceil(len(ff) / 2.0))
    for i, feature_index in enumerate(ff):
        subplot(2, n_cols, i + 1)
        plot(X[:, feature_index], residual, '.')
        xlabel(attributeNames[feature_index])
        ylabel('residual error')

show()