# Regression
# Project 2
from matplotlib.pyplot import figure, plot, subplot, title, xlabel, ylabel, show, clim
import matplotlib
from scipy.io import loadmat
import sklearn.linear_model as lm
from sklearn import model_selection
from toolbox_02450 import feature_selector_lr, bmplot
import numpy as np
# Data prepared in Project 1: dataInArray holds the raw attributes (column
# index 4 is used as the regression target below), dataOutOfK_ALL is the
# one-out-of-K encoded design matrix, and normalizeSet normalizes a data set.
from Project1 import dataInArray, normalizeSet, dataOutOfK_ALL
# Split the data into input and output, excluding names and nominal attributes.

print(dataOutOfK_ALL)

matplotlib.rcParams.update({'font.size': 5})
attributeNames = ['LM: N. America', 'LM: S. America', 'LM: Europe', 'LM: Africa', 'LM: Asia', 'LM: Oceania', 'Zone: NE', 'Zone: SE', 'Zone: SW', 'Zone: NW', 'Area', 'Language: English', 'Language: Spanish', 'Language: French', 'Language: German', 'Language: Slavic', 'Language: Other Indo-European', 'Language: Chinese', 'Language: Arabic', 'Language: J/T/F/M', 'Language: Others', 'Bars', 'Stripes', 'Colours', 'Red', 'Green', 'Blue', 'Gold', 'White', 'Black', 'Orange', 'Mainhue: Red', 'Mainhue: Green', 'Mainhue: Blue', 'Mainhue: Gold', 'Mainhue: White', 'Mainhue: Black', 'Mainhue: Orange/brown', '#Circles', '#Crosses', '#Saltires', '#Quarters', '#Sunstars', 'Crescent', 'Triangles', 'Icons', 'Animates', 'Text', 'TLC: Red', 'TLC: Green', 'TLC: Blue', 'TLC: Gold', 'TLC: White', 'TLC: Black', 'TLC: Orange/brown', 'BRC: Red', 'BRC: Green', 'BRC: Blue', 'BRC: Gold', 'BRC: White', 'BRC: Black', 'BRC: Orange/brown']
# Center the output (subtract its mean).
y = (dataInArray[:, 4].astype(np.double) - np.mean(dataInArray[:, 4].astype(np.double)))


X = dataOutOfK_ALL.astype(np.double)
N, M = X.shape
print(M)
print(N)
## Crossvalidation
# Create crossvalidation partition for evaluation
K = 10
CV = model_selection.KFold(n_splits=K, shuffle=True)

# Initialize variables
Features = np.zeros((M, K))
Error_train = np.empty((K, 1))
Error_test = np.empty((K, 1))
Error_train_fs = np.empty((K, 1))
Error_test_fs = np.empty((K, 1))
Error_train_nofeatures = np.empty((K, 1))
Error_test_nofeatures = np.empty((K, 1))

k = 0
for train_index, test_index in CV.split(X):

    # Extract training and test set for current CV fold
    X_train = np.copy(X[train_index, :])
    y_train = np.copy(y[train_index])
    X_test = np.copy(X[test_index, :])
    y_test = y[test_index]
    internal_cross_validation = 10

    X_train = normalizeSet(X_train, M)
    y_train = normalizeSet(y_train, M)
    X_test = normalizeSet(X_test, M)
    y_test = normalizeSet(y_test, M)

    # Compute squared error without using the input data at all (baseline: predict the mean)
    Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

    # Compute squared error with all features selected (no feature selection)
    m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
    Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
    Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

    # Compute squared error with feature subset selection
    # textout = 'verbose'
    textout = ''
    selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train,
                                                                          internal_cross_validation,
                                                                          display=textout)

    Features[selected_features, k] = 1
    # .. alternatively you could use module sklearn.feature_selection
    # (a minimal sketch is appended at the end of this script)
    if len(selected_features) == 0:
        print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

    print('Cross validation fold {0}/{1}'.format(k + 1, K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}\n'.format(selected_features.size))

    k += 1

# Display results
print('\n')
print('Linear regression without feature selection:\n')
print('- Training error: {0}'.format(Error_train.mean()))
print('- Test error: {0}'.format(Error_test.mean()))
print('- R^2 train: {0}'.format((Error_train_nofeatures.sum() - Error_train.sum()) / Error_train_nofeatures.sum()))
print('- R^2 test: {0}'.format((Error_test_nofeatures.sum() - Error_test.sum()) / Error_test_nofeatures.sum()))
print('Linear regression with feature selection:\n')
print('- Training error: {0}'.format(Error_train_fs.mean()))
print('- Test error: {0}'.format(Error_test_fs.mean()))
print('- R^2 train: {0}'.format((Error_train_nofeatures.sum() - Error_train_fs.sum()) / Error_train_nofeatures.sum()))
print('- R^2 test: {0}'.format((Error_test_nofeatures.sum() - Error_test_fs.sum()) / Error_test_nofeatures.sum()))
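# Note on the R^2 values printed above: Error_*_nofeatures[k] is the per-fold
# mean squared error of simply predicting the mean of y in that split, so
# (Error_*_nofeatures.sum() - Error_*.sum()) / Error_*_nofeatures.sum()
# equals 1 - MSE_model / MSE_baseline, i.e. the coefficient of determination
# R^2 pooled over the K folds.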

figure(k)
subplot(1, 3, 2)
bmplot(attributeNames, range(1, Features.shape[1] + 1), -Features)
clim(-1.5, 0)
xlabel('Crossvalidation fold')
ylabel('Attribute')

# Inspect the effect of the selected features' coefficients on the entire
# dataset and plot the fitted model's residual error as a function of each
# attribute to check for systematic structure in the residuals

f = 2  # cross-validation fold to inspect
ff = Features[:, f - 1].nonzero()[0]
if len(ff) == 0:
    print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
else:
    m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)

    y_est = m.predict(X[:, ff])
    residual = y - y_est

    figure(k + 1, figsize=(12, 6))
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
    for i in range(0, len(ff)):
        subplot(2, int(np.ceil(len(ff) / 2.0)), i + 1)
        plot(X[:, ff[i]], residual, '.')
        xlabel(attributeNames[ff[i]])
        ylabel('residual error')

show()
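
# The comment inside the cross-validation loop mentions sklearn.feature_selection
# as an alternative to the course toolbox's feature_selector_lr. The snippet below
# is only a minimal sketch of that idea, assuming scikit-learn >= 0.24 (which
# provides SequentialFeatureSelector); the choice of 10 features and the scoring
# metric are arbitrary, illustrative settings, not part of the original analysis.
from sklearn.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(lm.LinearRegression(fit_intercept=True),
                                n_features_to_select=10,
                                direction='forward',
                                scoring='neg_mean_squared_error',
                                cv=10)
sfs.fit(X, y)
print('Forward-selected feature indices (sklearn sketch): {0}'.format(sfs.get_support(indices=True)))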