Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2018
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.42 KB | None | 0 0
  1. from typing import List, Dict, Any
  2.  
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. import math
  6. from scipy.stats.stats import pearsonr
  7. from scipy.stats.mstats import zscore
  8. from sklearn.decomposition import PCA
  9. from sklearn.feature_selection import SelectKBest, chi2
  10. from sklearn import preprocessing
  11.  
  12. from numpy import array
  13. from matplotlib import colors as mcolors
  14.  
  15. attributes: List[str] = []
  16. mapClass: Dict[str, int] = dict()
  17. revMapClass: Dict[int, str] = dict()
  18. classes: List[List[float]] = []
  19. numOfClasses = 0
  20. numOfReadings = 0
  21. numOfRecords = 0
  22. colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000']
  23.  
  24. def openFile(str):
  25. global attributes
  26. global numOfClasses
  27. global numOfReadings
  28. f1 = open(str, "r")
  29. content: List[str] = f1.read().splitlines()
  30. attributes = content[0].split(',')
  31. for i in range(1, len(content)):
  32. curdata = content[i].split(',')
  33. if (not mapClass.__contains__(curdata[0])):
  34. mapClass[curdata[0]] = numOfClasses
  35. revMapClass[numOfClasses] = curdata[0]
  36. numOfClasses += 1
  37. tmp: List[float] = []
  38. for j in range(1, len(curdata)):
  39. tmp.append(float(curdata[j]))
  40. classes[mapClass[curdata[0]]].append(tmp)
  41. numOfReadings += (len(content) - 1)
  42.  
  43.  
  44. def readData():
  45. global numOfRecords
  46. for i in range(7):
  47. empty_list = []
  48. classes.append(empty_list)
  49. openFile("segmentation.test")
  50. openFile("segmentation.data")
  51. print("Number of Readings " + str(numOfReadings))
  52. print("Number of Classes " + str(numOfClasses))
  53. print("Number of Attributes " + str(len(attributes)))
  54. numOfRecords = int(numOfReadings / numOfClasses)
  55.  
  56.  
  57. def plotData():
  58. X = []
  59. for i in range(numOfClasses):
  60. for j in range(numOfRecords):
  61. X.append(classes[i][j])
  62. plt.title("Data Plot")
  63. plt.plot(X)
  64. plt.show()
  65.  
  66. def pearson():
  67. Matrix = [[0 for x in range(len(attributes))] for y in range(len(attributes))]
  68. for i in range(len(attributes)):
  69. for j in range(i + 1, len(attributes)):
  70. x = []
  71. y = []
  72. for c in range(numOfClasses):
  73. for cur in range(len(classes[c])):
  74. x.append(classes[c][cur][i])
  75. y.append(classes[c][cur][j])
  76. Matrix[i][i] = pearsonr(x, x)[0]
  77. Matrix[i][j] = pearsonr(x, y)[0]
  78. Matrix[j][i] = pearsonr(y, x)[0]
  79. plt.imshow(Matrix)
  80. plt.title("Pearson Matrix")
  81. plt.show()
  82. return Matrix
  83.  
  84.  
  85. def covariance():
  86. data = []
  87. for i in range(len(attributes)):
  88. data.append([])
  89. for i in range(numOfClasses):
  90. for j in range(numOfRecords):
  91. for k in range(len(attributes)):
  92. data[k].append(classes[i][j][k])
  93. return np.cov(data)
  94.  
  95.  
  96. def check(pearson, cov):
  97. print("pearosn before: ")
  98. for i in range(len(pearson)):
  99. print(pearson[i])
  100. for i in range(len(pearson)):
  101. for j in range(len(pearson[0])):
  102. pearson[i][j] = pearson[i][j] * math.sqrt(cov[i][i]) * math.sqrt(cov[j][j])
  103. print("pearosn: ")
  104. for i in range(len(pearson)):
  105. print(pearson[i])
  106. print("cov")
  107. for i in range(len(cov)):
  108. print(cov[i])
  109.  
  110.  
  111. def histogram(matrix, title):
  112. for i in range(numOfClasses):
  113. current_class = matrix[i]
  114. f, (ax1) = plt.subplots(1, 1)
  115. plt.suptitle(title)
  116. for j in range(len(attributes)):
  117. ax1.hist([k for k in current_class[:][j]], color = colors[j])
  118. ax1.legend(attributes[j])
  119. ax1.set_title(revMapClass[i] + " histogram")
  120. plt.show()
  121.  
  122.  
  123. def histogram_bins():
  124. for i in range(numOfClasses):
  125. for k in [5, 10, 12]:
  126. current_class = classes[i]
  127. f, (ax1) = plt.subplots(1, 1)
  128. for j in range(len(attributes)):
  129. ax1.hist([k for k in current_class[:][j]], bins = k, color = colors[j])
  130. ax1.set_title(revMapClass[i] + " bins: " + str(k))
  131. plt.show()
  132.  
  133.  
  134. def z_score():
  135. matrix = [[[0] * len(attributes)] * numOfRecords] * numOfClasses
  136. for i in range(len(attributes)):
  137. attr = []
  138. for j in range(numOfClasses):
  139. for k in range(numOfRecords):
  140. attr.append(classes[j][k][i])
  141. attr_normalized = zscore(attr)
  142. id = 0
  143. print(attr_normalized)
  144. for j in range(numOfClasses):
  145. for k in range(numOfRecords):
  146. if math.isnan(attr_normalized[id]):
  147. matrix[j][k][i] = 0
  148. else:
  149. matrix[j][k][i] = attr_normalized[id]
  150. id += 1
  151.  
  152. histogram(matrix, "After z-score normalization")
  153. return matrix
  154.  
  155. def min_max_scaler():
  156. min_max_scaler = preprocessing.MinMaxScaler()
  157. matrix = []
  158. for i in range(numOfClasses):
  159. for j in range(numOfRecords):
  160. matrix.append(classes[i][j])
  161. data_min_max_scaled = min_max_scaler.fit_transform(matrix)
  162. return_matrix = []
  163. for i in range(numOfClasses):
  164. return_matrix.append([])
  165. for i in range(len(data_min_max_scaled)):
  166. return_matrix[int(i / numOfRecords)].append(data_min_max_scaled[i])
  167. histogram(return_matrix, "After min-max-scaler")
  168. return return_matrix
  169.  
  170.  
  171. def get_pca(normalized_matrix):
  172. pca = PCA(n_components=15)
  173. X = []
  174. for i in range(numOfClasses):
  175. for j in range(numOfRecords):
  176. X.append(normalized_matrix[i][j])
  177. pca.fit_transform(X)
  178. print(pca.explained_variance_ratio_)
  179.  
  180. def get_kbest(matrix, bestK):
  181. X = []
  182. for i in range(numOfClasses):
  183. for j in range(numOfRecords):
  184. X.append(matrix[i][j])
  185. fit = SelectKBest(chi2, k=bestK).fit(X, [1 for z in range(len(X))])
  186. print(fit.scores_)
  187. X_new = fit.tranform(matrix)
  188. print(X_new)
  189. return X_new
  190.  
  191.  
  192. if __name__ == "__main__":
  193. readData()
  194. #plotData()
  195. #Pearson = pearson()
  196. #cov = covariance()
  197. #check(Pearson, cov)
  198. #histogram(classes, "Raw data")
  199. #histogram_bins()
  200. #z_score_matrix = z_score()
  201. scaler_matrix = min_max_scaler()
  202. #get_pca(z_score_matrix)
  203. get_kbest(scaler_matrix, 1)
  204. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement