Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from typing import List, Dict, Any
- import matplotlib.pyplot as plt
- import numpy as np
- import math
- from scipy.stats.stats import pearsonr
- from scipy.stats.mstats import zscore
- from sklearn.decomposition import PCA
- from sklearn.feature_selection import SelectKBest, chi2
- from sklearn import preprocessing
- from numpy import array
- from matplotlib import colors as mcolors
# Shared module state, populated by readData()/openFile().
attributes: List[str] = []            # attribute names parsed from the CSV header
mapClass: Dict[str, int] = {}         # class label -> numeric class index
revMapClass: Dict[int, str] = {}      # numeric class index -> class label
classes: List[List[float]] = []       # one list of records per class
numOfClasses = 0                      # distinct class labels seen so far
numOfReadings = 0                     # total data rows read across all files
numOfRecords = 0                      # rows per class (assumes balanced data)
# One plot color per attribute index.
colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000']
def openFile(path):
    """Read one segmentation CSV file and append its rows to the module state.

    The first line is the header (attribute names); each following line is
    ``<class-label>,<float>,<float>,...``.  Records are appended to
    ``classes[mapClass[label]]``, which readData() must have pre-created.

    Fixes vs. original: the file handle was never closed (use ``with``),
    the parameter shadowed the builtin ``str``, and membership used
    ``__contains__`` instead of the ``in`` operator.
    """
    global attributes
    global numOfClasses
    global numOfReadings
    with open(path, "r") as f1:
        content: List[str] = f1.read().splitlines()
    attributes = content[0].split(',')
    for i in range(1, len(content)):
        curdata = content[i].split(',')
        label = curdata[0]
        # First time we see this label: assign it the next class index.
        if label not in mapClass:
            mapClass[label] = numOfClasses
            revMapClass[numOfClasses] = label
            numOfClasses += 1
        # Remaining columns are the numeric attribute values.
        tmp: List[float] = [float(v) for v in curdata[1:]]
        classes[mapClass[label]].append(tmp)
    numOfReadings += (len(content) - 1)  # every row except the header
def readData():
    """Load both segmentation files, then report dataset dimensions.

    Pre-creates the 7 per-class buckets that openFile() appends into, and
    derives numOfRecords assuming the classes are balanced.
    """
    global numOfRecords
    # The segmentation dataset has 7 classes; reserve a bucket for each.
    for _ in range(7):
        classes.append([])
    openFile("segmentation.test")
    openFile("segmentation.data")
    print(f"Number of Readings {numOfReadings}")
    print(f"Number of Classes {numOfClasses}")
    print(f"Number of Attributes {len(attributes)}")
    numOfRecords = numOfReadings // numOfClasses
def plotData():
    """Plot every record of every class as one flat sequence."""
    flattened = [
        classes[c][r]
        for c in range(numOfClasses)
        for r in range(numOfRecords)
    ]
    plt.title("Data Plot")
    plt.plot(flattened)
    plt.show()
def pearson():
    """Compute, display, and return the attribute Pearson-correlation matrix.

    Returns a len(attributes) x len(attributes) symmetric matrix whose
    entry [i][j] is the Pearson r between attribute columns i and j over
    all records of all classes.

    Fixes vs. original: the last diagonal entry was left at 0 (the inner
    loop never runs for i == n-1), the diagonal was pointlessly recomputed
    via pearsonr(x, x) on every inner iteration, and each pair's
    correlation was computed twice (pearsonr is symmetric).
    """
    n = len(attributes)
    Matrix = [[0.0 for _ in range(n)] for _ in range(n)]
    # Gather each attribute's full column once instead of per (i, j) pair.
    columns = [[] for _ in range(n)]
    for c in range(numOfClasses):
        for record in classes[c]:
            for a in range(n):
                columns[a].append(record[a])
    for i in range(n):
        Matrix[i][i] = 1.0  # an attribute correlates perfectly with itself
        for j in range(i + 1, n):
            r = pearsonr(columns[i], columns[j])[0]
            Matrix[i][j] = r
            Matrix[j][i] = r  # symmetric: pearsonr(x, y) == pearsonr(y, x)
    plt.imshow(Matrix)
    plt.title("Pearson Matrix")
    plt.show()
    return Matrix
def covariance():
    """Return np.cov over the attribute columns of all class records."""
    # One column per attribute; fill each from every record of every class.
    columns = [[] for _ in range(len(attributes))]
    for c in range(numOfClasses):
        for r in range(numOfRecords):
            record = classes[c][r]
            for a in range(len(attributes)):
                columns[a].append(record[a])
    return np.cov(columns)
def check(pearson, cov):
    """Rescale a correlation matrix back into covariances, in place.

    Each entry pearson[i][j] is multiplied by sqrt(cov[i][i] * cov[j][j])
    (the product of the two standard deviations), which should reproduce
    cov[i][j] if the two matrices are consistent.  Prints the matrix
    before and after, plus cov, for manual comparison.

    Fix vs. original: the printed label "pearosn" was misspelled.

    :param pearson: square correlation matrix (mutated in place)
    :param cov: covariance matrix of the same attributes
    """
    print("pearson before: ")
    for i in range(len(pearson)):
        print(pearson[i])
    for i in range(len(pearson)):
        for j in range(len(pearson[0])):
            # corr(i, j) * std_i * std_j == cov(i, j)
            pearson[i][j] = pearson[i][j] * math.sqrt(cov[i][i]) * math.sqrt(cov[j][j])
    print("pearson: ")
    for i in range(len(pearson)):
        print(pearson[i])
    print("cov")
    for i in range(len(cov)):
        print(cov[i])
def histogram(matrix, title):
    """Draw one histogram figure per class, one color per attribute.

    :param matrix: per-class list of records, matrix[class][record][attr]
    :param title: figure suptitle

    Fixes vs. original: ``current_class[:][j]`` copied the record list and
    then selected the j-th RECORD (a row), not the j-th attribute column,
    so the histograms plotted the wrong data; and ``legend(attributes[j])``
    passed a bare string, which matplotlib iterates character by character.
    """
    for i in range(numOfClasses):
        current_class = matrix[i]
        f, (ax1) = plt.subplots(1, 1)
        plt.suptitle(title)
        for j in range(len(attributes)):
            # j-th attribute column across all of this class's records
            column = [record[j] for record in current_class]
            ax1.hist(column, color=colors[j])
        ax1.legend(attributes)  # one label per attribute series
        ax1.set_title(revMapClass[i] + " histogram")
        plt.show()
def histogram_bins():
    """Redraw each class's attribute histograms at several bin counts.

    Fixes vs. original: ``current_class[:][j]`` selected the j-th RECORD
    instead of the j-th attribute column (same bug as histogram()), and the
    bin-count loop variable ``k`` was reused as the comprehension variable.
    """
    for i in range(numOfClasses):
        for bins in [5, 10, 12]:
            current_class = classes[i]
            f, (ax1) = plt.subplots(1, 1)
            for j in range(len(attributes)):
                # j-th attribute column across this class's records
                column = [record[j] for record in current_class]
                ax1.hist(column, bins=bins, color=colors[j])
            ax1.set_title(revMapClass[i] + " bins: " + str(bins))
            plt.show()
def z_score():
    """Return a z-score-normalized copy of `classes` (NaNs mapped to 0).

    Normalizes each attribute column over all classes/records, plots the
    result via histogram(), and returns matrix[class][record][attr].

    Fix vs. original: the output was built with list multiplication
    (``[[[0]*a]*r]*c``), so every class shared the SAME row objects and
    each write clobbered the corresponding cell in all classes.  Built
    with nested comprehensions instead, so every row is independent.
    Also renamed ``id`` (shadowed the builtin) to ``idx``.
    """
    matrix = [
        [[0] * len(attributes) for _ in range(numOfRecords)]
        for _ in range(numOfClasses)
    ]
    for i in range(len(attributes)):
        attr = []
        for j in range(numOfClasses):
            for k in range(numOfRecords):
                attr.append(classes[j][k][i])
        attr_normalized = zscore(attr)
        idx = 0
        print(attr_normalized)
        for j in range(numOfClasses):
            for k in range(numOfRecords):
                # zscore yields NaN for zero-variance columns; map to 0.
                if math.isnan(attr_normalized[idx]):
                    matrix[j][k][i] = 0
                else:
                    matrix[j][k][i] = attr_normalized[idx]
                idx += 1
    histogram(matrix, "After z-score normalization")
    return matrix
def min_max_scaler():
    """Min-max scale all records, regroup them per class, plot, and return.

    Flattens classes into one sample matrix, fits sklearn's MinMaxScaler,
    then splits the scaled rows back into numOfClasses groups of
    numOfRecords rows each.
    """
    scaler = preprocessing.MinMaxScaler()  # local no longer shadows this function
    flat = [
        classes[c][r]
        for c in range(numOfClasses)
        for r in range(numOfRecords)
    ]
    scaled = scaler.fit_transform(flat)
    grouped = [[] for _ in range(numOfClasses)]
    for idx in range(len(scaled)):
        grouped[idx // numOfRecords].append(scaled[idx])
    histogram(grouped, "After min-max-scaler")
    return grouped
def get_pca(normalized_matrix):
    """Fit a 15-component PCA on the flattened data; print variance ratios.

    :param normalized_matrix: per-class records, matrix[class][record]
    """
    samples = [
        normalized_matrix[c][r]
        for c in range(numOfClasses)
        for r in range(numOfRecords)
    ]
    pca = PCA(n_components=15)
    pca.fit_transform(samples)
    print(pca.explained_variance_ratio_)
def get_kbest(matrix, bestK):
    """Select the bestK highest chi2-scoring attributes; return reduced samples.

    :param matrix: per-class records, matrix[class][record]
    :param bestK: number of attributes to keep
    :return: the flattened samples reduced to the bestK selected columns

    Fixes vs. original: ``fit.tranform`` was a typo (AttributeError at
    runtime), and transform() must receive the flat 2-D sample matrix X,
    not the nested per-class ``matrix``.
    """
    X = [
        matrix[c][r]
        for c in range(numOfClasses)
        for r in range(numOfRecords)
    ]
    # NOTE(review): every sample gets the same dummy label, so the chi2
    # scores carry no class information — consider passing real labels.
    fit = SelectKBest(chi2, k=bestK).fit(X, [1 for _ in range(len(X))])
    print(fit.scores_)
    X_new = fit.transform(X)
    print(X_new)
    return X_new
if __name__ == "__main__":
    # Load the data, then run the min-max-scaling + feature-selection path.
    # The remaining analysis steps are kept here, disabled, for reference.
    readData()
    # plotData()
    # Pearson = pearson()
    # cov = covariance()
    # check(Pearson, cov)
    # histogram(classes, "Raw data")
    # histogram_bins()
    # z_score_matrix = z_score()
    scaler_matrix = min_max_scaler()
    # get_pca(z_score_matrix)
    get_kbest(scaler_matrix, 1)
    plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement