# Untitled

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import mean_squared_error as mse


# Example purchase matrix: one row per product category, one column per
# customer; each cell is the value recorded for that (category, customer) pair.
_PRODUCTS = ('Овощи', 'Фрукты', 'Сладости', 'Хлеб', 'Кофе')
_CUSTOMERS = ('Миша', 'Маша', 'Рома', 'Дима', 'Витя', 'Вова')
_COUNTS = np.array([
    [0, 1, 0, 1, 2, 2],
    [2, 3, 1, 1, 2, 2],
    [1, 1, 1, 0, 1, 1],
    [0, 2, 3, 4, 1, 1],
    [0, 0, 0, 0, 1, 0],
])
V = pd.DataFrame(_COUNTS, index=_PRODUCTS, columns=_CUSTOMERS)

def reconstruct(model, data):
    """Fit ``model`` on ``data`` and rebuild ``data`` from the factorization.

    Args:
        model: estimator instance exposing ``fit``, ``transform`` and either
            ``inverse_transform`` or ``components_`` (for example sklearn
            NMF, PCA or TruncatedSVD). It is fitted in place.
        data: pandas DataFrame to decompose.

    Returns:
        A ``(reconstructed, interpolated)`` tuple of DataFrames that share
        the index and columns of ``data``. ``reconstructed`` is the
        approximation rounded to 2 decimals; ``interpolated`` is the same
        matrix with every row linearly rescaled from its own
        ``[min, max]`` to ``[0, 1]`` and rounded to 2 decimals.
    """
    model.fit(data)
    W = model.transform(data)
    if hasattr(model, "inverse_transform"):
        # inverse_transform also restores whatever the estimator removed
        # before projecting (e.g. the per-feature mean in PCA); a bare
        # W @ H product would leave the reconstruction offset.
        approx = model.inverse_transform(W)
    else:
        approx = np.dot(W, model.components_)
    reconstructed = pd.DataFrame(
        np.round(np.asarray(approx), 2), columns=data.columns, index=data.index
    )
    # For SVD or PCA the approximation can contain negative values;
    # uncomment the next line to replace them with zeros.
    # reconstructed[reconstructed < 0] = 0

    # Rescale each row to [0, 1]. Rows are taken by position so that
    # repeated index labels cannot turn a single row into a sub-frame.
    scaled_rows = [
        np.interp(row, (row.min(), row.max()), (0, 1))
        for row in reconstructed.to_numpy()
    ]
    interpolated = pd.DataFrame(
        np.round(scaled_rows, 2),
        index=reconstructed.index,
        columns=reconstructed.columns,
    )
    return reconstructed, interpolated

def sparsity(data):
    """Return the fraction of entries in ``data`` that are exactly zero.

    ``data`` must expose ``size`` (numpy array or pandas DataFrame).
    """
    nonzero_share = np.count_nonzero(data) / data.size
    return 1.0 - nonzero_share


def rel_spars(data):
    """Return ``1 - sparsity(data) / sparsity(V)``.

    The result compares the share of zeros in ``data`` with the share of
    zeros in the module-level matrix ``V``: 0 means equally sparse, 1 means
    ``data`` has no zeros at all. ``V`` must contain at least one zero,
    otherwise the division fails.
    """
    sparsity_ratio = sparsity(data) / sparsity(V)
    return 1 - sparsity_ratio


def evaluate_plot(model, data, ks=(1, 2, 3, 4, 5)):
    """Plot reconstruction metrics of ``model`` on ``data`` for several ranks.

    For every ``k`` in ``ks`` the estimator ``model(n_components=k)`` is
    fitted through ``reconstruct`` and four values are recorded: mean squared
    error, mean absolute error, explained variance score and relative
    sparsity of the reconstruction. The four curves are drawn on one figure.
    The first rank whose MSE, rounded to one decimal, equals that of the next
    rank is highlighted.

    Args:
        model: estimator class (not an instance) accepting ``n_components``.
        data: pandas DataFrame to decompose.
        ks: iterable of component counts to evaluate.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    ks = list(ks)
    EVS = []
    MSE = []
    MAE = []
    SPARS = []
    for k in ks:
        # Fit once per rank and reuse the reconstruction for every metric.
        approx = reconstruct(model(n_components=k), data)[0]
        MAE.append(mae(data, approx))
        MSE.append(mse(data, approx))
        EVS.append(evs(data, approx))
        SPARS.append(rel_spars(approx))
    plt.xlabel('N - components')
    plt.ylabel('Value')
    for curve in (MSE, MAE, EVS, SPARS):
        plt.plot(curve)
    METRIC = np.round(MSE, 1)
    # Compare each rank with its successor only; the last rank has no
    # successor, so the scan stops one element before the end.
    for i in range(len(METRIC) - 1):
        if METRIC[i] == METRIC[i + 1]:
            plt.scatter(i, MSE[i], c='red')
            plt.scatter(i, EVS[i], c='red')
            plt.scatter(i, SPARS[i], c='green')
            plt.text(i, EVS[i] + 0.01, str(np.round(EVS[i], 3)))
            plt.text(i, SPARS[i] + 0.01, str(np.round(SPARS[i], 3)))
            plt.vlines(i, 0, 1, colors='red')
            break
    # The first curve is the mean squared error (no square root is taken).
    plt.legend(('MSE', 'MAE', 'EVS', 'SPARSITY'), loc='best')
    plt.xticks(list(range(len(ks))), ks)
    plt.show()


model = NMF
# or use other models to see result
#model = PCA
#model = SVD

#usage
evaluate_plot(model, V)
# reconstruct() needs a fitted-able estimator instance, not the class itself.
# NOTE(review): 2 components is a placeholder — pick the rank suggested by
# the plot above.
n_components = 2
reconstructed, interpolated = reconstruct(model(n_components=n_components), V)
print(reconstructed) #reconstructed
print(interpolated) #interpolated