# Estadística

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from numpy import quantile
from math import sqrt
from matplotlib import style, cm
from prettytable import PrettyTable
from scipy.stats import norm, t

def BasicInformation(SAMPLE, plot=True):
    n = len(SAMPLE)

    SampleMean = sum(SAMPLE) / n
    SampleVariance = sum([(Xi - SampleMean)**2 for Xi in SAMPLE]) / (n - 1)
    def GetQuartiles(SAMPLE): return quantile(SAMPLE, .25), quantile(SAMPLE, .50), quantile(SAMPLE, .75)
    Q1, Q2, Q3 = GetQuartiles(SAMPLE)
    IQR = Q3 - Q1
    OutlierRange = [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    outliersList = [Xi for Xi in SAMPLE if Xi > OutlierRange[-1] or Xi < OutlierRange[0]]
    if plot:
        print('Sample Mean:', SampleMean)
        print('Sample Variance:', SampleVariance)
        print('Sample Standard Deviation:', sqrt(SampleVariance))
        print('Q1:', Q1, '  Q2:', Q2, '  Q3:', Q3)
        print('Outlier Range:', OutlierRange)
        print('Outliers:', set(outliersList))
        plt.boxplot(SAMPLE)
        plt.show()
        plt.hist(SAMPLE)
        plt.show()
    elif not plot:
        return SampleMean, SampleVariance, sqrt(SampleVariance), n, IQR, Q1, Q2, Q3, OutlierRange, set(outliersList)

def LinearRegression(X, Y, printValues=False, prediction=None):
    """Fit the least-squares line ``G(x) = B0 + B1*x`` and report on it.

    The function prints the ANOVA table and R2, optionally prints the
    intermediate sums and coefficient t-statistics, optionally prints
    fitted values at requested points, and finally shows a plot with
    the observations (blue dots), the fitted value at every observed x
    (red dots) and the fitted line (red).

    Args:
        X: Sequence of predictor values.
        Y: Sequence of response values, indexed in step with ``X``.
        printValues: When truthy, also print Sxx, Syy, Sxy, B0, B1 and
            the t-statistics of both coefficients.
        prediction: A single x value, or a list/tuple of x values, at
            which the fitted value is printed.  ``None`` skips this.

    Returns:
        None.  Every result is printed or plotted.

    Note:
        The formulas divide by ``Sxx``, by ``n - 2`` and by the error
        mean square, so constant ``X``, two observations or a perfect
        fit lead to a division by zero.
    """
    n = len(X)
    X_Mean, Y_Mean = sum(X) / n, sum(Y) / len(Y)

    # Corrected sums of squares and cross products.
    Sxx = sum((Xi - X_Mean) ** 2 for Xi in X)
    Sxy = sum((Xi - X_Mean) * (Y[i] - Y_Mean) for i, Xi in enumerate(X))
    Syy = sum((Yi - Y_Mean) ** 2 for Yi in Y)

    # Least-squares slope and intercept.
    B1 = Sxy / Sxx
    B0 = Y_Mean - B1 * X_Mean

    def G(x):
        """Return the fitted value at x."""
        return B0 + B1 * x

    if isinstance(prediction, (list, tuple)):
        for point in prediction:
            print(f'Predicted Value at G({point}) = {G(point)}')
    elif prediction is not None:
        print(f'Predicted Value at G({prediction}) = {G(prediction)}')

    # ANOVA decomposition; the model has one degree of freedom, so its
    # mean square equals its sum of squares.
    SSReg = B1**2 * Sxx
    SSErr = Syy - SSReg
    MSReg = SSReg
    MSErr = SSErr / (n - 2)
    SSTot = SSReg + SSErr
    F = MSReg / MSErr
    R2 = SSReg / Syy

    # t-statistics: each coefficient divided by its standard error.
    t_B0 = B0 / sqrt(MSErr * ((1 / n) + (X_Mean**2 / Sxx)))
    t_B1 = B1 / sqrt(MSErr / Sxx)

    if printValues:
        print('Sxx:', Sxx)
        print('Syy:', Syy)
        print('Sxy:', Sxy)
        print('B0:', B0)
        print('B1:', B1)
        print('B0 T-Statistic:', t_B0)
        print('B1 T-Statistic:', t_B1)

    # ANOVA Table
    tableObject = PrettyTable()
    tableObject.field_names = ['     ', 'Sum of Squares', 'df', 'Mean of Squares', 'F']
    tableObject.add_row(['Model', round(SSReg, 5), 1, round(MSReg, 5), round(F, 5)])
    tableObject.add_row(['Error', round(SSErr, 5), n - 2, round(MSErr, 5), ''])
    tableObject.add_row(['Total', round(SSTot, 5), n - 1, '', ''])
    print(tableObject)
    print('R2:', R2)

    # 'o' selects only the marker; the colour comes from the keyword so
    # it is not specified twice (once in the format string, once as a
    # keyword argument).
    plt.plot(X, [G(Xi) for Xi in X], 'o', color='red')

    grid = np.linspace(min(X), max(X), 100)
    plt.title(f'G(X) = {round(B0, 5)} + {round(B1, 5)}x')
    plt.plot(grid, G(grid), color='red')
    plt.plot(X, Y, 'o', color='blue')
    plt.show()

def MultipleLinearRegression(X1, X2, Y):
    """Fit ``Y = B0 + B1*X1 + B2*X2`` by least squares and plot the plane.

    The three coefficients are printed, then a 3D figure is shown with
    the observations as red points and the fitted plane as a
    semi-transparent blue surface.

    Args:
        X1: Sequence of values of the first predictor.
        X2: Sequence of values of the second predictor.
        Y: Sequence of response values.

    Returns:
        None.  The coefficients are printed and the figure is shown.
    """
    # Design matrix with columns [1, X1, X2].
    design = np.transpose([np.ones(len(X1)), X1, X2])
    response = np.transpose(Y)

    # Solve the normal equations (X'X) B = X'Y as a linear system
    # instead of forming the explicit inverse of X'X.
    B = np.linalg.solve(design.T @ design, design.T @ response)

    # 10 x 10 grid spanning the observed predictor ranges.
    grid_x1, grid_x2 = np.meshgrid(
        np.linspace(min(X1), max(X1), 10),
        np.linspace(min(X2), max(X2), 10),
    )
    plane = B[0] + B[1] * grid_x1 + B[2] * grid_x2
    print('B0:', B[0])
    print('B1:', B[1])
    print('B2:', B[2])

    # Start a fresh figure so the 3D axes do not land on an existing one.
    plt.figure()
    ax = plt.axes(projection='3d')
    ax.scatter3D(X1, X2, response, color='red')
    ax.plot_surface(grid_x1, grid_x2, plane, alpha=0.75, cmap=cm.Blues)
    plt.show()

def Compute_PValue(Z, testSide):
    """Compute and print the p-value of a Z-test.

    Args:
        Z: Observed value of the standard normal test statistic.
        testSide: 'Left', 'Right' or 'Double' (any letter case), naming
            the tail(s) of the alternative hypothesis.

    Returns:
        The p-value, or ``None`` when ``testSide`` is not one of the
        accepted names (an error message is printed in that case).
    """
    testSide = testSide.capitalize()
    if testSide == 'Right':
        # Upper tail P(N >= Z).  sf(Z) is 1 - cdf(Z) computed without
        # the cancellation that loses precision far out in the tail.
        P = norm.sf(Z)
    elif testSide == 'Left':
        # Lower tail P(N <= Z).
        P = norm.cdf(Z)
    elif testSide == 'Double':
        # Both tails beyond |Z|.
        P = 2 * norm.sf(abs(Z))
    else:
        print(f'Error con el tipo de test, {testSide} no es válido.')
        print('El test debe ser Left, Right o Double.')
        return None

    print(P)
    # Returned unconditionally: a p-value of exactly 0.0 is a valid result.
    return P

def t_Test(HypMean, SAMPLE=None, Mean=None, Sigma=None, n=None):
    """Compute and print the one-sample t statistic.

    The statistic is ``(Mean - HypMean) / (Sigma / sqrt(n))``.  When a
    non-empty ``SAMPLE`` is given, its mean, standard deviation and size
    are taken from ``BasicInformation`` and override ``Mean``, ``Sigma``
    and ``n``.

    Args:
        HypMean: Mean under the null hypothesis.
        SAMPLE: Optional sized collection of observations.
        Mean: Sample mean, used when no sample is given.
        Sigma: Sample standard deviation, used when no sample is given.
        n: Sample size, used when no sample is given.

    Returns:
        The tuple ``(t, None)``; the second element is always ``None``.

    Raises:
        TypeError: If no non-empty sample is given and any of ``Mean``,
            ``Sigma`` or ``n`` is missing.
    """
    # Explicit None/length test: the truth value of a numpy array is
    # ambiguous, so a bare `if SAMPLE:` cannot be used for array input.
    if SAMPLE is not None and len(SAMPLE) > 0:
        sampleInfo = BasicInformation(SAMPLE, False)
        Mean, Sigma, n = sampleInfo[0], sampleInfo[2], sampleInfo[3]
    elif Mean is None or Sigma is None or n is None:
        raise TypeError(
            't_Test needs either a non-empty SAMPLE or all of Mean, Sigma and n'
        )

    # Named t_stat so the imported scipy.stats `t` is not shadowed.
    t_stat = (Mean - HypMean) / (Sigma / sqrt(n))
    print('T:', t_stat)
    return t_stat, None