# Untitled

import statsmodels.api as sm
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as stats
import pylab
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import math
import re
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Roboto']
rcParams['font.size'] = 20
from matplotlib.colors import ListedColormap
from sklearn import datasets
from random import randint
from nltk.corpus import stopwords

# Base palette shared by the plotting helpers (one colour per class).
col = [
    '#ef4631',
    '#10b9ce',
    '#2292ec',
    '#ff9138',
    '#3f50b0',
    '#f7bb09',
]
# Second palette with the same number of entries as `col`.
cols_light = [
    '#f47f71',
    '#72dde9',
    '#66b4f2',
    '#ffb478',
    '#7a86c8',
    '#f9cf55',
]

def plot_scatter_2d(df, x1, x2, y, ax=None):
    """
    Description: Plots a scatter plot of two independent variables, with a different color per class
    IMPORTANT: Works for only 2 independent variables, given 1 dependent

    Parameters:
        df(n x m): dataframe containing variables of interest
        x1 (string): name of x-axis variable
        x2 (string): name of y-axis variable
        y (string): name of labels
        ax (matplotlib Axes, optional): axes to draw on. When omitted, a new
            10x8 figure is created, decorated (title, legend, axis labels)
            and shown. When given, only the points are drawn and any
            decoration is left to the caller.

    Returns:
        None. The points are drawn on `ax` (or on the newly created figure).
    """
    # Remember whether we created the figure: `ax` is rebound just below, so
    # testing it again after plotting could never detect "no axes passed".
    owns_figure = ax is None
    if owns_figure:
        _, ax = plt.subplots(1, figsize=(10, 8))

    for idx, cl in enumerate(np.unique(df[y])):
        members = df[df[y] == cl]
        ax.scatter(x=members[x1],
                   y=members[x2],
                   label=cl,
                   # Cycle through the palette so that having more classes
                   # than colours does not raise IndexError.
                   color=col[idx % len(col)],
                   s=100)

    if owns_figure:
        ax.set_title(f'{x1} vs {x2}')
        ax.legend()
        ax.set_xlabel(x1)
        ax.set_ylabel(x2)
        plt.show()

def _centroid_xy(mean, feature1, feature2):
    """Return the (x, y) plotting coordinates of one centroid as floats."""
    # The means returned by plot_assignments are pandas Series indexed by
    # column name. Looking the coordinates up by name lets them be fed back
    # in regardless of column order; anything else is read as an (x, y) pair.
    if (isinstance(mean, pd.Series)
            and feature1 in mean.index
            and feature2 in mean.index):
        return float(mean[feature1]), float(mean[feature2])
    return float(mean[0]), float(mean[1])


def plot_assignments(df, mean1, mean2, mean3, feature1, feature2, wLabel = False):

    """
    Function to plot the movement of the centroids as well as the point assignments per iteration.
    Note that this is just to show the method, and does not really need to be done by you.

    Every row of `df` is assigned to the nearest of the three centroids
    (Euclidean distance in the feature1/feature2 plane) and the assignment is
    plotted together with the centroids.

    Parameters
    -------------------------
    df : The dataframe containing your data
    mean1 : centroid 1, either an (x, y) pair or a Series indexed by column
        name (as returned by this function)
    mean2 : centroid 2, same format as mean1
    mean3 : centroid 3, same format as mean1
    feature1 : name of the column plotted on the x-axis
    feature2 : name of the column plotted on the y-axis
    wLabel : Option to show the labels or not. When true, `df` must contain a
        'species' column, which is plotted next to the clustering result.

    Returns
    -------------------------
    None when wLabel is false. Otherwise a tuple with the per-cluster column
    means (one Series for each of clusters 0, 1 and 2), which is also printed.

    Raises
    -------------------------
    KeyError : if wLabel is true and `df` has no 'species' column.
    """
    col_ex = ['#ef4631', '#2292ec', '#3f50b0']
    labelling = ['setosa', 'versicolor', 'virginica']
    temp_df = df.copy()

    centroids = np.array([_centroid_xy(m, feature1, feature2)
                          for m in (mean1, mean2, mean3)])

    # Work on positional arrays so that the assignment does not depend on the
    # dataframe having a 0..n-1 index.
    points = temp_df[[feature1, feature2]].to_numpy(dtype=float)
    distances = np.linalg.norm(
        points[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2)
    temp_df['Cluster'] = distances.argmin(axis=1)

    def draw_clusters(axis):
        # Colour by cluster id (not by enumeration order) so that the points
        # keep matching their centroid's colour even when a cluster is empty.
        for cl in np.unique(temp_df['Cluster']):
            members = temp_df[temp_df['Cluster'] == cl]
            axis.scatter(x=members[feature1],
                         y=members[feature2],
                         color=col_ex[cl],
                         s=100, alpha=0.1)
        for (cx, cy), colour in zip(centroids, col_ex):
            axis.scatter(x=cx, y=cy, color=colour, s=200)

    if not wLabel:
        _, ax = plt.subplots(1, figsize=(8, 6))
        draw_clusters(ax)
        ax.set_xlabel(feature1)
        ax.set_ylabel(feature2)
        plt.show()
        return None

    # Fail before building the figure if the reference labels are missing.
    if 'species' not in temp_df.columns:
        raise KeyError("wLabel=True requires a 'species' column in df")

    _, (ax_clusters, ax_actual) = plt.subplots(1, 2, figsize=(20, 10))
    draw_clusters(ax_clusters)

    # The colours and legend names below assume at most three species, listed
    # in the sorted order produced by np.unique.
    for idx, cl in enumerate(np.unique(temp_df['species'])):
        members = temp_df[temp_df['species'] == cl]
        ax_actual.scatter(x=members[feature1],
                          y=members[feature2],
                          color=col_ex[idx],
                          label=labelling[idx],
                          s=100)

    ax_clusters.set_title('Clustering Result')
    ax_actual.set_title('Actual Data')
    for axis in (ax_clusters, ax_actual):
        axis.set_xlabel(feature1)
        axis.set_ylabel(feature2)

    ax_actual.legend()
    plt.show()

    # numeric_only=True skips the string 'species' column, which would
    # otherwise make DataFrame.mean raise TypeError on pandas >= 2.
    new_means = tuple(
        temp_df[temp_df['Cluster'] == k].mean(axis=0, numeric_only=True)
        for k in range(3))
    print(*new_means)
    return new_means