Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import statsmodels.api as sm
- from scipy import stats
- stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
- import pandas as pd
- from matplotlib import pyplot as plt
- import seaborn as sns
- import numpy as np
- from matplotlib import rcParams
- import scipy.stats as stats
- import pylab
- from sklearn.linear_model import LogisticRegression
- from sklearn.linear_model import LinearRegression
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.svm import SVC
- from sklearn.metrics import confusion_matrix
- from sklearn.metrics import roc_curve
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- from sklearn.metrics import confusion_matrix
- import math
- import re
- rcParams['font.family'] = 'sans-serif'
- rcParams['font.sans-serif'] = ['Roboto']
- rcParams['font.size'] = 20
- from matplotlib.colors import ListedColormap
- from sklearn import datasets
- from random import randint
- from nltk.corpus import stopwords
- col = ['#ef4631', '#10b9ce', '#2292ec', '#ff9138', '#3f50b0', '#f7bb09']
- cols_light = ['#f47f71', '#72dde9','#66b4f2','#ffb478','#7a86c8','#f9cf55']
def plot_scatter_2d(df, x1, x2, y, ax=None):
    """
    Description: Plots a scatter plot of two independent variables, with a
    different color per class of the dependent variable.
    IMPORTANT: Works for only 2 independent variables, given 1 dependent.
    Parameters:
        df (n x m): dataframe containing the variables of interest
        x1 (string): name of the x-axis variable
        x2 (string): name of the y-axis variable
        y (string): name of the column holding the class labels
        ax (matplotlib Axes, optional): axes to draw on. When omitted, a new
            10x8 figure is created, given a title, legend and axis labels,
            and shown. When supplied, only the points are drawn and the
            caller remains responsible for decoration and display.
    Returns:
        None. The scatter plot is drawn as a side effect.
    """
    # Record ownership *before* ax is reassigned: re-testing ax after the
    # figure has been created is always False and would skip the decoration.
    owns_figure = ax is None
    if owns_figure:
        _, ax = plt.subplots(1, figsize=(10, 8))

    for idx, cl in enumerate(np.unique(df[y])):
        # Filter once per class instead of once per plotted column.
        members = df[df[y] == cl]
        ax.scatter(x=members[x1],
                   y=members[x2],
                   label=cl,
                   # Wrap around the palette so having more classes than
                   # colors reuses colors instead of raising IndexError.
                   c=col[idx % len(col)],
                   s=100)

    if owns_figure:
        ax.set_title(x1 + ' vs ' + x2)
        ax.legend()
        ax.set_xlabel(x1)
        ax.set_ylabel(x2)
        plt.show()
def _centroid_xy(mean, feature1, feature2):
    """Return the (x, y) position of a centroid in the feature1/feature2 plane."""
    if isinstance(mean, pd.Series):
        # Means returned by plot_assignments are indexed by column name, so
        # look the two plotted features up by name when they are present.
        if feature1 in mean.index and feature2 in mean.index:
            return mean[feature1], mean[feature2]
        return mean.iloc[0], mean.iloc[1]
    # Plain sequences (tuple, list, ndarray) are read positionally.
    return mean[0], mean[1]


def plot_assignments(df, mean1, mean2, mean3, feature1, feature2, wLabel = False):
    """
    Function to plot the movement of the centroids as well as the point assignments per iteration.
    Note that this is just to show the method, and does not really need to be done by you.

    Every row of df is assigned to the nearest of the three centroids
    (Euclidean distance in the feature1/feature2 plane), the assignment is
    plotted, and the per-cluster column means are printed and returned.

    Parameters
    -------------------------
    df       : The dataframe containing your data. When wLabel is truthy it
               must also contain a 'species' column.
    mean1    : centroid 1, either an (x, y) sequence or a Series as returned
               by a previous call
    mean2    : centroid 2, same format as mean1
    mean3    : centroid 3, same format as mean1
    feature1 : name of the column plotted on the x-axis
    feature2 : name of the column plotted on the y-axis
    wLabel   : Option to show the labels or not. When truthy, a second panel
               with the points colored by 'species' is drawn next to the
               clustering result.

    Returns
    -------------------------
    (new_mean1, new_mean2, new_mean3) : for each cluster 0, 1, 2 a Series with
    the mean of every numeric column (including 'Cluster') over the rows
    assigned to that cluster. An empty cluster yields NaN means.
    """
    col_ex = ['#ef4631', '#2292ec', '#3f50b0']
    labelling = ['setosa', 'versicolor', 'virginica']

    centroids = [_centroid_xy(m, feature1, feature2)
                 for m in (mean1, mean2, mean3)]

    temp_df = df.copy()

    # Vectorized nearest-centroid assignment. Working on the underlying array
    # keeps this independent of the dataframe's index labels.
    points = temp_df[[feature1, feature2]].to_numpy(dtype=float)
    centers = np.asarray(centroids, dtype=float)
    distances = np.linalg.norm(
        points[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)
    temp_df['Cluster'] = distances.argmin(axis=1)

    if wLabel:
        _, (cluster_ax, actual_ax) = plt.subplots(1, 2, figsize=(20, 10))
    else:
        _, cluster_ax = plt.subplots(1, figsize=(8, 6))
        actual_ax = None

    # Color by cluster id (not enumeration position) so the points keep the
    # color of their centroid even when some cluster received no points.
    for cl in np.unique(temp_df['Cluster']):
        members = temp_df[temp_df['Cluster'] == cl]
        cluster_ax.scatter(x=members[feature1],
                           y=members[feature2],
                           c=col_ex[cl],
                           s=100, alpha=0.1)
    for color, (cx, cy) in zip(col_ex, centroids):
        cluster_ax.scatter(x=cx, y=cy, color=color, s=200)
    cluster_ax.set_xlabel(feature1)
    cluster_ax.set_ylabel(feature2)

    if actual_ax is not None:
        cluster_ax.set_title('Clustering Result')
        actual_ax.set_title('Actual Data')
        actual_ax.set_xlabel(feature1)
        actual_ax.set_ylabel(feature2)
        for idx, cl in enumerate(np.unique(temp_df['species'])):
            members = temp_df[temp_df['species'] == cl]
            actual_ax.scatter(x=members[feature1],
                              y=members[feature2],
                              c=col_ex[idx],
                              label=labelling[idx],
                              s=100)
        actual_ax.legend()

    plt.show()

    # numeric_only=True skips text columns such as 'species', which would
    # otherwise make mean() raise on recent pandas versions.
    new_means = tuple(
        temp_df[temp_df['Cluster'] == k].mean(axis=0, numeric_only=True)
        for k in range(3)
    )
    print(*new_means)
    return new_means
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement