#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import math

# Initialize values and load data
data_path = r'F:\AIUB\DM\Project Supervised\glass_data.csv'
data = pd.read_csv(data_path, index_col='ID')

features = ['RI', 'NA2O', 'MGO', 'AL2O3', 'SIO2', 'K2O', 'CAO', 'BAO', 'FE2O3']
oxyde_columns = features[1:]
class_col = 'TYPE'

# Keys:
# FP  = Float Processed
# NFP = Non Float Processed
labels = ['Building Windows FP', 'Building Windows NFP', 'Vehicle Windows FP',
          'Vehicle Windows NFP', 'Containers', 'Tableware', 'Headlamps']
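# A quick look at what was just loaded (an optional sketch; it assumes the CSV
# really contains the columns listed in `features` plus the TYPE column).
print(data.shape)                        # number of instances and columns
print(data[features].describe())         # value ranges of the numeric attributes
print(data[class_col].value_counts())    # class balance (still numeric keys at this point)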
# In[2]:

# Replace the TYPE column values from foreign-key integers to their actual names
def getLabelNameFromReference(ref):
    global labels
    if ref < 1 or ref > 7:
        raise ValueError('Invalid reference number for Glass Type column: {}'.format(ref))
    return labels[ref - 1]

data['TYPE'] = data['TYPE'].map(getLabelNameFromReference)
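# Sanity check of the mapping (a sketch): every TYPE value should now be one of
# the label strings, and the class distribution is worth inspecting in case
# some classes are rare.
assert set(data[class_col].unique()) <= set(labels)
print(data[class_col].value_counts())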
# In[3]:

# All instances have very similar values for Refractive Index (RI).
# Scaling the column by 1000 spreads these values apart so that RI
# contributes noticeably to the distance metric used by the classifier.
data['RI'] *= 1000
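# An alternative to the manual x1000 trick (a sketch only, not used by the
# cells below): standardize every feature so each column contributes on a
# comparable scale to the Manhattan distance.
from sklearn.preprocessing import StandardScaler
scaled_features = pd.DataFrame(StandardScaler().fit_transform(data[features]),
                               columns=features, index=data.index)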
# In[4]:

# Train the classifier
from sklearn.neighbors import KNeighborsClassifier

k_neighbours = 3
predictor = KNeighborsClassifier(n_neighbors=k_neighbours, metric='manhattan', weights='distance')
# predictor.fit(data[features], data[class_col])

# In[5]:

# Make predictions
# predictor.predict(data[features][:3])
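# A minimal fit/predict round trip (a sketch; the commented-out lines above hint
# at the same flow). cross_val_score below fits its own clones of the estimator,
# so this fit only matters for cells that call predictor.predict() directly.
predictor.fit(data[features], data[class_col])
print(predictor.predict(data[features][:3]))   # predicted labels for the first 3 rows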
# ## Classifier Evaluation:
#

# In[6]:

# Do k-fold cross validation several times, reshuffling the data before each run
from sklearn.model_selection import cross_val_score

k = 8
repeat = 5
print('{} fold cross validation:'.format(k))

grand_sum = 0
for _ in range(repeat):
    data = data.sample(frac=1).reset_index(drop=True)
    scores = cross_val_score(predictor, data[features], data[class_col], scoring='accuracy', cv=k, n_jobs=-1)
    grand_sum += sum(scores)
    print('Results: -----------------')
    print('Scores:', ', '.join([str(round(x, 3)) for x in scores]))
    print('Average:', round(np.average(scores), 4))
    print()

grand_avg_accuracy = grand_sum / (k * repeat)
print('Grand Average:', round(grand_avg_accuracy, 3))
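# The same repeated evaluation can also be written with a repeated, stratified
# splitter instead of reshuffling by hand (a sketch; it assumes every class has
# at least k instances so stratified folds can be formed).
from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=k, n_repeats=repeat, random_state=0)
rskf_scores = cross_val_score(predictor, data[features], data[class_col],
                              scoring='accuracy', cv=rskf, n_jobs=-1)
print('Stratified grand average:', round(np.average(rskf_scores), 3))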
# In[7]:

# Helper functions
def getConfidenceInterval(accuracy):
    """ accuracy = 0-1 """
    zcl = 1.64  # z value for a 90% confidence level
    std_error = math.sqrt((accuracy * (1 - accuracy)) / len(data))
    return accuracy - std_error * zcl, accuracy + std_error * zcl

lower, upper = getConfidenceInterval(grand_avg_accuracy)
lower = round(lower, 3)
upper = round(upper, 3)
print('Predictive accuracy lies within the range: [{}, {}] (90% confidence)'.format(lower, upper))
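# Quick illustration of the helper (it implements the normal-approximation
# interval: accuracy +/- z * sqrt(accuracy * (1 - accuracy) / n), with n = len(data)).
example_lower, example_upper = getConfidenceInterval(0.8)
print('Example: accuracy 0.80 -> [{}, {}]'.format(round(example_lower, 3), round(example_upper, 3)))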
# In[8]:

# Construct a confusion matrix using the entire data set
# (left commented out; it requires the predictor to be fitted first)
# from sklearn.metrics import confusion_matrix
# confusion_matrix = confusion_matrix(data[class_col], predictor.predict(data[features]), labels=labels)
# confusion_matrix = pd.DataFrame(confusion_matrix, columns=labels)
# confusion_matrix.insert(0, '', labels)
# confusion_matrix
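# A runnable version of the commented-out cell above (a sketch; it assumes the
# predictor has already been fitted on the full data set). Using the labels for
# both the index and the columns avoids the insert() trick and the name clash
# with sklearn's confusion_matrix function.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(data[class_col], predictor.predict(data[features]), labels=labels)
cm_table = pd.DataFrame(cm, index=labels, columns=labels)
print(cm_table)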
# In[ ]: