#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import math

# Initialize values and load data
data_path = r'F:\AIUB\DM\Project Supervised\glass_data.csv'
data = pd.read_csv(data_path, index_col='ID')

features = ['RI', 'NA2O', 'MGO', 'AL2O3', 'SIO2', 'K2O', 'CAO', 'BAO', 'FE2O3']
oxyde_columns = features[1:]
class_col = 'TYPE'

# Keys:
# FP  = Float Processed
# NFP = Non Float Processed
labels = ['Building Windows FP', 'Building Windows NFP', 'Vehicle Windows FP',
          'Vehicle Windows NFP', 'Containers', 'Tableware', 'Headlamps']
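# A quick look at what was just loaded (an optional sketch; it assumes the CSV
# really contains the columns listed in `features` plus the TYPE column).
print(data.shape)                        # number of instances and columns
print(data[features].describe())         # value ranges of the numeric attributes
print(data[class_col].value_counts())    # class balance (still numeric keys at this point)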
# In[2]:

# Replace the TYPE column values from foreign-key integers to their actual names
def getLabelNameFromReference(ref):
    global labels
    if ref < 1 or ref > 7:
        raise ValueError('Invalid reference number for Glass Type column: {}'.format(ref))
    return labels[ref - 1]

data['TYPE'] = data['TYPE'].map(getLabelNameFromReference)
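# Sanity check of the mapping (a sketch): every TYPE value should now be one of
# the label strings, and the class distribution is worth inspecting in case
# some classes are rare.
assert set(data[class_col].unique()) <= set(labels)
print(data[class_col].value_counts())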
# In[3]:

# All instances have very similar values for Refractive Index (RI).
# Scaling the column by 1000 spreads these values apart so that RI
# contributes noticeably to the distance metric used by the classifier.
data['RI'] *= 1000
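# An alternative to the manual x1000 trick (a sketch only, not used by the
# cells below): standardize every feature so each column contributes on a
# comparable scale to the Manhattan distance.
from sklearn.preprocessing import StandardScaler
scaled_features = pd.DataFrame(StandardScaler().fit_transform(data[features]),
                               columns=features, index=data.index)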
# In[4]:

# Train the classifier
from sklearn.neighbors import KNeighborsClassifier

k_neighbours = 3
predictor = KNeighborsClassifier(n_neighbors=k_neighbours, metric='manhattan', weights='distance')
# predictor.fit(data[features], data[class_col])

# In[5]:

# Make predictions
# predictor.predict(data[features][:3])
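# A minimal fit/predict round trip (a sketch; the commented-out lines above hint
# at the same flow). cross_val_score below fits its own clones of the estimator,
# so this fit only matters for cells that call predictor.predict() directly.
predictor.fit(data[features], data[class_col])
print(predictor.predict(data[features][:3]))   # predicted labels for the first 3 rows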
# ## Classifier Evaluation:
#

# In[6]:

# Do k-fold cross validation several times, reshuffling the data before each run
from sklearn.model_selection import cross_val_score

k = 8
repeat = 5
print('{} fold cross validation:'.format(k))

grand_sum = 0
for _ in range(repeat):
    data = data.sample(frac=1).reset_index(drop=True)
    scores = cross_val_score(predictor, data[features], data[class_col], scoring='accuracy', cv=k, n_jobs=-1)
    grand_sum += sum(scores)
    print('Results: -----------------')
    print('Scores:', ', '.join([str(round(x, 3)) for x in scores]))
    print('Average:', round(np.average(scores), 4))
    print()

grand_avg_accuracy = grand_sum / (k * repeat)
print('Grand Average:', round(grand_avg_accuracy, 3))
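# The same repeated evaluation can also be written with a repeated, stratified
# splitter instead of reshuffling by hand (a sketch; it assumes every class has
# at least k instances so stratified folds can be formed).
from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=k, n_repeats=repeat, random_state=0)
rskf_scores = cross_val_score(predictor, data[features], data[class_col],
                              scoring='accuracy', cv=rskf, n_jobs=-1)
print('Stratified grand average:', round(np.average(rskf_scores), 3))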
# In[7]:

# Helper functions
def getConfidenceInterval(accuracy):
    """ accuracy = 0-1 """
    zcl = 1.64  # z value for a 90% confidence level
    std_error = math.sqrt((accuracy * (1 - accuracy)) / len(data))
    return accuracy - std_error * zcl, accuracy + std_error * zcl

lower, upper = getConfidenceInterval(grand_avg_accuracy)
lower = round(lower, 3)
upper = round(upper, 3)
print('Predictive accuracy lies within the range: [{}, {}] (90% confidence)'.format(lower, upper))
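# Quick illustration of the helper (it implements the normal-approximation
# interval: accuracy +/- z * sqrt(accuracy * (1 - accuracy) / n), with n = len(data)).
example_lower, example_upper = getConfidenceInterval(0.8)
print('Example: accuracy 0.80 -> [{}, {}]'.format(round(example_lower, 3), round(example_upper, 3)))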
# In[8]:

# Construct a confusion matrix using the entire data set
# (left commented out; it requires the predictor to be fitted first)
# from sklearn.metrics import confusion_matrix
# confusion_matrix = confusion_matrix(data[class_col], predictor.predict(data[features]), labels=labels)
# confusion_matrix = pd.DataFrame(confusion_matrix, columns=labels)
# confusion_matrix.insert(0, '', labels)
# confusion_matrix
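# A runnable version of the commented-out cell above (a sketch; it assumes the
# predictor has already been fitted on the full data set). Using the labels for
# both the index and the columns avoids the insert() trick and the name clash
# with sklearn's confusion_matrix function.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(data[class_col], predictor.predict(data[features]), labels=labels)
cm_table = pd.DataFrame(cm, index=labels, columns=labels)
print(cm_table)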
# In[ ]: