fake_world

ml5

Dec 3rd, 2020
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.64 KB | None | 0 0
  1. import math
  2. import statistics as st
  3. from sklearn.model_selection import train_test_split
  4. import pandas as pd
  5.  
  6. def summarizeByClass(x_tr,y_tr):    
  7.     separated = {}  # Create a dictionary with  labels as keys 1 and 0  
  8.     for i in range(len(x_train)):
  9.         x, y = x_tr[i],y_tr[i]
  10.         if (y not in separated):
  11.             separated[y] = []
  12.         separated[y].append(x)    
  13.    
  14.     summary = {} # to store mean and std of +ve and -ve instances
  15.     for lbl, subset in separated.items():    
  16.         summary[lbl] = [ (st.mean(attribute), st.stdev(attribute))
  17.                          for attribute in zip(*subset)];  #zip(*res) transposes a matrix (2-d array/list)
  18.  
  19.     return summary
  20.  
  21. #For continuous attributes p is estimated using Gaussion distribution
  22. def estimateProbability(x, mean, stdev):
  23.     exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
  24.     return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
  25.  
  26. def predict(summaries, testVector):
  27.     bestLabel, bestProb = None, -1
  28.     p = {}
  29.     for lbl, mean_std in summaries.items():
  30.         #class and attribute information as mean and sd
  31.         p[lbl] = 1
  32.         for i in range(len(mean_std)):
  33.             mean, stdev = mean_std[i]
  34.             x = testVector[i]
  35.             p[lbl] *= estimateProbability(x, mean, stdev);
  36.            
  37.         #assigns that class which has he highest prob
  38.         if bestLabel is None or p[lbl] > bestProb:
  39.             bestProb = p[lbl]
  40.             bestLabel = lbl
  41.            
  42.     return bestLabel
  43.  
  44. def do_classification_compute_accuracy(summaries, test_x, test_y):
  45.     correct = 0
  46.     for i in range(len(test_x)):
  47.         result = predict(summaries, test_x[i])
  48.         if result == test_y[i]:
  49.             correct = correct + 1
  50.  
  51.     accuracy = (correct/float(len(test_x))) * 100.0
  52.     return accuracy
  53.  
  54. # Main program
  55. df=pd.read_csv('data5.csv',header=None)
  56. cols = [0,1,2,3,4,5,6,7]
  57. df_x = df[df.columns[cols]]
  58. df_y = df[df.columns[8]]
  59.  
  60. X = df_x.values.tolist()
  61. Y = df_y.values.tolist()
  62.  
  63. x_train, x_test, y_train, y_test = train_test_split(X,Y)
  64.  
  65. print('Dataset loaded...')
  66. print('Total instances available :',len(X))
  67. print('Total attributes present  :',len(X[0])-1)
  68. print("First Five instances of dataset:")
  69. for i in range(5):
  70.     print(i+1 , ':' , X[i])
  71.        
  72. print('\nDataset is split into training and testing set.')
  73. print('Training examples = {0} \nTesting examples  = {1}'.format
  74.       (len(x_train), len(x_test)))
  75.  
  76. summaries = summarizeByClass(x_train,y_train);
  77. accuracy = do_classification_compute_accuracy(summaries,x_test,y_test)
  78.  
  79. print('\nAccuracy of the Naive Baysian Classifier is :', accuracy)
Add Comment
Please, Sign In to add comment