Guest User

Untitled

a guest
Mar 19th, 2018
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.18 KB | None | 0 0
  1. #naive_bayes
  2. import pandas as pd
  3. import numpy as np
  4. import math
  5.  
  6.  
  7. # convert class in the form of 0 and 1
  8. def actual_list(data_actual, d_class):
  9. dlist = []
  10.  
  11. for x in range(len(data_actual)):
  12. if(data_actual[x] == d_class):
  13. dlist.append(1)
  14. else:
  15. dlist.append(0)
  16.  
  17. return dlist
  18.  
  19.  
  20. # returns training set separated according to class
  21. # and class probabilities (cp)
  22. def separate_Class(dataset):
  23. separated = {} # dataset separated by class
  24. cp = {} # class probablity
  25. for i in range(len(dataset)):
  26. vector = dataset[i]
  27.  
  28. # add new entry if class not previously present
  29. if (vector[-1] not in separated):
  30. separated[vector[-1]] = []
  31. cp[vector[-1]] = 0
  32.  
  33. separated[vector[-1]].append(vector[0:-1])
  34. cp[vector[-1]] += 1
  35.  
  36. for k in cp:
  37. cp[k] /= float(len(dataset))
  38. return separated, cp
  39.  
  40.  
  41. # calculate mean and standard deviation
  42. def learn(separated):
  43. summary = {}
  44. for classValue, instances in separated.items():
  45. summary[classValue] = list(zip(
  46. np.mean(instances, axis=0), np.std(instances, axis=0, ddof=1)))
  47. return summary
  48.  
  49.  
  50. # Gaussian probability density function
  51. def Gaussian_probablity(x, mean, stdev):
  52. exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
  53. return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
  54.  
  55.  
  56. # calculates class probabilities for given feature vector
  57. def class_probablity(vector, summary, cp):
  58. probablity = {}
  59. for classValue, classSummaries in summary.items():
  60. probablity[classValue] = cp[classValue]
  61. for i in range(len(classSummaries)):
  62. mean, stdev = classSummaries[i]
  63. x = vector[i]
  64. probablity[classValue] *= Gaussian_probablity(x, mean, stdev)
  65. return probablity
  66.  
  67.  
  68. # predict the class for given input attributes
  69. def predict(summaries, inputVector, cp):
  70. probabilities = class_probablity(inputVector, summaries, cp)
  71. bestLabel, bestProb = None, -1
  72. for classValue, probability in probabilities.items():
  73. if bestLabel is None or probability > bestProb:
  74. bestProb = probability
  75. bestLabel = classValue
  76. return bestLabel
  77.  
  78.  
  79. # get predictions for test set
  80. def getPredictions(summaries, testSet, cp):
  81. predictions = []
  82. for i in range(len(testSet)):
  83. result = predict(summaries, testSet[i], cp)
  84. predictions.append(result)
  85. return predictions
  86.  
  87.  
  88. # function to calculate accuracy of the prediction
  89. def accuracy(testSet, predictions):
  90. correct = 0
  91. for x in range(len(testSet)):
  92. if testSet[x][-1] == predictions[x]:
  93. correct += 1
  94. return (correct / float(len(testSet))) * 100.0
  95.  
  96.  
  97. def k_fold_cross_validation(X, K, randomise=False):
  98. ac = 0
  99.  
  100. if randomise:
  101. from random import shuffle
  102. X = list(X)
  103. shuffle(X)
  104. for k in range(K):
  105. training = [x for i, x in enumerate(X) if i % K != k]
  106. test = [x for i, x in enumerate(X) if i % K == k]
  107. # convert from list to matricx form
  108. training = np.vstack(training)
  109. test = np.vstack(test)
  110. # print(validation)
  111. separated, cp = separate_Class(training) # cp is class probablity
  112. summaries = learn(separated)
  113. # print(summaries)
  114. predictions = getPredictions(summaries, test, cp)
  115. ac += accuracy(test, predictions)
  116. # average accuracy of all the folds
  117. f_accuracy = float(ac) / float(K)
  118. # print(f_accuracy)
  119. return f_accuracy
  120.  
  121.  
  122. def naive_bayes(df, d_class):
  123. folds = 10
  124. m, n = df.shape
  125. data_actual = df.ix[:, n - 1]
  126. actual_data = actual_list(data_actual, d_class)
  127. # feature matrix
  128. features = df.as_matrix(columns=df.columns[0:n - 1])
  129. # print(features)
  130. X = np.c_[features, actual_data]
  131.  
  132. accuracy = k_fold_cross_validation(X, folds, False)
  133. # print(accuracy)
  134. return accuracy
  135.  
  136.  
  137. if __name__ == "__main__":
  138. # SET VARIABLES#################
  139. # read and load file
  140. filename = 'C:\Major\DMDW\DMDW\Sarcasm Code\GA-NB\example.csv'
  141. # filename = 'Glass_New.csv'
  142. d_class = 1
  143. df = pd.read_csv(filename)
  144. # call naive bayes function
  145. final_accuracy = naive_bayes(df, d_class)
  146. print("accuracy= ", final_accuracy)
Add Comment
Please, Sign In to add comment