# naive_bayes
import pandas as pd
import numpy as np
import math


# convert the class labels into 0/1 form (1 for the chosen class, 0 otherwise)
def actual_list(data_actual, d_class):
    dlist = []
    for x in range(len(data_actual)):
        if data_actual[x] == d_class:
            dlist.append(1)
        else:
            dlist.append(0)
    return dlist
# returns the training set separated according to class,
# together with the class probabilities (cp)
def separate_Class(dataset):
    separated = {}  # dataset separated by class
    cp = {}  # class probability
    for i in range(len(dataset)):
        vector = dataset[i]
        # add a new entry if the class has not been seen before
        if vector[-1] not in separated:
            separated[vector[-1]] = []
            cp[vector[-1]] = 0
        separated[vector[-1]].append(vector[0:-1])
        cp[vector[-1]] += 1
    for k in cp:
        cp[k] /= float(len(dataset))
    return separated, cp
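

# Illustration (assumed values, not from the original data): for a binary
# dataset of four rows whose last column holds the labels 1, 1, 0, 0,
# separate_Class would return
#   separated = {1: [row0_features, row1_features], 0: [row2_features, row3_features]}
#   cp = {1: 0.5, 0: 0.5}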
# calculate the mean and (sample, ddof=1) standard deviation of each feature, per class
def learn(separated):
    summary = {}
    for classValue, instances in separated.items():
        summary[classValue] = list(zip(
            np.mean(instances, axis=0), np.std(instances, axis=0, ddof=1)))
    return summary
# Gaussian probability density function
def Gaussian_probablity(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
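

# Sanity check with illustrative values (not taken from the original script):
# Gaussian_probablity(71.5, 73.0, 6.2) evaluates the N(mean=73, std=6.2) density
# at x = 71.5 and returns roughly 0.0625.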
# calculates the (unnormalised) class probabilities for a given feature vector:
# class prior multiplied by the per-feature Gaussian likelihoods
def class_probablity(vector, summary, cp):
    probablity = {}
    for classValue, classSummaries in summary.items():
        probablity[classValue] = cp[classValue]
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = vector[i]
            probablity[classValue] *= Gaussian_probablity(x, mean, stdev)
    return probablity
# predict the class for the given input attributes
def predict(summaries, inputVector, cp):
    probabilities = class_probablity(inputVector, summaries, cp)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
# get predictions for the test set
def getPredictions(summaries, testSet, cp):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i], cp)
        predictions.append(result)
    return predictions
# function to calculate the accuracy of the predictions
def accuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0
def k_fold_cross_validation(X, K, randomise=False):
    ac = 0
    if randomise:
        from random import shuffle
        X = list(X)
        shuffle(X)
    for k in range(K):
        training = [x for i, x in enumerate(X) if i % K != k]
        test = [x for i, x in enumerate(X) if i % K == k]
        # convert from list to matrix form
        training = np.vstack(training)
        test = np.vstack(test)
        separated, cp = separate_Class(training)  # cp is the class probability
        summaries = learn(separated)
        predictions = getPredictions(summaries, test, cp)
        ac += accuracy(test, predictions)
    # average accuracy over all the folds
    f_accuracy = float(ac) / float(K)
    return f_accuracy
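

# Illustration (assumed example, not from the original data): with K = 3 and a
# 6-row dataset, fold k = 0 tests on rows 0 and 3 (those with i % 3 == 0) and
# trains on rows 1, 2, 4 and 5; across the K folds every row is tested exactly once.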
def naive_bayes(df, d_class):
    folds = 10
    m, n = df.shape
    # last column holds the class labels (df.ix has been removed from pandas)
    data_actual = df.iloc[:, n - 1].tolist()
    actual_data = actual_list(data_actual, d_class)
    # feature matrix (df.as_matrix has been removed from pandas)
    features = df.iloc[:, 0:n - 1].to_numpy()
    X = np.c_[features, actual_data]
    avg_accuracy = k_fold_cross_validation(X, folds, False)
    return avg_accuracy
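

# Hypothetical quick check (not part of the original paste): builds a small
# synthetic DataFrame so naive_bayes() can be exercised without the CSV file
# referenced below. Call _demo() manually; the two classes are well separated,
# so the reported cross-validation accuracy should be high.
def _demo():
    demo_df = pd.DataFrame({
        'f1': [1.0, 1.2, 0.9, 1.1, 0.8, 1.3, 5.0, 5.2, 4.8, 5.1, 4.9, 5.3],
        'f2': [2.0, 2.2, 1.9, 2.1, 1.8, 2.3, 7.0, 7.1, 6.8, 7.2, 6.9, 7.3],
        'label': ['pos'] * 6 + ['neg'] * 6,
    })
    # 'pos' is treated as the positive class and mapped to 1 by actual_list()
    return naive_bayes(demo_df, 'pos')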
if __name__ == "__main__":
    # SET VARIABLES #################
    # read and load the data file (raw string so the backslashes are kept literally)
    filename = r'C:\Major\DMDW\DMDW\Sarcasm Code\GA-NB\example.csv'
    # filename = 'Glass_New.csv'
    d_class = 1
    df = pd.read_csv(filename)
    # call the naive bayes function
    final_accuracy = naive_bayes(df, d_class)
    print("accuracy= ", final_accuracy)