Advertisement
lamaulfarid

NaiveBayesClassifier

May 31st, 2021
896
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.41 KB | None | 0 0
  1. # Bayes
  2. import statistics
  3. from csv import reader
  4. from random import seed
  5. from random import randrange
  6. from math import sqrt
  7. from math import exp
  8. from math import pi
  9.  
  10.  
  11. # Load a .csv file [kanan] [kiri]
  12. def load_csv(filename):
  13.     dataset = list()
  14.     with open(filename, 'r') as file:
  15.         csv_reader = reader(file)
  16.         for row in csv_reader:
  17.             if not row:
  18.                 continue
  19.             dataset.append(row)
  20.     return dataset
  21.  
  22.  
  23. # Convert string column to float [kanan] [kiri]
  24. def str_column_to_float(dataset, column):
  25.     for row in dataset:
  26.         row[column] = float(row[column].strip())
  27.  
  28.  
  29. # Convert string column to integer [kanan] [kiri]
  30. def str_column_to_int(dataset, column):
  31.     class_values = [row[column] for row in dataset]
  32.     unique = set(class_values)
  33.     lookup = dict()
  34.     for i, value in enumerate(unique):
  35.         lookup[value] = i
  36.         print('[%s] => %d' % (value, i))
  37.     for row in dataset:
  38.         row[column] = lookup[row[column]]
  39.     return lookup
  40.  
  41.  
  42. # Split a dataset into k folds [kiri]
  43. def cross_validation_split(dataset, n_folds):
  44.     dataset_split = list()
  45.     dataset_copy = list(dataset)
  46.     fold_size = int(len(dataset) / n_folds)
  47.     for _ in range(n_folds):
  48.         fold = list()
  49.         while len(fold) < fold_size:
  50.             index = randrange(len(dataset_copy))
  51.             fold.append(dataset_copy.pop(index))
  52.         dataset_split.append(fold)
  53.     return dataset_split
  54.  
  55.  
  56. # Split the dataset by class values, returns a dictionary [kanan]
  57. def separate_by_class(dataset):
  58.     separated = dict()
  59.     for i in range(len(dataset)):
  60.         vector = dataset[i]
  61.         class_value = vector[-1]
  62.         if (class_value not in separated):
  63.             separated[class_value] = list()
  64.         separated[class_value].append(vector)
  65.     return separated
  66.  
  67.  
  68. # Calculate accuracy percentage [kiri]
  69. def accuracy_metric(actual, predicted):
  70.     correct = 0
  71.     for i in range(len(actual)):
  72.         if actual[i] == predicted[i]:
  73.             correct += 1
  74.     return correct / float(len(actual)) * 100.0
  75.  
  76.  
  77. # Evaluate an algorithm using a cross validation split [kiri]
  78. def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  79.     folds = cross_validation_split(dataset, n_folds)
  80.     scores = list()
  81.     for fold in folds:
  82.         train_set = list(folds)
  83.         train_set.remove(fold)
  84.         train_set = sum(train_set, [])
  85.         test_set = list()
  86.         for row in fold:
  87.             row_copy = list(row)
  88.             test_set.append(row_copy)
  89.             row_copy[-1] = None
  90.         predicted = algorithm(train_set, test_set, *args)
  91.         actual = [row[-1] for row in fold]
  92.         accuracy = accuracy_metric(actual, predicted)
  93.         scores.append(accuracy)
  94.     return scores
  95.  
  96.  
  97. # Calculate the mean of a list of numbers [kanan][kiri]
  98. def mean(numbers):
  99.     return sum(numbers) / float(len(numbers))
  100.  
  101.  
  102. # Calculate the standard deviation of a list of numbers [kanan][kiri]
  103. def stdev(numbers):
  104.     avg = mean(numbers)
  105.     variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
  106.     return sqrt(variance)
  107.  
  108.  
  109. # Calculate the mean, stdev and count for each column in a dataset [kanan][kiri]
  110. def summarize_dataset(dataset):
  111.     summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
  112.     del (summaries[-1])
  113.     return summaries
  114.  
  115.  
  116. # Split dataset by class then calculate statistics for each column of each class [kanan] [kiri]
  117. def summarize_by_class(dataset):
  118.     separated = separate_by_class(dataset)
  119.     summaries = dict()
  120.     for class_value, rows in separated.items():
  121.         summaries[class_value] = summarize_dataset(rows)
  122.     return summaries
  123.  
  124.  
  125. # Calculate the Gaussian probability distribution function for x [kanan] [kiri]
  126. def calculate_probability(x, mean, stdev):
  127.     exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
  128.     return (1 / (sqrt(2 * pi) * stdev)) * exponent
  129.  
  130.  
  131. # Calculate the probabilities of predicting each class for a given row [kanan] [kiri]
  132. def calculate_class_probabilities(summaries, row):
  133.     total_rows = sum([summaries[label][0][2] for label in summaries])
  134.     probabilities = dict()
  135.     for class_value, class_summaries in summaries.items():
  136.         probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
  137.         for i in range(len(class_summaries)):
  138.             mean, stdev, _ = class_summaries[i]
  139.             probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
  140.     return probabilities
  141.  
  142.  
  143. # Predict the class for a given row [kanan] [kiri]
  144. def predict(summaries, row):
  145.     probabilities = calculate_class_probabilities(summaries, row)
  146.     best_label, best_prob = None, -1
  147.     for class_value, probability in probabilities.items():
  148.         if best_label is None or probability > best_prob:
  149.             best_prob = probability
  150.             best_label = class_value
  151.     return best_label
  152.  
  153.  
  154. # Naive Bayes Algorithm
  155. def naive_bayes(train, test):
  156.     summarize = summarize_by_class(train)
  157.     predictions = list()
  158.     for row in test:
  159.         output = predict(summarize, row)
  160.         predictions.append(output)
  161.     return (predictions)
  162.  
  163.  
  164. seed(1)
  165. filename = 'Cervical-Cancer-Behavior-Risk.csv'
  166. # filename = 'sobar-72.csv'
  167. dataset = load_csv(filename)
  168. for i in range(len(dataset[0]) - 1):
  169.     str_column_to_float(dataset, i)
  170.  
  171. str_column_to_int(dataset, len(dataset[0]) - 1)
  172. model = summarize_by_class(dataset)
  173.  
  174. # record to predict
  175. record1 = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  176. recordOne = [8, 12, 9, 10, 10, 5, 10, 5, 5, 5, 2, 10, 9, 13, 2, 9, 8, 7, 12, 1]  # taken from csv file, it should be one
  177. recordTwo = [10, 14, 14, 6, 12, 7, 8, 5, 15, 12, 10, 10, 13, 11, 9, 14, 13, 15,
  178.              15]  # taken from csv fileit should be zero
  179. label = predict(model, record1)
  180. labelHas = predict(model, recordOne)
  181. labelNo = predict(model, recordTwo)
  182. print('\nData=%s\nPredicted(0 means no cervical cancer | 1 has cervical cancer) : %s \n' % (record1, label))
  183. print('\nData=%s\nPredicted(0 means no cervical cancer | 1 has cervical cancer) : %s \n' % (recordOne, label))
  184. print('\nData=%s\nPredicted(0 means no cervical cancer | 1 has cervical cancer) : %s \n' % (recordTwo, label))
  185.  
  186. n_folds = 4
  187. scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
  188. print('Scores: %s' % scores)
  189. print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
  190. print('Median Accuracy: %.3f%%' % (statistics.median(scores)))
  191.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement