Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Bayes
- import statistics
- from csv import reader
- from random import seed
- from random import randrange
- from math import sqrt
- from math import exp
- from math import pi
def load_csv(filename):
    """Load a CSV file and return its rows as a list of string lists.

    Blank rows are skipped; field values are left as raw strings.
    """
    with open(filename, 'r') as file:
        return [row for row in reader(file) if row]
def str_column_to_float(dataset, column):
    """Convert one column of every row to float, in place.

    Surrounding whitespace is stripped before conversion.
    """
    for record in dataset:
        record[column] = float(record[column].strip())
def str_column_to_int(dataset, column):
    """Encode the string labels in *column* as small integers, in place.

    Returns the label -> code lookup table.  Labels are enumerated in
    sorted order so the encoding is deterministic across runs; the
    original iterated a bare ``set``, whose order is hash-dependent and
    can differ between interpreter sessions.
    """
    class_values = [row[column] for row in dataset]
    lookup = dict()
    # sorted() fixes the nondeterministic iteration order of a raw set.
    for i, value in enumerate(sorted(set(class_values))):
        lookup[value] = i
        print('[%s] => %d' % (value, i))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
def cross_validation_split(dataset, n_folds):
    """Randomly partition *dataset* into ``n_folds`` equal-sized folds.

    Rows are sampled without replacement; any remainder rows beyond
    ``len(dataset) // n_folds`` per fold are discarded.
    """
    pool = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = list()
    for _ in range(n_folds):
        current = list()
        while len(current) < fold_size:
            current.append(pool.pop(randrange(len(pool))))
        folds.append(current)
    return folds
def separate_by_class(dataset):
    """Group rows by their class label (the last element of each row).

    Returns a dict mapping class value -> list of rows with that class.
    """
    grouped = dict()
    for vector in dataset:
        grouped.setdefault(vector[-1], list()).append(vector)
    return grouped
def accuracy_metric(actual, predicted):
    """Return classification accuracy as a percentage of matching labels."""
    # Booleans sum as ints; indexing (not zip) keeps the original's
    # behavior if `predicted` is shorter than `actual`.
    correct = sum(actual[i] == predicted[i] for i in range(len(actual)))
    return correct / float(len(actual)) * 100.0
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score *algorithm* with k-fold cross validation.

    Each fold is held out in turn: the model trains on the remaining
    folds and predicts the held-out rows (whose class labels are blanked
    to None first).  Returns the list of per-fold accuracy percentages.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        remaining = list(folds)
        remaining.remove(fold)
        train_set = [row for chunk in remaining for row in chunk]
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None  # hide the true label from the model
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores
def mean(numbers):
    """Return the arithmetic mean of a sequence of numbers."""
    total = sum(numbers)
    return total / float(len(numbers))
def stdev(numbers):
    """Return the sample standard deviation (n-1 denominator) of a sequence."""
    avg = sum(numbers) / float(len(numbers))
    squared_diffs = [(value - avg) ** 2 for value in numbers]
    return sqrt(sum(squared_diffs) / float(len(numbers) - 1))
def summarize_dataset(dataset):
    """Return per-column (mean, stdev, count) tuples for a dataset.

    The last column (the class label) is excluded from the result.
    """
    columns = zip(*dataset)
    summaries = [(mean(col), stdev(col), len(col)) for col in columns]
    return summaries[:-1]
def summarize_by_class(dataset):
    """Fit step: compute per-column statistics separately for each class.

    Returns a dict mapping class value -> list of (mean, stdev, count)
    tuples, one per feature column.
    """
    return {class_value: summarize_dataset(rows)
            for class_value, rows in separate_by_class(dataset).items()}
def calculate_probability(x, mean, stdev):
    """Return the Gaussian PDF value at *x* for the given mean and stdev.

    NOTE: the parameter names shadow the module-level mean()/stdev()
    helpers; they are kept for compatibility with existing callers.
    """
    variance = stdev ** 2
    exponent = exp(-((x - mean) ** 2) / (2 * variance))
    return exponent / (sqrt(2 * pi) * stdev)
def calculate_class_probabilities(summaries, row):
    """Return the (unnormalised) Naive Bayes score of *row* for each class.

    Score = P(class) * prod_i P(row[i] | class), where the prior is the
    class frequency and each likelihood is a Gaussian PDF.
    """
    total_rows = sum(stats[0][2] for stats in summaries.values())
    probabilities = dict()
    for class_value, class_stats in summaries.items():
        # Prior: fraction of training rows belonging to this class.
        score = class_stats[0][2] / float(total_rows)
        for index, (col_mean, col_stdev, _) in enumerate(class_stats):
            score *= calculate_probability(row[index], col_mean, col_stdev)
        probabilities[class_value] = score
    return probabilities
def predict(summaries, row):
    """Return the class label with the highest Naive Bayes score for *row*."""
    probabilities = calculate_class_probabilities(summaries, row)
    best_label = None
    best_prob = -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_label, best_prob = class_value, probability
    return best_label
def naive_bayes(train, test):
    """Fit a Gaussian Naive Bayes model on *train* and classify *test*.

    Returns the list of predicted class labels, one per test row.
    """
    model = summarize_by_class(train)
    return [predict(model, row) for row in test]
# --- Script entry point: load data, fit, spot-check three records, CV ---
seed(1)
filename = 'Cervical-Cancer-Behavior-Risk.csv'
# filename = 'sobar-72.csv'
dataset = load_csv(filename)
# Every column except the last is a numeric feature; the last is the class.
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)
str_column_to_int(dataset, len(dataset[0]) - 1)
model = summarize_by_class(dataset)
# Records to predict.
record1 = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
recordOne = [8, 12, 9, 10, 10, 5, 10, 5, 5, 5, 2, 10, 9, 13, 2, 9, 8, 7, 12, 1]  # taken from csv file, it should be one
recordTwo = [10, 14, 14, 6, 12, 7, 8, 5, 15, 12, 10, 10, 13, 11, 9, 14, 13, 15,
             15]  # taken from csv file, it should be zero
label = predict(model, record1)
labelHas = predict(model, recordOne)
labelNo = predict(model, recordTwo)
print('\nData=%s\nPredicted(0 means no cervical cancer | 1 has cervical cancer) : %s \n' % (record1, label))
# BUG FIX: the next two prints previously reused `label` (the prediction for
# record1) instead of the predictions actually computed for these records.
print('\nData=%s\nPredicted(0 means no cervical cancer | 1 has cervical cancer) : %s \n' % (recordOne, labelHas))
print('\nData=%s\nPredicted(0 means no cervical cancer | 1 has cervical cancer) : %s \n' % (recordTwo, labelNo))
n_folds = 4
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
print('Median Accuracy: %.3f%%' % (statistics.median(scores)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement