Advertisement
lamaulfarid

kNN

May 30th, 2021
726
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.64 KB | None | 0 0
  1. # k-nearest neighbors on the Cervical Cancer Behavior Risk Data Set
  2. from csv import reader
  3. from math import sqrt
  4. from random import randrange
  5. from random import seed
  6. # from statistics import median
  7.  
  8. # Load a CSV file
  9. def load_csv(filename):
  10.     dataset = list()
  11.     with open(filename, 'r') as file:
  12.         csv_reader = reader(file)
  13.         for row in csv_reader:
  14.             if not row:
  15.                 continue
  16.             dataset.append(row)
  17.     return dataset
  18.  
  19. # Convert string column to float
  20. def str_column_to_float(dataset, column):
  21.     for row in dataset:
  22.         row[column] = float(row[column].strip())
  23.  
  24. # Convert string column to integer
  25. def str_column_to_int(dataset, column):
  26.     class_values = [row[column] for row in dataset]
  27.     unique = set(class_values)
  28.     lookup = dict()
  29.     for i, value in enumerate(unique):
  30.         lookup[value] = i
  31.     for row in dataset:
  32.         row[column] = lookup[row[column]]
  33.     return lookup
  34.  
  35. # Find the min and max values for each column
  36. def dataset_minmax(dataset):
  37.     minmax = list()
  38.     for i in range(len(dataset[0])):
  39.         col_values = [row[i] for row in dataset]
  40.         value_min = min(col_values)
  41.         value_max = max(col_values)
  42.         minmax.append([value_min, value_max])
  43.     return minmax
  44.  
  45. # Rescale dataset columns to the range 0-1
  46. def normalize_dataset(dataset, minmax):
  47.     for row in dataset:
  48.         for i in range(len(row)):
  49.             row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
  50.  
  51. # Split a dataset into k folds
  52. def cross_validation_split(dataset, n_folds):
  53.     dataset_split = list()
  54.     dataset_copy = list(dataset)
  55.     fold_size = int(len(dataset) / n_folds)
  56.     for _ in range(n_folds):
  57.         fold = list()
  58.         while len(fold) < fold_size:
  59.             index = randrange(len(dataset_copy))
  60.             fold.append(dataset_copy.pop(index))
  61.         dataset_split.append(fold)
  62.     return dataset_split
  63.  
  64. # Calculate accuracy percentage
  65. def accuracy_metric(actual, predicted):
  66.     correct = 0
  67.     for i in range(len(actual)):
  68.         if actual[i] == predicted[i]:
  69.             correct += 1
  70.     return correct / float(len(actual)) * 100.0
  71.  
  72. # Evaluate an algorithm using a cross validation split
  73. def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  74.     folds = cross_validation_split(dataset, n_folds)
  75.     scores = list()
  76.     for fold in folds:
  77.         train_set = list(folds)
  78.         train_set.remove(fold)
  79.         train_set = sum(train_set, [])
  80.         test_set = list()
  81.         for row in fold:
  82.             row_copy = list(row)
  83.             test_set.append(row_copy)
  84.             row_copy[-1] = None
  85.         predicted = algorithm(train_set, test_set, *args)
  86.         actual = [row[-1] for row in fold]
  87.         accuracy = accuracy_metric(actual, predicted)
  88.         scores.append(accuracy)
  89.     return scores
  90.  
  91. # Calculate the Euclidean distance between two vectors
  92. def euclidean_distance(row1, row2):
  93.     distance = 0.0
  94.     for i in range(len(row1)-1):
  95.         distance += (row1[i] - row2[i])**2
  96.     return sqrt(distance)
  97.  
  98. # Locate the most similar neighbors
  99. def get_neighbors(train, test_row, num_neighbors):
  100.     distances = list()
  101.     for train_row in train:
  102.         dist = euclidean_distance(test_row, train_row)
  103.         distances.append((train_row, dist))
  104.     distances.sort(key=lambda tup: tup[1])
  105.     neighbors = list()
  106.     for i in range(num_neighbors):
  107.         neighbors.append(distances[i][0])
  108.     return neighbors
  109.  
  110. # Make a prediction with neighbors
  111. def predict_classification(train, test_row, num_neighbors):
  112.     neighbors = get_neighbors(train, test_row, num_neighbors)
  113.     output_values = [row[-1] for row in neighbors]
  114.     prediction = max(set(output_values), key=output_values.count)
  115.     return prediction
  116.  
  117. # kNN Algorithm
  118. def k_nearest_neighbors(train, test, num_neighbors):
  119.     predictions = list()
  120.     for row in test:
  121.         output = predict_classification(train, row, num_neighbors)
  122.         predictions.append(output)
  123.     return(predictions)
  124.  
# Test the kNN on the Cervical Cancer Behavior Risk Data Set.
# Expects the CSV file named below in the working directory; every column
# except the last is a numeric feature, the last column is the class label.
seed(1)
filename = 'Cervical-Cancer-Behavior-Risk.csv'
dataset = load_csv(filename)
# All feature columns arrive from the CSV reader as strings.
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)

# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

# set n_folds and num_neighbors
n_folds = 5
num_neighbors = 5

# the mean classification accuracy scores
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
# print('Median Accuracy: %.3f%%' % (statistics.median(scores))
# NOTE(review): the line above is dead code — it has unbalanced parentheses
# and references `statistics.median`, but only `from statistics import median`
# was ever (commented) imported.  Fix both before re-enabling.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement