Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import heapq
- import sys
- import math as m
- from decimal import Decimal
class Entry:
    """One comma-separated data row, optionally ending in a class label.

    Attributes:
        attributes: The row's fields exactly as passed in (the rest of this
            file passes lists of whitespace-stripped strings).
        diabetes: "yes" if any field equals "yes", otherwise "no" if any
            field equals "no", otherwise "" for an unlabelled row.
    """

    # Field values that denote a class label rather than a numeric feature.
    _LABELS = ("yes", "no")

    def __init__(self, attributes):
        self.attributes = attributes
        if "yes" in attributes:
            self.diabetes = "yes"
        elif "no" in attributes:
            self.diabetes = "no"
        else:
            self.diabetes = ""

    def __str__(self):
        """Return the fields joined by commas (empty string for no fields)."""
        return ",".join(str(attribute) for attribute in self.attributes)

    def set_ifdiabetes(self, diabetes):
        """Overwrite the stored class label with ``diabetes``."""
        self.diabetes = diabetes

    @staticmethod
    def _numeric_features(attributes):
        """Return ``attributes`` as floats, dropping a trailing class label.

        Raises:
            ValueError: If a remaining field cannot be converted to float.
        """
        if attributes and attributes[-1] in Entry._LABELS:
            attributes = attributes[:-1]
        return [float(value) for value in attributes]

    def euclidean(self, other_entry):
        """Return the Euclidean distance between the two rows' features.

        A trailing "yes"/"no" label on either row is ignored, so labelled
        and unlabelled rows can be compared in any combination. Features are
        paired positionally; if one row has more features than the other,
        the surplus trailing features are ignored.

        Args:
            other_entry: Any object exposing an ``attributes`` sequence.

        Returns:
            The distance as a float (0.0 when there are no features).

        Raises:
            ValueError: If a feature field is not numeric.
        """
        mine = self._numeric_features(self.attributes)
        theirs = self._numeric_features(other_entry.attributes)
        squared_total = sum(m.pow(a - b, 2) for a, b in zip(mine, theirs))
        return m.sqrt(squared_total)
class NB:
    """Gaussian Naive Bayes classifier over "yes"/"no" labelled rows.

    Each training line is a comma-separated row whose last field is the
    class label; every other field is treated as a numeric feature. Rows are
    parsed with this file's ``getcleanparams`` and wrapped in ``Entry``.

    Attributes:
        training_data / testing_data: Iterables of raw text lines.
        training_entries / testing_entries: Parsed ``Entry`` objects.
        diabetes_yes / diabetes_no: Per-feature lists of Decimal samples.
        mu_* / sigma_*: Per-feature mean and sample standard deviation.
        num_attributes: Field count of the first training row (label
            included), so there are ``num_attributes - 1`` features.
        p_diabetesyes / p_diabetesno: Class priors.
        num_diabetesyes / num_diabetesno: Class row counts.
    """

    def __init__(self, training_data, testing_data):
        self.training_data = training_data
        self.testing_data = testing_data
        self.training_entries = []
        self.testing_entries = []
        self.diabetes_yes = []
        self.diabetes_no = []
        self.mu_diabetesyes = []
        self.mu_diabetesno = []
        self.sigma_diabetesyes = []
        self.sigma_diabetesno = []
        self.num_attributes = 0
        self.p_diabetesyes = 0
        self.p_diabetesno = 0
        self.num_diabetesyes = 0
        self.num_diabetesno = 0

    def train(self):
        """Parse the training lines, then fit per-class means and sigmas."""
        self.traincleandata()
        self.get_mus()
        self.get_sigmas()

    def traincleandata(self):
        """Parse training lines into entries, feature columns and priors.

        Blank lines are skipped. Any row whose label is not "yes" is counted
        as "no". Dividing by the row count fails with a decimal arithmetic
        error when no training rows were read.
        """
        for line in self.training_data:
            if not line.strip():
                # A blank line has no features; counting it as a row would
                # desynchronise the class counters from the sample columns.
                continue
            params = getcleanparams(line.split(','))
            if self.num_attributes == 0:
                # The first row fixes the column layout. One slot is
                # allocated per field; the label's slot stays unused.
                self.num_attributes = len(params)
                for _ in range(self.num_attributes):
                    self.diabetes_yes.append([])
                    self.diabetes_no.append([])
                    self.mu_diabetesyes.append(0.0)
                    self.mu_diabetesno.append(0.0)
                    self.sigma_diabetesyes.append(0.0)
                    self.sigma_diabetesno.append(0.0)
            entry = Entry(params)
            self.training_entries.append(entry)
            if entry.diabetes == "yes":
                columns = self.diabetes_yes
                self.num_diabetesyes += 1
            else:
                columns = self.diabetes_no
                self.num_diabetesno += 1
            # Every field except the trailing label is a feature sample.
            for i, value in enumerate(entry.attributes[:-1]):
                columns[i].append(Decimal(value))
        total = len(self.training_entries)
        self.p_diabetesyes = Decimal(self.num_diabetesyes) / total
        self.p_diabetesno = Decimal(self.num_diabetesno) / total

    def get_mus(self):
        """Store the per-feature mean of each class.

        Raises:
            ZeroDivisionError: If a class has no samples for a feature.
        """
        for i in range(self.num_attributes - 1):
            yes_samples = self.diabetes_yes[i]
            no_samples = self.diabetes_no[i]
            self.mu_diabetesyes[i] = sum(yes_samples) / len(yes_samples)
            self.mu_diabetesno[i] = sum(no_samples) / len(no_samples)

    @staticmethod
    def _sample_sigma(samples, mean):
        """Return the sample (n - 1) standard deviation of ``samples``."""
        squared_total = sum(m.pow(sample - mean, 2) for sample in samples)
        return m.sqrt(squared_total / (len(samples) - 1))

    def get_sigmas(self):
        """Store the per-feature sample standard deviation of each class.

        Requires ``get_mus`` to have run first.

        Raises:
            ZeroDivisionError: If a class has exactly one sample.
        """
        for i in range(self.num_attributes - 1):
            self.sigma_diabetesyes[i] = self._sample_sigma(
                self.diabetes_yes[i], self.mu_diabetesyes[i])
            self.sigma_diabetesno[i] = self._sample_sigma(
                self.diabetes_no[i], self.mu_diabetesno[i])

    def test(self):
        """Parse the testing lines and label every testing entry."""
        self.testcleandata()
        self.testalgo()

    def testcleandata(self):
        """Parse testing lines into ``testing_entries``, skipping blanks."""
        for line in self.testing_data:
            if not line.strip():
                continue
            params = getcleanparams(line.split(','))
            self.testing_entries.append(Entry(params))

    @staticmethod
    def _gaussian(value, mu, sigma):
        """Return the normal density N(mu, sigma^2) evaluated at ``value``.

        Raises:
            ZeroDivisionError: If ``sigma`` is zero.
        """
        exponent = -m.pow(value - mu, 2) / (2 * m.pow(sigma, 2))
        return (1 / (sigma * m.sqrt(2 * m.pi))) * m.exp(exponent)

    def testalgo(self):
        """Label each testing entry with the more likely class.

        The score of a class is its prior times the product of the Gaussian
        densities of the entry's features. The scores are compared directly
        rather than through their ratio, so a "no" score that underflows to
        0.0 no longer raises ZeroDivisionError. Ties go to "yes".
        """
        prior_yes = float(self.p_diabetesyes)
        prior_no = float(self.p_diabetesno)
        for entry in self.testing_entries:
            score_yes = 1
            score_no = 1
            for i in range(self.num_attributes - 1):
                value = Decimal(entry.attributes[i])
                score_yes *= self._gaussian(
                    value, self.mu_diabetesyes[i], self.sigma_diabetesyes[i])
                score_no *= self._gaussian(
                    value, self.mu_diabetesno[i], self.sigma_diabetesno[i])
            score_yes *= prior_yes
            score_no *= prior_no
            entry.set_ifdiabetes("yes" if score_yes >= score_no else "no")
class kNN:
    """k-nearest-neighbours classifier over "yes"/"no" labelled rows.

    Lines are comma-separated rows parsed with this file's
    ``getcleanparams`` and wrapped in ``Entry``; distances come from
    ``Entry.euclidean``.

    Attributes:
        k: Number of neighbours to consult (anything ``int()`` accepts).
        training_data / testing_data: Iterables of raw text lines.
        training_entries / testing_entries: Parsed ``Entry`` objects.
    """

    def __init__(self, training_data, testing_data, k):
        self.k = k
        self.training_data = training_data
        self.testing_data = testing_data
        self.training_entries = []
        self.testing_entries = []

    def __str__(self):
        """Return every training entry on its own newline-terminated line."""
        return ''.join(str(entry) + '\n' for entry in self.training_entries)

    def train(self):
        """Parse the training lines into ``training_entries``.

        Blank lines are skipped because they carry no features.
        """
        for line in self.training_data:
            if not line.strip():
                continue
            params = line.split(',')
            self.training_entries.append(Entry(getcleanparams(params)))

    def test(self):
        """Parse the testing lines and label every testing entry.

        Each testing entry is labelled "yes" when at least half of its k
        nearest training entries are "yes" (so ties go to "yes"), otherwise
        "no". Neighbours are ordered by distance, then by label, then by the
        training row's text.
        """
        for line in self.testing_data:
            if not line.strip():
                continue
            params = line.split(',')
            self.testing_entries.append(Entry(getcleanparams(params)))
        k = int(self.k)
        for testEntry in self.testing_entries:
            candidates = (
                (testEntry.euclidean(trainEntry),
                 str(trainEntry.diabetes),
                 str(trainEntry))
                for trainEntry in self.training_entries
            )
            # nsmallest yields the same k tuples as a full sort plus slice.
            nearest = heapq.nsmallest(k, candidates)
            num_diabetes = sum(1 for neighbour in nearest
                               if neighbour[1] == "yes")
            testEntry.set_ifdiabetes("yes" if num_diabetes >= k / 2 else "no")

    def compare(self):
        """Print each testing entry next to the first training entry.

        Debugging aid: prints a separator, the distance between the two
        entries, then both entries. The distance comes from
        ``Entry.euclidean``, the only comparison ``Entry`` provides.

        Raises:
            IndexError: If there are testing entries but no training entries.
        """
        for entry in self.testing_entries:
            print("-----------")
            print(entry.euclidean(self.training_entries[0]))
            print(entry)
            print(self.training_entries[0])
def getcleanparams(params):
    """Return a new list holding each item of ``params`` with its leading
    and trailing whitespace stripped."""
    return [raw.strip() for raw in params]
def parse_k(classifier_input):
    """Return the neighbour count encoded in a string such as "5NN".

    The count is the integer written before the first "NN", so "5NN" gives
    5 and "10NN" gives 10.

    Raises:
        ValueError: If "NN" is absent, the prefix is not an integer, or the
            count is smaller than 1.
    """
    prefix = classifier_input[:classifier_input.index('NN')]
    k = int(prefix)
    if k < 1:
        raise ValueError("k must be at least 1, got %d" % k)
    return k


if __name__ == '__main__':
    # Expected arguments: <training file> <testing file> <NB | kNN>,
    # where the kNN form is an integer followed by "NN", e.g. "5NN".
    if len(sys.argv) < 4:
        print("Usage: %s <training_file> <testing_file> <NB|kNN>" % sys.argv[0])
        sys.exit(1)

    # Context managers close both files even if reading fails.
    with open(sys.argv[1]) as training_file:
        training_cleanlines = getcleanparams(training_file.readlines())
    with open(sys.argv[2]) as testing_file:
        testing_cleanlines = getcleanparams(testing_file.readlines())

    classifier_input = sys.argv[3]
    if classifier_input == "NB":
        classifier = NB(training_cleanlines, testing_cleanlines)
    elif "NN" in classifier_input:
        try:
            neighbour_count = parse_k(classifier_input)
        except ValueError:
            print("Error: unknown classifier type")
            sys.exit(1)
        classifier = kNN(training_cleanlines, testing_cleanlines,
                         neighbour_count)
    else:
        print("Error: unknown classifier type")
        # Exit non-zero so callers can detect the failure.
        sys.exit(1)

    classifier.train()
    classifier.test()
    for entry in classifier.testing_entries:
        print(entry.diabetes)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement