Advertisement
Guest User

Untitled

a guest
Jun 19th, 2019
127
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.77 KB | None | 0 0
  1. import heapq
  2. import sys
  3. import math as m
  4. from decimal import Decimal
  5.  
  6. class Entry:
  7. def __init__(self, attributes):
  8. self.attributes = attributes
  9. self.diabetes = "yes" if "yes" in attributes else "no" if "no" in attributes else ""
  10.  
  11. def __str__(self):
  12. string = ""
  13. for attributeNum, attribute in enumerate(self.attributes):
  14. if attributeNum == len(self.attributes) - 1:
  15. string += str(attribute)
  16. else:
  17. string += str(attribute) + ','
  18. return string
  19.  
  20. def set_ifdiabetes(self, diabetes):
  21. self.diabetes = diabetes
  22.  
  23. def euclidean(self, other_entry):
  24. sum = 0.0
  25. for i in range(len(self.attributes)):
  26. sum += m.pow(float(self.attributes[i]) - float(other_entry.attributes[i]), 2)
  27. return m.sqrt(sum)
  28.  
  29. class NB:
  30. def __init__(self, training_data, testing_data):
  31. self.training_data = training_data
  32. self.testing_data = testing_data
  33.  
  34. self.training_entries = []
  35. self.testing_entries = []
  36.  
  37. self.diabetes_yes = []
  38. self.diabetes_no = []
  39.  
  40. self.mu_diabetesyes = []
  41. self.mu_diabetesno = []
  42.  
  43. self.sigma_diabetesyes = []
  44. self.sigma_diabetesno = []
  45.  
  46. self.num_attributes = 0
  47.  
  48. self.p_diabetesyes = 0
  49. self.p_diabetesno = 0
  50.  
  51. self.num_diabetesyes = 0
  52. self.num_diabetesno = 0
  53.  
  54. def train(self):
  55. self.traincleandata()
  56. self.get_mus()
  57. self.get_sigmas()
  58.  
  59. def traincleandata(self):
  60. for line in self.training_data:
  61. if self.num_attributes == 0:
  62. self.num_attributes = len(line.split(','))
  63. for i in range(self.num_attributes):
  64. self.diabetes_yes.append([])
  65. self.diabetes_no.append([])
  66. self.mu_diabetesyes.append(0.0)
  67. self.mu_diabetesno.append(0.0)
  68. self.sigma_diabetesyes.append(0.0)
  69. self.sigma_diabetesno.append(0.0)
  70. params = line.split(',')
  71. cleanparams = getcleanparams(params)
  72. entry = Entry(cleanparams)
  73. self.training_entries.append(entry)
  74.  
  75. if (entry.diabetes == "yes"):
  76. for i in range(len(entry.attributes) - 1):
  77. self.diabetes_yes[i].append(Decimal(entry.attributes[i]))
  78. self.p_diabetesyes += 1
  79. self.num_diabetesyes += 1
  80. else:
  81. for i in range(len(entry.attributes) - 1):
  82. self.diabetes_no[i].append(Decimal(entry.attributes[i]))
  83. self.p_diabetesno += 1
  84. self.num_diabetesno += 1
  85.  
  86. self.p_diabetesyes = Decimal(self.p_diabetesyes) / len(self.training_entries)
  87. self.p_diabetesno = Decimal(self.p_diabetesno) / len(self.training_entries)
  88.  
  89. def get_mus(self):
  90. for i in range(self.num_attributes - 1):
  91. self.mu_diabetesyes[i] = sum(self.diabetes_yes[i]) / len(self.diabetes_yes[i])
  92. self.mu_diabetesno[i] = sum(self.diabetes_no[i]) / len(self.diabetes_no[i])
  93.  
  94. def get_sigmas(self):
  95.  
  96. sigSumYes = [0] * self.num_attributes
  97. sigSumNo = [0] * self.num_attributes
  98.  
  99. for i in range(self.num_attributes - 1):
  100. for j in range(self.num_diabetesyes):
  101. sigSumYes[i] += m.pow(self.diabetes_yes[i][j] - self.mu_diabetesyes[i], 2)
  102. self.sigma_diabetesyes[i] = m.sqrt(sigSumYes[i] / (len(self.diabetes_yes[i]) - 1))
  103. for j in range(self.num_diabetesno):
  104. sigSumNo[i] += m.pow(self.diabetes_no[i][j] - self.mu_diabetesno[i], 2)
  105. self.sigma_diabetesno[i] = m.sqrt(sigSumNo[i] / (len(self.diabetes_no[i]) - 1))
  106.  
  107. def test(self):
  108. self.testcleandata()
  109. self.testalgo()
  110.  
  111. def testcleandata(self):
  112. for line in self.testing_data:
  113. params = line.split(',')
  114. cleanparams = getcleanparams(params)
  115. entry = Entry(cleanparams)
  116. self.testing_entries.append(entry)
  117.  
  118. def testalgo(self):
  119. counter = 1
  120. P_diabetesyes = [0] * self.num_attributes
  121. P_diabetesno = [0] * self.num_attributes
  122. for entry in self.testing_entries:
  123. pYesEntry = 1
  124. pNoEntry = 1
  125. for i in range(self.num_attributes - 1):
  126. P_diabetesyes[i] = Decimal((1 / (self.sigma_diabetesyes[i] * m.sqrt(2 * m.pi))) * m.pow(m.e, (-m.pow(Decimal(entry.attributes[i]) - self.mu_diabetesyes[i], 2) / (2 * m.pow(self.sigma_diabetesyes[i], 2)))))
  127. P_diabetesno[i] = Decimal((1 / (self.sigma_diabetesno[i] * m.sqrt(2 * m.pi))) * m.pow(m.e, (-m.pow(Decimal(entry.attributes[i]) -self.mu_diabetesno[i], 2) / (2 * m.pow(self.sigma_diabetesno[i], 2)))))
  128. pYesEntry *= float(P_diabetesyes[i])
  129. pNoEntry *= float(P_diabetesno[i])
  130.  
  131. pYesEntry *= float(self.p_diabetesyes)
  132. pNoEntry *= float(self.p_diabetesno)
  133.  
  134. entry.set_ifdiabetes("yes") if (pYesEntry/pNoEntry >= 1) else entry.set_ifdiabetes("no")
  135. counter += 1
  136.  
  137. class kNN:
  138. def __init__(self, training_data, testing_data, k):
  139. self.k = k
  140. self.training_data = training_data
  141. self.testing_data = testing_data
  142. self.training_entries = []
  143. self.testing_entries = []
  144.  
  145. def __str__(self):
  146. string_to_return = ''
  147. for entry in self.training_entries:
  148. string_to_return = string_to_return + str(entry) + 'n'
  149. return string_to_return
  150.  
  151. def train(self):
  152. for line in self.training_data:
  153. params = line.split(',')
  154. self.training_entries.append(Entry(getcleanparams(params)))
  155.  
  156. def test(self):
  157. counter = 1
  158. for line in self.testing_data:
  159. params = line.split(',')
  160. self.testing_entries.append(Entry(getcleanparams(params)))
  161. for testEntry in self.testing_entries:
  162. nearest = []
  163. for trainEntry in self.training_entries:
  164. current_entry = (testEntry.euclidean(trainEntry), str(trainEntry.diabetes), str(trainEntry))
  165. nearest.append(current_entry)
  166. heapq.heapify(nearest)
  167. nearest.sort()
  168. nearest = nearest[:int(self.k)]
  169. num_diabetes = 0
  170. for entry in nearest:
  171. if (entry[1] == "yes"):
  172. num_diabetes += 1
  173. testEntry.set_ifdiabetes("yes") if (num_diabetes >= int(self.k)/2) else testEntry.set_ifdiabetes("no")
  174. counter += 1
  175.  
  176. def compare(self):
  177. for entry in self.testing_entries:
  178. print("-----------")
  179. print(entry.compare(self.training_entries[0]))
  180. print(entry)
  181. print(self.training_entries[0])
  182.  
  183.  
  184. def getcleanparams(params):
  185. params_nospaceortab = []
  186. for param in params:
  187. params_nospaceortab.append(param.strip())
  188. return params_nospaceortab
  189.  
  190.  
  191. if __name__ == '__main__':
  192. training_file = open(sys.argv[1])
  193. testing_file = open(sys.argv[2])
  194. classifier_input = sys.argv[3]
  195.  
  196. training_lines = training_file.readlines()
  197. training_cleanlines = getcleanparams(training_lines)
  198.  
  199. testing_lines = testing_file.readlines()
  200. testing_cleanlines = getcleanparams(testing_lines)
  201.  
  202. if classifier_input == "NB":
  203. classifier = NB(training_cleanlines, testing_cleanlines)
  204. classifier.train()
  205. classifier.test()
  206. for entry in classifier.testing_entries:
  207. print(entry.diabetes)
  208. elif "NN" in classifier_input:
  209. classifier = kNN(training_cleanlines, testing_cleanlines, (classifier_input.index('NN') - 1))
  210. classifier.train()
  211. classifier.test()
  212. for entry in classifier.testing_entries:
  213. print(entry.diabetes)
  214. else:
  215. print("Error: unknown classifier type")
  216. sys.exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement