Advertisement
ivanakarpuzova

printGrank

Sep 10th, 2019
278
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.58 KB | None | 0 0
"""
Modify the prediction function so that, while traversing the tree, it prints:
  - the column and the value being compared against,
  - the test sample's current value for that column,
  - the next branch that will be followed through the tree
    (True branch or False branch),
  - the remaining part of the tree that still has to be traversed,
  - an empty line.
Then print the trained tree, read an unknown sample from standard input and
classify it with the new prediction function.
"""
  11.  
  12. trainingData=[['twitter','USA','yes',18,'None'],
  13. ['google','France','yes',23,'Premium'],
  14. ['google','France','no',26,'Basic'],
  15. ['google','Macedonia','yes',13,'None'],
  16. ['pinterest','USA','yes',24,'Basic'],
  17. ['bing','France','yes',23,'Basic'],
  18. ['google','UK','no',21,'Premium'],
  19. ['facebook','New Zealand','no',12,'None'],
  20. ['facebook','UK','no',21,'Basic'],
  21. ['google','USA','no',24,'Premium'],
  22. ['twitter','France','yes',19,'None'],
  23. ['pinterest','USA','no',18,'None'],
  24. ['google','UK','no',18,'None'],
  25. ['bing','UK','yes',19,'Premium'],
  26. ['bing','Macedonia','no',10,'None'],
  27. ['facebook','Macedonia','no',16,'Basic'],
  28. ['bing','UK','no',19,'Basic'],
  29. ['pinterest','Germany','no',2,'None'],
  30. ['pinterest','USA','yes',12,'Basic'],
  31. ['twitter','UK','no',21,'None'],
  32. ['twitter','UK','yes',26,'Premium'],
  33. ['google','UK','yes',18,'Basic'],
  34. ['bing','France','yes',19,'Basic']]
  35.  
  36. test_cases=[['google','MK','no',24,'Unknown'],
  37. ['google','MK','no',15,'Unknown'],
  38. ['pinterest','UK','yes',21,'Unknown'],
  39. ['pinterest','UK','no',25,'Unknown']]
  40.  
  41. # trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]
  42.  
  43.  
  44.  
  45. class decisionnode:
  46. def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  47. self.col = col
  48. self.value = value
  49. self.results = results
  50. self.tb = tb
  51. self.fb = fb
  52.  
  53.  
  54. def sporedi_broj(row, column, value):
  55. return row[column] >= value
  56.  
  57.  
  58. def sporedi_string(row, column, value):
  59. return row[column] == value
  60.  
  61.  
  62. # Divides a set on a specific column. Can handle numeric
  63. # or nominal values
  64. def divideset(rows, column, value):
  65. # Make a function that tells us if a row is in
  66. # the first group (true) or the second group (false)
  67. split_function = None
  68. if isinstance(value, int) or isinstance(value, float): # ako vrednosta so koja sporeduvame e od tip int ili float
  69. # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  70. split_function = sporedi_broj
  71. else:
  72. # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  73. split_function = sporedi_string
  74.  
  75. # Divide the rows into two sets and return them
  76. set_false = []
  77. set_true = []
  78. for row in rows:
  79. if split_function(row, column, value):
  80. set_true.append(row)
  81. else:
  82. set_false.append(row)
  83. set1 = [row for row in rows if
  84. split_function(row, column, value)] # za sekoj row od rows za koj split_function vrakja true
  85. set2 = [row for row in rows if
  86. not split_function(row, column, value)] # za sekoj row od rows za koj split_function vrakja false
  87. # return (set1, set2)
  88. return (set_true, set_false)
  89.  
  90.  
  91.  
  92.  
  93. # Create counts of possible results (the last column of
  94. # each row is the result)
  95. def uniquecounts(rows):
  96. results = {}
  97. for row in rows:
  98. # The result is the last column
  99. r = row[-1]
  100. results.setdefault(r, 0)
  101. results[r] += 1
  102.  
  103. return results
  104.  
  105.  
  106. # Probability that a randomly placed item will
  107. # be in the wrong category
  108.  
  109. def log2(x):
  110. from math import log
  111. l2 = log(x) / log(2)
  112. return l2
  113.  
  114.  
  115. # Entropy is the sum of p(x)log(p(x)) across all
  116. # the different possible results
  117. def entropy(rows):
  118. results = uniquecounts(rows)
  119. # Now calculate the entropy
  120. ent = 0.0
  121. for r in results.keys():
  122. p = float(results[r]) / len(rows)
  123. ent = ent - p * log2(p)
  124. return ent
  125.  
  126.  
  127. def buildtree(rows, scoref=entropy):
  128. if len(rows) == 0: return decisionnode()
  129. current_score = scoref(rows)
  130.  
  131. # Set up some variables to track the best criteria
  132. best_gain = 0.0
  133. best_column = -1
  134. best_value = None
  135. best_subsetf = None
  136. best_subsett = None
  137.  
  138. column_count = len(rows[0]) - 1
  139. for col in range(column_count):
  140. # Generate the list of different values in
  141. # this column
  142. column_values = set()
  143. for row in rows:
  144. column_values.add(row[col])
  145. # Now try dividing the rows up for each value
  146. # in this column
  147. for value in column_values:
  148. (set1, set2) = divideset(rows, col, value)
  149.  
  150. # Information gain
  151. p = float(len(set1)) / len(rows)
  152. gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  153. if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  154. best_gain = gain
  155. best_column = col
  156. best_value = value
  157. best_subsett = set1
  158. best_subsetf = set2
  159. # best_criteria = (col, value)
  160. # best_sets = (set1, set2)
  161.  
  162. # Create the subbranches
  163. if best_gain > 0:
  164. trueBranch = buildtree(best_subsett, scoref)
  165. falseBranch = buildtree(best_subsetf, scoref)
  166. return decisionnode(col=best_column, value=best_value,
  167. tb=trueBranch, fb=falseBranch)
  168. else:
  169. return decisionnode(results=uniquecounts(rows))
  170.  
  171.  
  172.  
  173. def printtree(tree, indent=''):
  174. # Is this a leaf node?
  175. if tree.results != None:
  176. print(indent + str(sorted(tree.results.items())))
  177. else:
  178. # Print the criteria
  179. print(indent + str(tree.col) + ':' + str(tree.value) + '? ')
  180. # Print the branches
  181. print(indent + 'T->')
  182. printtree(tree.tb, indent + ' ')
  183. print(indent + 'F->')
  184. printtree(tree.fb, indent + ' ')
  185.  
  186.  
  187.  
  188.  
  189. def classify(observation, tree):
  190. if tree.results != None:
  191. return tree.results
  192. else:
  193. vrednost = observation[tree.col]
  194. branch = None
  195.  
  196. if isinstance(vrednost, int) or isinstance(vrednost, float):
  197. if vrednost >= tree.value:
  198. branch = tree.tb
  199. else:
  200. branch = tree.fb
  201. else:
  202. if vrednost == tree.value:
  203. branch = tree.tb
  204. else:
  205. branch = tree.fb
  206.  
  207. return classify(observation, branch)
  208.  
  209. def classify2(observation, tree):
  210. if tree.results != None:
  211. results = [(value,key) for key,value in tree.results.items()]
  212. results.sort()
  213. return results[0][1]
  214. else:
  215. vrednost = observation[tree.col]
  216. branch = None
  217. granka = 'True branch'
  218. if isinstance(vrednost, int) or isinstance(vrednost, float):
  219. if vrednost >= tree.value:
  220. branch = tree.tb
  221. else:
  222. branch = tree.fb
  223. granka = 'False branch'
  224. else:
  225. if vrednost == tree.value:
  226. branch = tree.tb
  227. else:
  228. branch = tree.fb
  229. granka = 'False branch'
  230. print('Sporeduvam kolona i vrednost', (tree.col, tree.value))
  231. print('Tekovna vrednost:', vrednost)
  232. print ('Sledna granka', granka)
  233. print('Preostanata granka za izminuvanje:')
  234. printtree(branch)
  235. print()
  236. return classify2(observation, branch)
  237.  
  238.  
  239.  
  240.  
  241.  
  242. if __name__ == "__main__":
  243. referrer = 'google'
  244. location = 'USA'
  245. readFAQ = 'no'
  246. pagesVisited = 25
  247. serviceChosen = 'Unknown'
  248.  
  249.  
  250. testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
  251. t=buildtree(trainingData)
  252. printtree(t)
  253. print(classify2(testCase,t))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement