Advertisement
nanorocks

exam_python_SNZ_1_decision_trees

Nov 5th, 2017
919
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.98 KB | None | 0 0
  1. """
  2. Дрва за одлучување (80 поени) Problem 1 (0 / 0)
  3. Да се промени функцијата за предвидување, така што при изминувањето ќе печати информации за:
  4.  
  5. со која колона и вредност се споредува
  6. за која е тековната вредност на тест примерокот за бараната колона
  7. која е следната гранка што ќе се изминува низ дрвото (True branch или False branch)
  8. преостанатиот дел од дрвото што треба да се измине
  9. празна линија
  10. Потоа да се испечати истренираното дрво, да се вчита непознат тренинг примерок од стандардниот влез и
  11. истиот да се класифицира со новата функција за предвидување.
  12. # trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]
  13.  
  14.  
  15. """
  16.  
  17.  
  18. trainingData=[['slashdot','USA','yes',18,'None'],
  19.         ['google','France','yes',23,'Premium'],
  20.         ['google','France','yes',23,'Basic'],
  21.         ['google','France','yes',23,'Basic'],
  22.         ['digg','USA','yes',24,'Basic'],
  23.         ['kiwitobes','France','yes',23,'Basic'],
  24.         ['google','UK','no',21,'Premium'],
  25.         ['(direct)','New Zealand','no',12,'None'],
  26.         ['(direct)','UK','no',21,'Basic'],
  27.         ['google','USA','no',24,'Premium'],
  28.         ['slashdot','France','yes',19,'None'],
  29.         ['digg','USA','no',18,'None'],
  30.         ['google','UK','no',18,'None'],
  31.         ['kiwitobes','UK','no',19,'None'],
  32.         ['digg','New Zealand','yes',12,'Basic'],
  33.         ['slashdot','UK','no',21,'None'],
  34.         ['google','UK','yes',18,'Basic'],
  35.         ['kiwitobes','France','yes',19,'Basic']]
  36.  
  37.  
  38. class decisionnode:
  39.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  40.         self.col = col
  41.         self.value = value
  42.         self.results = results
  43.         self.tb = tb
  44.         self.fb = fb
  45.  
  46.  
  47. def sporedi_broj(row, column, value):
  48.     return row[column] >= value
  49.  
  50.  
  51. def sporedi_string(row, column, value):
  52.     return row[column] == value
  53.  
  54.  
  55. # Divides a set on a specific column. Can handle numeric
  56. # or nominal values
  57. def divideset(rows, column, value):
  58.     # Make a function that tells us if a row is in
  59.     # the first group (true) or the second group (false)
  60.     split_function = None
  61.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  62.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  63.         split_function = sporedi_broj
  64.     else:
  65.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  66.         split_function = sporedi_string
  67.  
  68.     # Divide the rows into two sets and return them
  69.     set_false = []
  70.     set_true = []
  71.     for row in rows:
  72.         if split_function(row, column, value):
  73.             set_true.append(row)
  74.         else:
  75.             set_false.append(row)
  76.     set1 = [row for row in rows if
  77.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  78.     set2 = [row for row in rows if
  79.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  80.     # return (set1, set2)
  81.     return (set_true, set_false)
  82.  
  83.  
  84. #st, sf = divideset(my_data, 3, 20)
  85. #print(sf)
  86. #print(st)
  87.  
  88.  
  89. # Create counts of possible results (the last column of
  90. # each row is the result)
  91. def uniquecounts(rows):
  92.     results = {}
  93.     for row in rows:
  94.         # The result is the last column
  95.         r = row[-1]
  96.         results.setdefault(r, 0)
  97.         results[r] += 1
  98.  
  99.     return results
  100.  
  101.  
  102. #print(uniquecounts(my_data))
  103. #print(uniquecounts(st))
  104. #print(uniquecounts(sf))
  105.  
  106.  
# Base-2 logarithm helper used by the entropy measure below.
# (NOTE(review): the original comment here described misclassification
# probability — i.e. Gini impurity — a function not present in this file.)
  109.  
  110. def log2(x):
  111.     from math import log
  112.     l2 = log(x) / log(2)
  113.     return l2
  114.  
  115.  
  116. # Entropy is the sum of p(x)log(p(x)) across all
  117. # the different possible results
  118. def entropy(rows):
  119.     results = uniquecounts(rows)
  120.     # Now calculate the entropy
  121.     # print results.items()
  122.  
  123.     ent = 0.0
  124.     for r in results.keys():
  125.         p = float(results[r]) / len(rows)
  126.         ent = ent - p * log2(p)
  127.     return ent
  128.  
  129.  
  130. #print(entropy(my_data), entropy(st), entropy(sf))
  131.  
  132.  
  133.  
  134. def buildtree(rows, scoref=entropy):
  135.     if len(rows) == 0: return decisionnode()
  136.     current_score = scoref(rows)
  137.  
  138.     # Set up some variables to track the best criteria
  139.     best_gain = 0.0
  140.     #best_column = -1
  141.     #best_value = None
  142.     #best_subsetf = None
  143.     #best_subsett = None
  144.     best_criteria = None
  145.     best_sets = None
  146.     column_count = len(rows[0]) - 1
  147.     for col in range(column_count):
  148.         # Generate the list of different values in
  149.         # this column
  150.         column_values = {}
  151.         for row in rows:
  152.             column_values[row[col]] = 1
  153.         # Now try dividing the rows up for each value
  154.         # in this column
  155.         for value in column_values:
  156.             (set1, set2) = divideset(rows, col, value)
  157.  
  158.             # Information gain
  159.             p = float(len(set1)) / len(rows)
  160.             gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  161.  
  162.  
  163.             if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  164.                 best_gain = gain
  165.                 #best_column = col
  166.                 #best_value = value
  167.                 #best_subsett = set1
  168.                 #best_subsetf = set2
  169.                 best_criteria = (col, value)
  170.                 best_sets = (set1, set2)
  171.  
  172.     # Create the subbranches
  173.     if best_gain > 0:
  174.         trueBranch = buildtree(best_sets[0], scoref)
  175.  
  176.         falseBranch = buildtree(best_sets[1], scoref)
  177.         #print trueBranch
  178.         return decisionnode(col=best_criteria[0], value=best_criteria[1],
  179.                             tb=trueBranch, fb=falseBranch)
  180.     else:
  181.         return decisionnode(results=uniquecounts(rows))
  182.  
  183. def takeSecond(elem):
  184.     return elem[1]
  185.  
  186. def printtree(tree, indent=''):
  187.     # Is this a leaf node?
  188.     if tree.results != None:
  189.         print(str(tree.results))
  190.     else:
  191.         # Print the criteria
  192.         print(str(tree.col) + ':' + str(tree.value) + '? ')
  193.         # Print the branches
  194.         print(indent + 'T->'),
  195.         printtree(tree.tb, indent + '  ')
  196.         print(indent + 'F->'),
  197.         printtree(tree.fb, indent + '  ')
  198.  
  199.  
  200.  
  201. def classify(observation, tree):
  202.     if tree.results != None:
  203.         print  tree.results
  204.         return tree.results
  205.     else:
  206.         vrednost = observation[tree.col]
  207.         branch = None
  208.         print "Sporeduvam kolona i vrednost:",(tree.col,vrednost)
  209.         print "Tekovna vrednost:",tree.value
  210.  
  211.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  212.             if vrednost >= tree.value:
  213.                 branch = tree.tb
  214.                 print "Sledna granka za izminuvanje: True branch"
  215.             else:
  216.                 branch = tree.fb
  217.                 print "Sledna granka: False branch"
  218.         else:
  219.             if vrednost == tree.value:
  220.                 branch = tree.tb
  221.                 print "Sledna granka: True branch"
  222.             else:
  223.                 branch = tree.fb
  224.                 print "Sledna granka: False branch"
  225.  
  226.         print "Preostanata granka za izminuvanje:"
  227.         printtree(branch)
  228.         return classify(observation, branch)
  229.  
  230.  
  231. if __name__ == "__main__":
  232.     #referrer = input()
  233.     #location = input()
  234.     #readFAQ = input()
  235.     #pagesVisited = input()
  236.     serviceChosen = 'Unknown'
  237.  
  238.     referrer = 'google'
  239.     location = 'UK'
  240.     readFAQ = 'no'
  241.     pagesVisited = 22
  242.  
  243.     testCase = [referrer, location, readFAQ, pagesVisited, serviceChosen]
  244.  
  245.     t = buildtree(trainingData)
  246.     printtree(t)
  247.     classify(testCase, t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement