Advertisement
nanorocks

lab_python_SNZ_1_decision_trees

Nov 5th, 2017
379
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.93 KB | None | 0 0
  1. """
  2.  
  3. Задача 1 Problem 1 (2 / 12)
  4. Да се промени класата за дрво на одлука да чува и информација на кое ниво во дрвото се наоѓа јазолот. Потоа да се променат и функциите за градење и печатење на дрвото така што за секој јазол ќе се печати и нивото. Коренот е на нулто ниво. На излез со функцијата printTree треба да се испечати даденото тренинг множество. Прочитана инстанца од стандарден влез да се додаде на тренинг множеството и потоа да се истренира и испечати истото.
  5.  
  6.  
  7. """
  8.  
# Training set. Column schema (from the __main__ reader below):
# [referrer, location, readFAQ, pagesVisited, serviceChosen]
# The last column is the class label used by uniquecounts/entropy.
trainingData=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','yes',23,'Basic'],
        ['google','France','yes',23,'Basic'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]
  27.  
class decisionnode:
    """A node of the decision tree.

    Internal nodes carry a split criterion (col, value) and two subtrees;
    leaf nodes carry the class counts in `results` instead.
    """
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, l=None):
        # Index of the attribute this node tests (-1 for a bare node).
        self.col = col
        # Threshold (numeric) or nominal value the attribute is compared to.
        self.value = value
        # Dict of class counts; non-None only for leaf nodes.
        self.results = results
        # Branch followed when the test is true.
        self.tb = tb
        # Branch followed when the test is false.
        self.fb = fb
        # Level (depth) of the node in the tree; root is 0, None when unset.
        self.l = l
  36.  
  37.  
  38. def sporedi_broj(row, column, value):
  39.     return row[column] >= value
  40.  
  41.  
  42. def sporedi_string(row, column, value):
  43.     return row[column] == value
  44.  
  45.  
  46. # Divides a set on a specific column. Can handle numeric
  47. # or nominal values
  48. def divideset(rows, column, value):
  49.     # Make a function that tells us if a row is in
  50.     # the first group (true) or the second group (false)
  51.     split_function = None
  52.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  53.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  54.         split_function = sporedi_broj
  55.     else:
  56.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  57.         split_function = sporedi_string
  58.  
  59.     # Divide the rows into two sets and return them
  60.     set_false = []
  61.     set_true = []
  62.     for row in rows:
  63.         if split_function(row, column, value):
  64.             set_true.append(row)
  65.         else:
  66.             set_false.append(row)
  67.     set1 = [row for row in rows if
  68.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  69.     set2 = [row for row in rows if
  70.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  71.     # return (set1, set2)
  72.     return (set_true, set_false)
  73.  
  74.  
  75. # Create counts of possible results (the last column of
  76. # each row is the result)
  77. def uniquecounts(rows):
  78.     results = {}
  79.     for row in rows:
  80.         # The result is the last column
  81.         r = row[-1]
  82.         results.setdefault(r, 0)
  83.         results[r] += 1
  84.  
  85.     return results
  86.  
  87.  
  88. def log2(x):
  89.     from math import log
  90.     l2 = log(x) / log(2)
  91.     return l2
  92.  
  93.  
  94. def entropy(rows):
  95.     results = uniquecounts(rows)
  96.     # Now calculate the entropy
  97.     ent = 0.0
  98.     for r in results.keys():
  99.         p = float(results[r]) / len(rows)
  100.         ent = ent - p * log2(p)
  101.     return ent
  102.  
  103.  
  104. def buildtree(rows, l=-1, scoref=entropy):
  105.     if len(rows) == 0: return decisionnode()
  106.     current_score = scoref(rows)
  107.  
  108.     # Set up some variables to track the best criteria
  109.     best_gain = 0.0
  110.     best_column = -1
  111.     #best_value = None
  112.     #best_subsetf = None
  113.     #best_subsett = None
  114.     best_criteria = None
  115.     best_sets = None
  116.  
  117.  
  118.     column_count = len(rows[0]) - 1
  119.     for col in range(column_count):
  120.         # Generate the list of different values in
  121.         # this column
  122.         column_values = {}
  123.         for row in rows:
  124.             column_values[row[col]] = 1
  125.         # Now try dividing the rows up for each value
  126.         # in this column
  127.         for value in column_values:
  128.             (set1, set2) = divideset(rows, col, value)
  129.  
  130.             # Information gain
  131.             p = float(len(set1)) / len(rows)
  132.             gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  133.             if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  134.                 best_gain = gain
  135.                 #best_column = col
  136.                 #best_value = value
  137.                 #best_subsett = set1
  138.                 #best_subsetf = set2
  139.                 best_criteria = (col, value)
  140.                 best_sets = (set1, set2)
  141.  
  142.     # Create the subbranches
  143.     if best_gain > 0:
  144.         l = l+1
  145.         trueBranch = buildtree(best_sets[0],l, scoref)
  146.         falseBranch = buildtree(best_sets[1],l, scoref)
  147.         return decisionnode(col=best_criteria[0], value=best_criteria[1],
  148.                             tb=trueBranch, fb=falseBranch,l=l)
  149.     else:
  150.         return decisionnode(results=uniquecounts(rows))
  151.  
  152.  
  153.  
def printtree(tree, indent=''):
    # Pretty-print the tree (Python 2 print statements). Leaves print
    # their class counts; internal nodes print "col:value? Level=l"
    # followed by the indented T-> and F-> branches.
    # Is this a leaf node?
    if tree.results != None:
        print str(tree.results)
    else:
        # Print the criteria. In Python 2 this parses as the print
        # statement applied to the full concatenated expression,
        # so 'Level=l' is printed on the same line.
        print(str(tree.col) + ':' + str(tree.value) + '? ') + 'Level=' + str(tree.l)
        # Print the branches; the trailing comma suppresses the newline
        # so the recursive call continues on the same line.
        print(indent + 'T->'),
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->'),
        printtree(tree.fb, indent + '  ')
  166.  
  167.  
  168. def classify(observation, tree):
  169.     if tree.results != None:
  170.         return tree.results
  171.     else:
  172.         vrednost = observation[tree.col]
  173.         branch = None
  174.  
  175.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  176.             if vrednost >= tree.value:
  177.                 branch = tree.tb
  178.             else:
  179.                 branch = tree.fb
  180.         else:
  181.             if vrednost == tree.value:
  182.                 branch = tree.tb
  183.             else:
  184.                 branch = tree.fb
  185.  
  186.         return classify(observation, branch)
  187.  
  188.  
  189.  
  190. if __name__ == "__main__":
  191.     referrer=input()
  192.     location=input()
  193.     readFAQ=input()
  194.     pagesVisited=input()
  195.     serviceChosen=input()
  196.  
  197.     #referrer = 'google'
  198.     #location = 'UK'
  199.     #readFAQ = 'no',
  200.     #pagesVisited = 18
  201.     #serviceChosen = 'None'
  202.  
  203.  
  204.     tmp = [referrer,location,readFAQ,pagesVisited,serviceChosen]
  205.     trainingData.append(tmp)
  206.     t = buildtree(trainingData)
  207.  
  208.     printtree(t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement