Advertisement
Guest User

snz_lab_02_zad_02

a guest
Oct 28th, 2016
214
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.87 KB | None | 0 0
  1. trainingData=[['slashdot','USA','yes',18,'None'],
  2.         ['google','France','yes',23,'Premium'],
  3.         ['google','France','yes',23,'Basic'],
  4.         ['google','France','yes',23,'Basic'],
  5.         ['digg','USA','yes',24,'Basic'],
  6.         ['kiwitobes','France','yes',23,'Basic'],
  7.         ['google','UK','no',21,'Premium'],
  8.         ['(direct)','New Zealand','no',12,'None'],
  9.         ['(direct)','UK','no',21,'Basic'],
  10.         ['google','USA','no',24,'Premium'],
  11.         ['slashdot','France','yes',19,'None'],
  12.         ['digg','USA','no',18,'None'],
  13.         ['google','UK','no',18,'None'],
  14.         ['kiwitobes','UK','no',19,'None'],
  15.         ['digg','New Zealand','yes',12,'Basic'],
  16.         ['slashdot','UK','no',21,'None'],
  17.         ['google','UK','yes',18,'Basic'],
  18.         ['kiwitobes','France','yes',19,'Basic']]
  19.  
  20.  
  21. class decision_node:
  22.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  23.         self.col = col
  24.         self.value = value
  25.         self.results = results
  26.         self.tb = tb
  27.         self.fb = fb
  28.  
  29.  
  30. def compare_number(row, column, value):
  31.     return row[column] >= value
  32.  
  33.  
  34. def compare_string(row, column, value):
  35.     return row[column] == value
  36.  
  37.  
  38. def divide_set(rows, column, value):
  39.     # Make a function that tells us if a row is in
  40.     # the first group (true) or the second group (false)
  41.     split_function = None
  42.  
  43.     if isinstance(value, int) or isinstance(value, float):
  44.         # ako vrednosta so koja sporeduvame e od tip int ili float
  45.         split_function = compare_number
  46.     else:
  47.         split_function = compare_string
  48.  
  49.     # Divide the rows into two sets and return them
  50.     set_true = []
  51.     set_false = []
  52.  
  53.     for row in rows:
  54.         if split_function(row, column, value):
  55.             set_true.append(row)
  56.         else:
  57.             set_false.append(row)
  58.  
  59.     return (set_true, set_false)
  60.  
  61.  
  62. def unique_counts(rows):
  63.     # count unque classes
  64.     # and how many times each class apears
  65.     #
  66.     # class is last column in the row, in this example (basic, none, or premium)
  67.     result = {}
  68.  
  69.     for row in rows:
  70.         clas = row[len(row) - 1]    # current class
  71.         if clas not in result:
  72.             result[clas] = 1  # add class in the dictionary and asociate value of 1 ocourance
  73.         else:
  74.             result[clas] += 1      # increment the number of ocurances by +1
  75.  
  76.     return result
  77.  
  78.  
  79. def entrophy(rows):
  80.     from math import log
  81.  
  82.     log2 = lambda x: log(x) / log(2)
  83.     results = unique_counts(rows)       # result is dictionary{key, value}
  84.  
  85.     # now calculate the entrophy
  86.     ent = 0.0
  87.  
  88.     for r in results.keys():
  89.         pi = float(results[r]) / len(rows)
  90.         ent = ent - pi * log2(pi)
  91.  
  92.     return ent
  93.  
  94.  
  95. def buildtree(rows, score_function = entrophy):
  96.     if len(rows) == 0:
  97.         return decision_node()
  98.  
  99.     # score function is entrophy
  100.     # declared in method definition
  101.     current_score = score_function(rows)
  102.  
  103.     # promenlivi so koi sledime koj krietrium e najdobar
  104.     best_gain = 0.0
  105.     best_criteria = None
  106.     best_sets = None
  107.  
  108.     column_count = len(rows[0]) - 1
  109.  
  110.     for col in range(0, column_count):
  111.         # za sekoja kolona (col se dvizi od 0 - col_count -1)
  112.         # generiranje na recnik {key, value} od vrednosti vo ovaa kolona
  113.         column_values = {}
  114.  
  115.         for row in rows:
  116.             # za sekoja redica se zema vrednsota vo tekovnata kolona
  117.             # i se postavuva za kluc vo column_values
  118.             column_values[row[col]] = 1
  119.  
  120.         for value in column_values.keys():
  121.             # za sekoja razlicna vrednost na klucot (kolona)
  122.             # se pravi podelba na mnozestvo
  123.             (set_true, set_false) = divide_set(rows, col, value)
  124.  
  125.             # presmetaj dobivka za konkretnata podelba
  126.             p = float(len(set_true)) / len(rows)        # verojatnost da se pogodi tekonvata vrednosta na ovoj atribut
  127.             gain = current_score - p * score_function(set_true) - (1 - p) * score_function(set_false)
  128.  
  129.             if gain > best_gain and len(set_true) > 0 and len(set_false) > 0:
  130.                 # ovaa podelba e najdobra do sega pa ke ja zachuvame
  131.                 best_gain = gain
  132.                 best_criteria = (col, value)
  133.                 best_sets = (set_true, set_false)
  134.  
  135.     # create subtrees
  136.     if best_gain > 0:
  137.         # bidejki dobivkata > 0 mozna e druga podelba
  138.         true_branch = buildtree(best_sets[0])
  139.         false_branch = buildtree(best_sets[1])
  140.         return decision_node(col = best_criteria[0], value = best_criteria[1],
  141.                              tb = true_branch, fb = false_branch)
  142.     else:
  143.         # ne e mozna druga podelba bidejki site instaci vo ova mnozestvo ja imaat istata klasa
  144.         # rezultatot sto ke se vrati ke sodrzi recnik so kluc klasata koja se predvidva
  145.         # vo ovoj list i vrednost kolku pati bila sretnata vo trening mnozestvoto
  146.         return decision_node(results = unique_counts(rows))
  147.  
  148.  
  149. def print_tree(tree, indent = ' '):
  150.     # ako ova e list
  151.     if tree.results != None:
  152.         print str(tree.results)
  153.     else:
  154.         # se pecatu uslovod (indeks na kolonata i vrednost)
  155.         print str(tree.col) + ":" + str(tree.value)+ "? "
  156.  
  157.         # se pecatat True grabkite pa potoa False grankite
  158.         print indent + "T->",
  159.         print_tree(tree.tb, indent + "   ")
  160.         print indent + "F->",
  161.         print_tree(tree.fb, indent + "   ")
  162.  
  163.  
  164. def classify(observation,tree):
  165.     if tree.results!=None:
  166.         result = ""
  167.         dictionary = tree.results
  168.         keys = dictionary.keys()
  169.  
  170.         if len(keys) > 1:
  171.             # find key with greatest value
  172.             max_value = dictionary[keys[0]]
  173.             for key in keys:
  174.                 current = dictionary[key]
  175.                 if current > max_value :
  176.                     max_value = current
  177.                     result = key
  178.         else:
  179.             result = keys[0]
  180.  
  181.         # return tree.results
  182.         # print "dictionary " + str(dictionary)
  183.         # print "keys" + str(keys)
  184.         return result
  185.     else:
  186.         vrednost = observation[tree.col]
  187.         branch = None
  188.  
  189.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  190.             if vrednost >= tree.value:
  191.                 branch = tree.tb
  192.             else:
  193.                 branch = tree.fb
  194.         else:
  195.             if vrednost == tree.value:
  196.                 branch=tree.tb
  197.             else:
  198.                 branch = tree.fb
  199.  
  200.         return classify(observation, branch)
  201.  
  202.  
  203. if __name__ == "__main__":
  204.     # referrer='slashdot'
  205.     # location='UK'
  206.     # readFAQ='no'
  207.     # pagesVisited=21
  208.     # serviceChosen='Unknown'
  209.  
  210.     referrer=input()
  211.     location=input()
  212.     readFAQ=input()
  213.     pagesVisited=input()
  214.     serviceChosen=input()
  215.  
  216.     testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
  217.  
  218.     t=buildtree(trainingData)
  219.     print classify(testCase,t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement