Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- trainingData=[['slashdot','USA','yes',18,'None'],
- ['google','France','yes',23,'Premium'],
- ['google','France','yes',23,'Basic'],
- ['google','France','yes',23,'Basic'],
- ['digg','USA','yes',24,'Basic'],
- ['kiwitobes','France','yes',23,'Basic'],
- ['google','UK','no',21,'Premium'],
- ['(direct)','New Zealand','no',12,'None'],
- ['(direct)','UK','no',21,'Basic'],
- ['google','USA','no',24,'Premium'],
- ['slashdot','France','yes',19,'None'],
- ['digg','USA','no',18,'None'],
- ['google','UK','no',18,'None'],
- ['kiwitobes','UK','no',19,'None'],
- ['digg','New Zealand','yes',12,'Basic'],
- ['slashdot','UK','no',21,'None'],
- ['google','UK','yes',18,'Basic'],
- ['kiwitobes','France','yes',19,'Basic']]
class decision_node:
    """A single node of the decision tree.

    All constructor arguments are stored unchanged as attributes of the
    same name:

    col     -- index of the column tested at this node (default -1)
    value   -- value the column is compared against
    results -- class-count dictionary for a leaf, otherwise None
    tb, fb  -- subtrees followed when the test is true / false
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split criterion.
        self.col, self.value = col, value
        # Leaf payload.
        self.results = results
        # Child nodes.
        self.tb, self.fb = tb, fb
def compare_number(row, column, value):
    """Return True when the entry of ``row`` at ``column`` is >= ``value``."""
    cell = row[column]
    return cell >= value
def compare_string(row, column, value):
    """Return True when the entry of ``row`` at ``column`` equals ``value``."""
    cell = row[column]
    return cell == value
def divide_set(rows, column, value):
    """Partition ``rows`` into two lists by testing one column.

    When ``value`` is an int or a float a row passes the test if its entry
    in ``column`` is >= ``value``; for any other type the entry has to be
    equal to ``value``.

    Returns a tuple ``(set_true, set_false)`` holding the rows that passed
    and the rows that failed, each in their original order.
    """
    # Choose the membership test from the type of the split value.
    if isinstance(value, (int, float)):
        def belongs(row):
            return row[column] >= value
    else:
        def belongs(row):
            return row[column] == value

    set_true, set_false = [], []
    for row in rows:
        target = set_true if belongs(row) else set_false
        target.append(row)
    return (set_true, set_false)
def unique_counts(rows):
    """Count how many times each class appears in ``rows``.

    The class of a row is its last element. Returns a dictionary that maps
    every class found to its number of occurrences.
    """
    tally = {}
    for record in rows:
        label = record[-1]  # the class is stored in the last column
        tally[label] = tally.get(label, 0) + 1
    return tally
def entrophy(rows):
    """Return the base-2 entropy of the class distribution of ``rows``.

    Class frequencies come from ``unique_counts``. The result is the sum of
    ``-p * log2(p)`` over all classes, where ``p`` is the fraction of rows
    belonging to the class. An empty ``rows`` yields 0.0.
    """
    from math import log

    total = len(rows)
    entropy = 0.0
    for occurrences in unique_counts(rows).values():
        share = float(occurrences) / total
        # log(x) / log(2) is the base-2 logarithm of x
        entropy = entropy - share * (log(share) / log(2))
    return entropy
def buildtree(rows, score_function=entrophy):
    """Recursively build a decision tree for ``rows``.

    Every distinct value of every column except the last one (the class
    column) is tried as a split. The split with the highest information
    gain, as measured by ``score_function``, is kept and both resulting
    subsets are processed recursively with the same ``score_function``.

    rows           -- list of records; the last element of each is its class
    score_function -- callable taking a list of rows and returning their
                      impurity (defaults to ``entrophy``)

    Returns a ``decision_node``: an empty node for empty input, an internal
    node when a split with positive gain exists, otherwise a leaf whose
    ``results`` holds the class counts of ``rows``.
    """
    if len(rows) == 0:
        return decision_node()

    current_score = score_function(rows)

    # Track the best split criterion found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # The last column is the class, so it is never used as a split column.
    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Distinct values of this column. A dict is used, as before, so the
        # order in which candidates are tried (and ties resolved) is unchanged.
        column_values = dict.fromkeys(row[col] for row in rows)
        for value in column_values:
            set_true, set_false = divide_set(rows, col, value)
            # A split that leaves one side empty separates nothing; skip it
            # before spending time on scoring.
            if not set_true or not set_false:
                continue
            # Fraction of the rows that satisfy the candidate test.
            p = float(len(set_true)) / len(rows)
            gain = (current_score
                    - p * score_function(set_true)
                    - (1 - p) * score_function(set_false))
            if gain > best_gain:
                # Best split so far, remember it.
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set_true, set_false)

    if best_gain > 0:
        # A useful split exists. Pass score_function on explicitly so the
        # subtrees are built with the caller's measure rather than the default.
        true_branch = buildtree(best_sets[0], score_function)
        false_branch = buildtree(best_sets[1], score_function)
        return decision_node(col=best_criteria[0], value=best_criteria[1],
                             tb=true_branch, fb=false_branch)

    # No split improves the score: make a leaf that records how often each
    # class occurred among the training rows that reached it.
    return decision_node(results=unique_counts(rows))
def print_tree(tree, indent=' '):
    """Write a text rendering of ``tree`` to standard output.

    A leaf is printed as its ``results`` dictionary. An internal node is
    printed as ``<col>:<value>? `` followed by its true branch (prefixed
    with ``T->``) and its false branch (prefixed with ``F->``), each one
    level deeper.

    tree   -- node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``
    indent -- prefix written before the branch markers of this node; one
              extra space is appended for every level of recursion
    """
    # sys.stdout.write is used instead of the print statement so the
    # function produces the same output on Python 2 and Python 3.
    import sys
    write = sys.stdout.write

    # Leaf: show the class counts.
    if tree.results is not None:
        write(str(tree.results) + "\n")
        return

    # Internal node: show the condition (column index and value) ...
    write(str(tree.col) + ":" + str(tree.value) + "? " + "\n")
    # ... then the true branch followed by the false branch. The marker
    # and the subtree's first line share one line, separated by a space.
    write(indent + "T->" + " ")
    print_tree(tree.tb, indent + " ")
    write(indent + "F->" + " ")
    print_tree(tree.fb, indent + " ")
def classify(observation, tree):
    """Predict the class of ``observation`` using a decision tree.

    Starting at ``tree``, the column named by each internal node is read
    from ``observation``. An int or float entry follows the true branch
    when it is >= the node's value; any other entry follows the true branch
    when it equals the node's value. Otherwise the false branch is taken.

    At the leaf the class with the highest count in ``results`` is
    returned. If several classes share the highest count, the one that
    comes first when iterating the dictionary wins. An empty ``results``
    dictionary yields ``""``.

    observation -- sequence indexed by column number
    tree        -- node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``
    """
    # Descend until a leaf (a node carrying results) is reached.
    while tree.results is None:
        observed = observation[tree.col]
        if isinstance(observed, (int, float)):
            matched = observed >= tree.value
        else:
            matched = observed == tree.value
        tree = tree.tb if matched else tree.fb

    counts = tree.results
    if not counts:
        return ""
    # Pick the most frequent class. The previous hand-written scan started
    # from the first key's count without remembering that key, so it
    # returned "" whenever the first class was the most frequent one.
    return max(counts, key=counts.get)
if __name__ == "__main__":
    # Reads one observation (five lines) from standard input, trains the
    # tree on trainingData and prints the predicted class.
    import ast

    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    def _read_value():
        """Read one line from stdin and return it as a Python value.

        A line holding a Python literal (for example 'slashdot' or 21) is
        converted with ast.literal_eval; anything else is returned as the
        stripped text.
        """
        # NOTE(review): Python 2's input() evaluated the line as arbitrary
        # code. literal_eval accepts the same quoted strings and numbers
        # without executing anything, and behaves the same on Python 3.
        text = read_line().strip()
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return text

    # Example input, one value per line:
    #   'slashdot' / 'UK' / 'no' / 21 / 'Unknown'
    referrer = _read_value()
    location = _read_value()
    readFAQ = _read_value()
    pagesVisited = _read_value()
    serviceChosen = _read_value()

    testCase = [referrer, location, readFAQ, pagesVisited, serviceChosen]
    t = buildtree(trainingData)
    print(classify(testCase, t))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement