Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
Modify the prediction function so that, while traversing the tree, it prints:
- the column and value that the sample is being compared against
- the test sample's current value in that column
- which branch of the tree is followed next (True branch or False branch)
- the remaining part of the tree that still has to be traversed
- an empty line
Then print the trained tree, read an unknown sample from standard input and
classify it with the new prediction function.
- """
# Labelled rows used to train the tree. Columns, in order:
# referrer, location, read-FAQ flag, pages visited, service chosen.
# The last column is the class label.
trainingData = [
    ['twitter', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['google', 'France', 'no', 26, 'Basic'],
    ['google', 'Macedonia', 'yes', 13, 'None'],
    ['pinterest', 'USA', 'yes', 24, 'Basic'],
    ['bing', 'France', 'yes', 23, 'Basic'],
    ['google', 'UK', 'no', 21, 'Premium'],
    ['facebook', 'New Zealand', 'no', 12, 'None'],
    ['facebook', 'UK', 'no', 21, 'Basic'],
    ['google', 'USA', 'no', 24, 'Premium'],
    ['twitter', 'France', 'yes', 19, 'None'],
    ['pinterest', 'USA', 'no', 18, 'None'],
    ['google', 'UK', 'no', 18, 'None'],
    ['bing', 'UK', 'yes', 19, 'Premium'],
    ['bing', 'Macedonia', 'no', 10, 'None'],
    ['facebook', 'Macedonia', 'no', 16, 'Basic'],
    ['bing', 'UK', 'no', 19, 'Basic'],
    ['pinterest', 'Germany', 'no', 2, 'None'],
    ['pinterest', 'USA', 'yes', 12, 'Basic'],
    ['twitter', 'UK', 'no', 21, 'None'],
    ['twitter', 'UK', 'yes', 26, 'Premium'],
    ['google', 'UK', 'yes', 18, 'Basic'],
    ['bing', 'France', 'yes', 19, 'Basic'],
]

# Sample observations with the same column layout; the label column holds
# the placeholder 'Unknown'.
test_cases = [
    ['google', 'MK', 'no', 24, 'Unknown'],
    ['google', 'MK', 'no', 15, 'Unknown'],
    ['pinterest', 'UK', 'yes', 21, 'Unknown'],
    ['pinterest', 'UK', 'no', 25, 'Unknown'],
]

# Disabled alternative that loads the rows from a tab-separated file:
# trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]
class decisionnode:
    """A single node of a binary decision tree.

    An internal node carries the split test (``col``, ``value``) together
    with its two children: ``tb`` is followed when the test passes and
    ``fb`` when it fails. A leaf carries ``results`` (label counts) instead.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split criterion: index of the tested column and the value it is
        # compared with.
        self.col, self.value = col, value
        # Label counts for a leaf; stays None on internal nodes.
        self.results = results
        # Child nodes for the "true" and "false" outcome of the test.
        self.tb, self.fb = tb, fb
def sporedi_broj(row, column, value):
    """Numeric split test: True when ``row[column]`` is at least ``value``."""
    cell = row[column]
    return cell >= value
def sporedi_string(row, column, value):
    """Nominal split test: True when ``row[column]`` equals ``value``."""
    cell = row[column]
    return cell == value
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Split ``rows`` into two lists according to a test on one column.

    When ``value`` is an int or float the test is ``row[column] >= value``;
    for any other type it is ``row[column] == value``.

    Args:
        rows: iterable of indexable rows.
        column: index of the column to test.
        value: the value each row's entry is compared with.

    Returns:
        A tuple ``(set_true, set_false)``: the rows that pass the test and
        the rows that fail it, each in their original relative order.
    """
    # The kind of comparison depends only on ``value``, so decide it once
    # instead of once per row.
    numeric = isinstance(value, (int, float))

    set_true = []
    set_false = []
    # Single pass over the rows; each row lands in exactly one of the lists.
    for row in rows:
        if numeric:
            passed = row[column] >= value
        else:
            passed = row[column] == value
        if passed:
            set_true.append(row)
        else:
            set_false.append(row)
    return (set_true, set_false)
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Count how many rows carry each label.

    The label of a row is its last element. Returns a plain dict mapping
    label -> number of occurrences, with keys in first-seen order.
    """
    counts = {}
    for row in rows:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts
# Base-2 logarithm, built from natural logarithms
def log2(x):
    """Return the logarithm of ``x`` to base 2, computed as ln(x) / ln(2)."""
    from math import log
    return log(x) / log(2)
# Entropy is minus the sum of p(x) * log2(p(x)) across all
# the different possible results
def entropy(rows):
    """Return the entropy (in bits) of the label distribution of ``rows``.

    Labels are tallied with ``uniquecounts``. An empty ``rows`` has no
    labels, so the result is 0.0.
    """
    ent = 0.0
    for count in uniquecounts(rows).values():
        # Share of the rows that carry this label.
        p = float(count) / len(rows)
        ent -= p * log2(p)
    return ent
def buildtree(rows, scoref=entropy):
    """Recursively grow a decision tree from labelled ``rows``.

    Every column except the last (the label) and every distinct value in
    that column is tried as a split. The split with the largest information
    gain, measured with ``scoref``, is kept, and the procedure recurses on
    both halves.

    Args:
        rows: list of rows whose last element is the label.
        scoref: function mapping a list of rows to an impurity score.

    Returns:
        A ``decisionnode``. It is an internal node when some split has
        positive gain, otherwise a leaf holding the label counts. For empty
        ``rows`` it is a default-constructed node.
    """
    if len(rows) == 0:
        return decisionnode()

    base_score = scoref(rows)

    # Best split seen so far.
    top_gain = 0.0
    top_column = -1
    top_value = None
    top_true = None
    top_false = None

    # The last column is the label, so it is never a split candidate.
    feature_count = len(rows[0]) - 1
    for col in range(feature_count):
        # Every distinct value found in this column is a candidate split.
        candidates = {row[col] for row in rows}
        for value in candidates:
            passed, failed = divideset(rows, col, value)
            # Information gain: parent score minus the size-weighted
            # scores of the two halves.
            share = float(len(passed)) / len(rows)
            gain = base_score - share * scoref(passed) - (1 - share) * scoref(failed)
            # Strict comparison: on equal gain the earlier candidate wins.
            if len(passed) > 0 and len(failed) > 0 and gain > top_gain:
                top_gain = gain
                top_column, top_value = col, value
                top_true, top_false = passed, failed

    if top_gain > 0:
        true_child = buildtree(top_true, scoref)
        false_child = buildtree(top_false, scoref)
        return decisionnode(col=top_column, value=top_value,
                            tb=true_child, fb=false_child)

    # No split improves the score: stop and record the label counts.
    return decisionnode(results=uniquecounts(rows))
def printtree(tree, indent=''):
    """Print a text rendering of ``tree`` to standard output.

    A leaf is printed as the sorted list of its ``(label, count)`` pairs.
    An internal node is printed as ``<col>:<value>? `` followed by its true
    branch under ``T->`` and its false branch under ``F->``, each one level
    deeper.

    Args:
        tree: node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``.
        indent: prefix written before every line of this subtree.
    """
    # Leaf: identity test against None, so an empty results dict still
    # counts as a leaf.
    if tree.results is not None:
        print(indent + str(sorted(tree.results.items())))
        return

    # Internal node: the split criterion first, then both subtrees.
    print(indent + str(tree.col) + ':' + str(tree.value) + '? ')
    print(indent + 'T->')
    printtree(tree.tb, indent + ' ')
    print(indent + 'F->')
    printtree(tree.fb, indent + ' ')
def classify(observation, tree):
    """Walk ``tree`` with ``observation`` and return the reached leaf's results.

    At each internal node the observation's entry in the node's column is
    tested against the node's value. Int and float entries use ``>=`` and
    everything else uses ``==``. A passing test follows ``tb``, a failing
    one follows ``fb``.

    Args:
        observation: indexable row of feature values.
        tree: root node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``.

    Returns:
        The ``results`` object stored on the leaf that was reached.
    """
    node = tree
    # Iterative descent, so tree depth is not bounded by the recursion limit.
    while node.results is None:
        vrednost = observation[node.col]
        if isinstance(vrednost, (int, float)):
            passed = vrednost >= node.value
        else:
            passed = vrednost == node.value
        node = node.tb if passed else node.fb
    return node.results
def classify2(observation, tree):
    """Classify ``observation`` and print a trace of the path through ``tree``.

    For every internal node visited this prints the column/value being
    compared, the observation's value in that column, which branch is taken
    next, the subtree that remains to be traversed (via ``printtree``) and
    an empty line.

    Args:
        observation: indexable row of feature values.
        tree: root node exposing ``results``, ``col``, ``value``, ``tb``, ``fb``.

    Returns:
        The label with the highest count on the reached leaf. Ties are
        resolved in favour of the smallest label.
    """
    node = tree
    while node.results is None:
        vrednost = observation[node.col]
        # Int and float entries are split with >=, everything else with ==.
        if isinstance(vrednost, (int, float)):
            passed = vrednost >= node.value
        else:
            passed = vrednost == node.value

        if passed:
            branch, granka = node.tb, 'True branch'
        else:
            branch, granka = node.fb, 'False branch'

        print('Sporeduvam kolona i vrednost', (node.col, node.value))
        print('Tekovna vrednost:', vrednost)
        print('Sledna granka', granka)
        print('Preostanata granka za izminuvanje:')
        printtree(branch)
        print()
        node = branch

    # Pick the majority label. Sorting (count, label) ascending and taking
    # the first entry would return the least frequent label instead.
    label, _count = min(node.results.items(),
                        key=lambda item: (-item[1], item[0]))
    return label
if __name__ == "__main__":
    # The observation to classify; the label column is a placeholder.
    # NOTE(review): the task statement in the module docstring asks for the
    # sample to be read from standard input, but the values are hard-coded
    # here. Confirm which is intended.
    referrer, location, readFAQ = 'google', 'USA', 'no'
    pagesVisited = 25
    serviceChosen = 'Unknown'
    testCase = [referrer, location, readFAQ, pagesVisited, serviceChosen]

    # Train on the bundled data, show the tree, then trace the prediction.
    t = buildtree(trainingData)
    printtree(t)
    print(classify2(testCase, t))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement