Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Labelled sample rows used to grow the decision tree.
# Column order: referrer, location, readFAQ, pagesVisited, serviceChosen.
# The last column is the class label that the tree learns to predict.
trainingData = [
    ['slashdot', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['google', 'France', 'yes', 23, 'Basic'],
    ['google', 'France', 'yes', 23, 'Basic'],
    ['digg', 'USA', 'yes', 24, 'Basic'],
    ['kiwitobes', 'France', 'yes', 23, 'Basic'],
    ['google', 'UK', 'no', 21, 'Premium'],
    ['(direct)', 'New Zealand', 'no', 12, 'None'],
    ['(direct)', 'UK', 'no', 21, 'Basic'],
    ['google', 'USA', 'no', 24, 'Premium'],
    ['slashdot', 'France', 'yes', 19, 'None'],
    ['digg', 'USA', 'no', 18, 'None'],
    ['google', 'UK', 'no', 18, 'None'],
    ['kiwitobes', 'UK', 'no', 19, 'None'],
    ['digg', 'New Zealand', 'yes', 12, 'Basic'],
    ['slashdot', 'UK', 'no', 21, 'None'],
    ['google', 'UK', 'yes', 18, 'Basic'],
    ['kiwitobes', 'France', 'yes', 19, 'Basic'],
]
class decisionnode:
    """One node of a binary decision tree.

    An internal node tests column ``col`` against ``value`` and continues in
    ``tb`` (test true) or ``fb`` (test false). A leaf instead carries
    ``results``, the label counts of the rows that reached it.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, nivo=None):
        self.col = col          # index of the column tested at this node
        self.value = value      # value the column is compared against
        self.results = results  # label -> count mapping; None on internal nodes
        self.tb = tb            # subtree followed when the test is true
        self.fb = fb            # subtree followed when the test is false
        self.nivo = nivo        # depth level of the node ("nivo" = level)

    def __repr__(self):
        return 'decisionnode(col={!r}, value={!r}, results={!r}, nivo={!r})'.format(
            self.col, self.value, self.results, self.nivo)
def sporedi_broj(row, column, value):
    """Numeric split test ("sporedi broj" = compare number).

    Returns True when ``row[column]`` is greater than or equal to ``value``.
    """
    return row[column] >= value
def sporedi_string(row, column, value):
    """Nominal split test ("sporedi string" = compare string).

    Returns True when ``row[column]`` equals ``value``.
    """
    return row[column] == value
def divideset(rows, column, value):
    """Split ``rows`` into two lists by testing ``column`` against ``value``.

    When ``value`` is an int or float the test is ``row[column] >= value``;
    for any other type (e.g. a string) the test is ``row[column] == value``.

    Returns a tuple ``(set1, set2)``: rows passing the test, rows failing it.
    """
    # The type of the split value decides which comparison is used.
    if isinstance(value, (int, float)):
        split_function = sporedi_broj
    else:
        split_function = sporedi_string

    # Rows for which the test is true / false, in their original order.
    set1 = [row for row in rows if split_function(row, column, value)]
    set2 = [row for row in rows if not split_function(row, column, value)]
    return (set1, set2)
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset2(rows, column, value):
    """Loop-based variant of ``divideset``.

    Numeric ``value`` (int or float) splits with ``row[column] >= value``;
    any other type splits with ``row[column] == value``.

    Returns a tuple ``(set1, set2)``: rows passing the test, rows failing it.
    """
    # Pick the function that tells whether a row belongs to the first
    # group (true) or the second group (false).
    if isinstance(value, (int, float)):
        split_function = sporedi_broj
    else:
        split_function = sporedi_string

    # Single pass: each row is tested once and routed to its group.
    set1 = []
    set2 = []
    for row in rows:
        if split_function(row, column, value):
            set1.append(row)
        else:
            set2.append(row)
    return (set1, set2)
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Count how many rows carry each label.

    The label of a row is its last element. Returns a dict mapping each
    label to its number of occurrences; an empty input gives an empty dict.
    """
    results = {}
    for row in rows:
        label = row[-1]  # the result is the last column
        results[label] = results.get(label, 0) + 1
    return results
def uniquecounts2(rows):
    """Variant of ``uniquecounts`` built on ``dict.setdefault``.

    Returns a dict mapping each label (the last element of a row) to the
    number of rows carrying it.
    """
    results = {}
    for row in rows:
        label = row[-1]  # the result is the last column
        results.setdefault(label, 0)
        results[label] += 1
    return results
def entropy(rows):
    """Return the Shannon entropy (in bits) of the labels in ``rows``.

    Computed as the sum of ``-p * log2(p)`` over the label frequencies
    reported by ``uniquecounts``. An empty ``rows`` yields ``0.0`` because
    there are no labels to sum over.
    """
    from math import log

    ln2 = log(2)  # hoisted: base-2 logarithm is log(x) / log(2)
    total = len(rows)
    ent = 0.0
    for count in uniquecounts(rows).values():
        p = count / total
        ent = ent - p * (log(p) / ln2)
    return ent
def entropy2(rows):
    """Variant of ``entropy`` that uses ``math.log2`` directly.

    Returns the sum of ``-p * log2(p)`` over the label frequencies reported
    by ``uniquecounts``; ``0.0`` when ``rows`` is empty.
    """
    from math import log2

    total = len(rows)
    ent = 0.0
    for count in uniquecounts(rows).values():
        p = count / total
        ent = ent - p * log2(p)
    return ent
def buildtree(rows, scoref=entropy, nivo=0):
    """Recursively grow a decision tree from ``rows``.

    Every column except the last (the label) is tried with every distinct
    value it takes; the split with the highest information gain under
    ``scoref`` is kept and both halves are built recursively.

    Args:
        rows: list of rows; the last element of each row is its label.
        scoref: impurity function taking a list of rows (default ``entropy``).
        nivo: depth level stored on the created node ("nivo" = level).

    Returns:
        A ``decisionnode``: an internal node when a split with positive gain
        exists, otherwise a leaf holding ``uniquecounts(rows)``. Empty
        ``rows`` gives a default ``decisionnode()``.
    """
    if not rows:
        return decisionnode()

    current_score = scoref(rows)
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    column_count = len(rows[0]) - 1  # last column is the label
    for col in range(column_count):
        # Distinct values of this column in first-seen order; a dict (not a
        # set) keeps tie-breaking between equal gains deterministic.
        column_values = dict.fromkeys(row[col] for row in rows)
        for value in column_values:
            set1, set2 = divideset(rows, col, value)
            # A split that leaves one side empty can never be selected,
            # so do not bother scoring it.
            if not set1 or not set2:
                continue
            # Information gain of this split
            p = len(set1) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    if best_gain > 0:
        # Forward scoref so subtrees are scored with the same function as
        # the root (previously the default was silently used below the root).
        trueBranch = buildtree(best_sets[0], scoref=scoref, nivo=nivo + 1)
        falseBranch = buildtree(best_sets[1], scoref=scoref, nivo=nivo + 1)
        # This node keeps the current level; only its children get +1.
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch, nivo=nivo)
    return decisionnode(results=uniquecounts(rows), nivo=nivo)
def buildtree2(rows, scoref=entropy):
    """Verbose variant of ``buildtree`` that traces the root-level search.

    For every candidate ``(column, value)`` split it prints the column
    index, the value, the information gain and the sizes of both halves,
    then prints the best gain, column and value found. The two subtrees are
    built with ``buildtree`` (not ``buildtree2``), so only this top level is
    traced and the returned root has ``nivo=None``.

    Args:
        rows: list of rows; the last element of each row is its label.
        scoref: impurity function taking a list of rows (default ``entropy``).

    Returns:
        A ``decisionnode``: an internal node when a split with positive gain
        exists, otherwise a leaf holding ``uniquecounts(rows)``. Empty
        ``rows`` gives a default ``decisionnode()``.
    """
    if not rows:
        return decisionnode()

    current_score = scoref(rows)

    # Track the best split seen so far.
    best_gain = 0.0
    best_col = None
    best_value = None
    best_sets = None

    column_count = len(rows[0]) - 1  # last column is the label
    for col in range(column_count):
        # Distinct values of this column, in first-seen order.
        column_values = dict.fromkeys(row[col] for row in rows)
        # Try dividing the rows on each value of this column.
        for value in column_values:
            set1, set2 = divideset(rows, col, value)
            # Information gain of this split
            p = len(set1) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            print(col, value, gain, len(set1), len(set2))
            if gain > best_gain and set1 and set2:
                best_gain = gain
                best_col = col
                best_value = value
                best_sets = (set1, set2)
        # NOTE(review): the original indentation was lost; the blank
        # separator line is assumed to follow each column's candidates.
        print()

    print(best_gain, best_col, best_value)

    # Create the subbranches
    if best_gain > 0:
        trueBranch = buildtree(best_sets[0], scoref=scoref)
        falseBranch = buildtree(best_sets[1], scoref=scoref)
        return decisionnode(col=best_col, value=best_value,
                            tb=trueBranch, fb=falseBranch)
    return decisionnode(results=uniquecounts(rows))
def printtree(tree, indent=''):
    """Print ``tree`` to stdout as an indented text outline.

    A leaf prints its ``results`` mapping. An internal node prints
    ``col:value? Level=nivo`` followed by its true branch (prefixed
    ``T->``) and its false branch (prefixed ``F->``), each nested one
    indent step deeper.

    Args:
        tree: node exposing ``results``, ``col``, ``value``, ``nivo``,
            ``tb`` and ``fb`` attributes.
        indent: prefix written before the branch markers of this node.
    """
    if tree.results is not None:
        # Leaf: show the label counts.
        print(str(tree.results))
        return

    # Internal node: show the test, then both branches.
    print(str(tree.col) + ':' + str(tree.value) + '? Level=' + str(tree.nivo))
    # end=' ' keeps the subtree's first line on the same line as the marker.
    print(indent + 'T->', end=' ')
    printtree(tree.tb, indent + ' ')
    print(indent + 'F->', end=' ')
    printtree(tree.fb, indent + ' ')
- #t=buildtree2(my_data)
def classify(observation, tree):
    """Route ``observation`` down ``tree`` and return the leaf's results.

    At each internal node the observation's value in column ``tree.col`` is
    compared with ``tree.value``: with ``>=`` when the split value is an int
    or float, with ``==`` otherwise. This is the same rule ``divideset``
    applies when the tree is built. The true branch (``tb``) is followed
    when the test holds, the false branch (``fb``) when it does not.

    Args:
        observation: sequence indexable by the column indices in the tree.
        tree: root node exposing ``results``, ``col``, ``value``, ``tb``
            and ``fb`` attributes.

    Returns:
        The ``results`` object stored on the leaf that was reached.
    """
    node = tree
    while node.results is None:
        vrednost = observation[node.col]  # "vrednost" = value
        # The comparison kind follows the type of the split value so that
        # classification stays consistent with how the split was made.
        if isinstance(node.value, (int, float)):
            passed = vrednost >= node.value
        else:
            passed = vrednost == node.value
        node = node.tb if passed else node.fb
    return node.results
if __name__ == "__main__":
    # Read one extra labelled row from stdin, one field per line, in column
    # order: referrer, location, readFAQ, pagesVisited, serviceChosen
    # (for example: slashdot / US / no / 19 / None).
    referrer = input()
    location = input()
    readFAQ = input()
    # pagesVisited is an int column in trainingData; input() returns a str,
    # and a str in this column would break the numeric (>=) splits.
    pagesVisited = int(input())
    serviceChosen = input()

    testCase = [referrer, location, readFAQ, pagesVisited, serviceChosen]
    trainingData.append(testCase)

    # Grow the tree on the extended data set and show it.
    t = buildtree(trainingData)
    printtree(t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement