# decision_tree_exam_2018

# Training set. Each row has five columns; judging by the variable names used
# in the __main__ block they are, in order:
#   referrer, location, readFAQ, pagesVisited, serviceChosen
# The last column is the class label that the tree learns to predict.
trainingData=[['twitter','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','no',26,'Basic'],
        ['google','Macedonia','yes',13,'None'],
        ['pinterest','USA','yes',24,'Basic'],
        ['bing','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['facebook','New Zealand','no',12,'None'],
        ['facebook','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['twitter','France','yes',19,'None'],
        ['pinterest','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['bing','UK','yes',19,'Premium'],
        ['bing','Macedonia','no',10,'None'],
        ['facebook','Macedonia','no',16,'Basic'],
        ['bing','UK','no',19,'Basic'],
        ['pinterest','Germany','no',2,'None'],
        ['pinterest','USA','yes',12,'Basic'],
        ['twitter','UK','no',21,'None'],
        ['twitter','UK','yes',26,'Premium'],
        ['google','UK','yes',18,'Basic'],
        ['bing','France','yes',19,'Basic']]

# Sample observations in the same five-column layout, with 'Unknown' in the
# label position.
# NOTE(review): nothing in the visible part of the file reads test_cases;
# presumably kept for manual experiments - confirm before removing.
test_cases=[['google','MK','no',24,'Unknown'],
            ['google','MK','no',15,'Unknown'],
            ['pinterest','UK','yes',21,'Unknown'],
            ['pinterest','UK','no',25,'Unknown']]

# trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]

class decisionnode:
    """A single node of the decision tree.

    An internal node stores the split criterion (col, value) and its two
    children; a leaf stores the class counts in results instead.

    col     -- index of the column tested at this node (-1 by default)
    value   -- value the column is compared against
    results -- dict mapping class label to count for a leaf, None otherwise
    tb      -- child followed when the test is true
    fb      -- child followed when the test is false
    level   -- depth value supplied by the code that builds the tree
    """

    def __init__(self,col=-1,value=None,results=None,tb=None,fb=None,level=0):
        self.col=col
        self.value=value
        self.results=results
        self.tb=tb
        self.fb=fb
        # Stored once; the original assigned this attribute twice.
        self.level=level

def sporedi_broj(row,column,value):
    """Numeric test: True when the cell row[column] is >= value."""
    cell=row[column]
    return cell>=value

def sporedi_string(row,column,value):
    """Nominal test: True when the cell row[column] equals value."""
    cell=row[column]
    return cell==value

# Splits the rows in two on a given column and value. Works for numeric
# as well as nominal values.
def divideset(rows,column,value):
    """Partition rows by testing one column against value.

    When value is an int or a float the test is sporedi_broj
    (row[column] >= value); for any other type it is sporedi_string
    (row[column] == value).

    Returns a tuple (set1, set2): the rows that pass the test and the rows
    that fail it, each in their original order.
    """
    # The type of the value we compare against selects the test.
    belongs=sporedi_broj if isinstance(value,(int,float)) else sporedi_string

    # Rows for which the test is true ...
    passed=[record for record in rows if belongs(record,column,value)]
    # ... and rows for which it is false.
    failed=[record for record in rows if not belongs(record,column,value)]
    return (passed,failed)

# Splits the rows in two on a given column and value. Works for numeric
# as well as nominal values. Loop-based variant of divideset.
def divideset2(rows,column,value):
    """Partition rows by testing one column against value.

    When value is an int or a float the test is sporedi_broj
    (row[column] >= value); for any other type it is sporedi_string
    (row[column] == value).

    Returns a tuple (set1, set2): the rows that pass the test and the rows
    that fail it, each in their original order.
    """
    # The type of the value we compare against selects the test.
    numeric=isinstance(value,(int,float))
    belongs=sporedi_broj if numeric else sporedi_string

    accepted,rejected=[],[]
    for record in rows:
        # Each row is tested once and lands in exactly one of the two lists.
        bucket=accepted if belongs(record,column,value) else rejected
        bucket.append(record)
    return (accepted,rejected)


# Counts how often each result occurs (the result is the last column
# of every row).
def uniquecounts(rows):
    """Return a dict mapping each class label to its number of rows."""
    tally={}
    for record in rows:
        # The label sits in the final position of the row.
        label=record[-1]
        tally[label]=tally.get(label,0)+1
    return tally


# Entropy is minus the sum of p(x)*log2(p(x)) over all the different
# results that occur in the rows.
def entropy(rows):
    """Return the entropy (base 2) of the class labels in rows.

    Labels are counted with uniquecounts; an empty rows list yields 0.0
    because there is nothing to sum.
    """
    from math import log
    counts=uniquecounts(rows)
    score=0.0
    for count in counts.values():
        # Share of the rows that carry this label.
        share=float(count)/len(rows)
        # log(x)/log(2) is the base-2 logarithm.
        score-=share*(log(share)/log(2))
    return score

def buildtree(rows,scoref=entropy,level=0):
    """Recursively build a decision tree from rows.

    rows   -- list of rows; the last column of each row is the class label
    scoref -- function that scores the impurity of a list of rows
    level  -- depth recorded on the node created by this call

    Every (column, value) pair that occurs in rows is tried as a split with
    divideset, and the split with the largest positive information gain is
    kept. If no split has positive gain the call returns a leaf whose
    results are the class counts of rows.

    For an empty rows list a bare decisionnode is returned: it has neither
    results nor branches.
    """
    if len(rows)==0: return decisionnode(level=level)
    current_score=scoref(rows)

    # Best split found so far.
    best_gain=0.0
    best_criteria=None
    best_sets=None

    # The last column is the label, so it is never used as a split column.
    column_count=len(rows[0])-1
    for col in range(0,column_count):
        # Collect the distinct values of this column. A dict is used (rather
        # than a set) so the candidates are visited in the same order as
        # before; the first best split wins ties.
        column_values={}
        for row in rows:
            column_values[row[col]]=1

        # Try splitting the rows on every value of this column.
        for value in column_values.keys():
            (set1,set2)=divideset(rows,col,value)

            # A split that leaves one side empty separates nothing. Skip it
            # before scoring so scoref is never called on an empty set.
            if len(set1)==0 or len(set2)==0:
                continue

            # Information gain
            p=float(len(set1))/len(rows)
            gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
            if gain>best_gain:
                best_gain=gain
                best_criteria=(col,value)
                best_sets=(set1,set2)

    # Create the subbranches
    if best_gain>0:
        # scoref is passed down so the whole tree is grown with the same
        # scoring function; previously the subtrees fell back to the default.
        trueBranch=buildtree(best_sets[0],scoref=scoref,level=level+1)
        falseBranch=buildtree(best_sets[1],scoref=scoref,level=level+1)
        return decisionnode(col=best_criteria[0],value=best_criteria[1],
                            tb=trueBranch,fb=falseBranch,level=level)

    # No useful split: this is a leaf. Record its depth as well.
    return decisionnode(results=uniquecounts(rows),level=level)

def printtree(tree,indent=''):
      # Is this a leaf node?
      if tree.results!=None:
            print str(tree.results)
      else:
            # Print the criteria
            print str(tree.col)+':'+str(tree.value)+'?' + ' Level='+str(tree.level)
            # Print the branches
            print indent+'T->',
            printtree(tree.tb,indent+'  ')
            print indent+'F->',
            printtree(tree.fb,indent+'  ')

def classify(observation,tree):
    """Return the class label the tree predicts for observation.

    observation -- a row laid out like the training rows
    tree        -- root node of a (sub)tree

    At an internal node the observation's value in column tree.col decides
    the branch: an int or float value follows the true branch when it is
    >= tree.value, any other value follows it when it equals tree.value.

    At a leaf the label with the highest count wins; when several labels
    share the highest count the one that sorts first is returned, so the
    answer is deterministic. (The previous code sorted the counts ascending
    and took the first entry, i.e. it returned the least frequent label.)

    Returns None when there is nothing to vote on: a leaf with an empty
    results dict, or a node with neither results nor branches, which is
    what buildtree returns for an empty training set.
    """
    if tree.results is not None:
        if not tree.results:
            return None
        # Highest count first, then smallest label.
        return min(tree.results.items(),key=lambda item:(-item[1],item[0]))[0]

    if tree.tb is None and tree.fb is None:
        # Childless non-leaf node: no branch to follow.
        return None

    vrednost=observation[tree.col]
    if isinstance(vrednost,(int,float)):
        goes_true=vrednost>=tree.value
    else:
        goes_true=vrednost==tree.value

    branch=tree.tb if goes_true else tree.fb
    return classify(observation,branch)


def classify2(observation,tree):
    """Return the class label the tree predicts for observation.

    observation -- a row laid out like the training rows
    tree        -- root node of a (sub)tree

    At an internal node the observation's value in column tree.col decides
    the branch: an int or float value follows the true branch when it is
    >= tree.value, any other value follows it when it equals tree.value.

    At a leaf the label with the highest count wins; when several labels
    share the highest count the one that sorts first is returned, so the
    answer is deterministic. (The previous code sorted the counts ascending
    and took the first entry, i.e. it returned the least frequent label.)

    Returns None when there is nothing to vote on: a leaf with an empty
    results dict, or a node with neither results nor branches, which is
    what buildtree returns for an empty training set.
    """
    if tree.results is not None:
        if not tree.results:
            return None
        # Highest count first, then smallest label.
        return min(tree.results.items(),key=lambda item:(-item[1],item[0]))[0]

    if tree.tb is None and tree.fb is None:
        # Childless non-leaf node: no branch to follow.
        return None

    vrednost=observation[tree.col]
    if isinstance(vrednost,(int,float)):
        goes_true=vrednost>=tree.value
    else:
        goes_true=vrednost==tree.value

    branch=tree.tb if goes_true else tree.fb
    return classify2(observation,branch)

def classify3(observation,tree):
    if tree.results!=None:
        results=[(value,key) for key,value in tree.results.items()]
        results.sort()
        return results[0][1]
    else:
        vrednost=observation[tree.col]
        branch=None
        granka='True branch'
        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value:
                branch=tree.tb
            else:
                branch=tree.fb
                granka='False branch'
        else:
           if vrednost==tree.value:
               branch=tree.tb
           else:
               branch=tree.fb
               granka='False branch'
        print 'Sporeduvam kolona i vrednost', (tree.col, tree.value)
        print 'Tekovna vrednost:', vrednost
        print 'Sledna granka:',granka
        print 'Preostanata granka za izminuvanje:'
        printtree(branch)
        print
        return classify3(observation,branch)

if __name__ == "__main__":
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen='Unknown'


    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]

    t=buildtree(trainingData)
    printtree(t)
    print classify3(testCase,t)