Advertisement
nanorocks

exam_python_SNZ_1_decision_trees

Nov 5th, 2017
919
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.98 KB | None | 0 0
  1. """
  2. Дрва за одлучување (80 поени) Problem 1 (0 / 0)
  3. Да се промени функцијата за предвидување, така што при изминувањето ќе печати информации за:
  4.  
  5. со која колона и вредност се споредува
  6. за која е тековната вредност на тест примерокот за бараната колона
  7. која е следната гранка што ќе се изминува низ дрвото (True branch или False branch)
  8. преостанатиот дел од дрвото што треба да се измине
  9. празна линија
  10. Потоа да се испечати истренираното дрво, да се вчита непознат тренинг примерок од стандардниот влез и
  11. истиот да се класифицира со новата функција за предвидување.
  12. # trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]
  13.  
  14.  
  15. """
  16.  
  17.  
  18. trainingData=[['slashdot','USA','yes',18,'None'],
  19.         ['google','France','yes',23,'Premium'],
  20.         ['google','France','yes',23,'Basic'],
  21.         ['google','France','yes',23,'Basic'],
  22.         ['digg','USA','yes',24,'Basic'],
  23.         ['kiwitobes','France','yes',23,'Basic'],
  24.         ['google','UK','no',21,'Premium'],
  25.         ['(direct)','New Zealand','no',12,'None'],
  26.         ['(direct)','UK','no',21,'Basic'],
  27.         ['google','USA','no',24,'Premium'],
  28.         ['slashdot','France','yes',19,'None'],
  29.         ['digg','USA','no',18,'None'],
  30.         ['google','UK','no',18,'None'],
  31.         ['kiwitobes','UK','no',19,'None'],
  32.         ['digg','New Zealand','yes',12,'Basic'],
  33.         ['slashdot','UK','no',21,'None'],
  34.         ['google','UK','yes',18,'Basic'],
  35.         ['kiwitobes','France','yes',19,'Basic']]
  36.  
  37.  
  38. class decisionnode:
  39.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  40.         self.col = col
  41.         self.value = value
  42.         self.results = results
  43.         self.tb = tb
  44.         self.fb = fb
  45.  
  46.  
  47. def sporedi_broj(row, column, value):
  48.     return row[column] >= value
  49.  
  50.  
  51. def sporedi_string(row, column, value):
  52.     return row[column] == value
  53.  
  54.  
  55. # Divides a set on a specific column. Can handle numeric
  56. # or nominal values
  57. def divideset(rows, column, value):
  58.     # Make a function that tells us if a row is in
  59.     # the first group (true) or the second group (false)
  60.     split_function = None
  61.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  62.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  63.         split_function = sporedi_broj
  64.     else:
  65.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  66.         split_function = sporedi_string
  67.  
  68.     # Divide the rows into two sets and return them
  69.     set_false = []
  70.     set_true = []
  71.     for row in rows:
  72.         if split_function(row, column, value):
  73.             set_true.append(row)
  74.         else:
  75.             set_false.append(row)
  76.     set1 = [row for row in rows if
  77.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  78.     set2 = [row for row in rows if
  79.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  80.     # return (set1, set2)
  81.     return (set_true, set_false)
  82.  
  83.  
  84. #st, sf = divideset(my_data, 3, 20)
  85. #print(sf)
  86. #print(st)
  87.  
  88.  
  89. # Create counts of possible results (the last column of
  90. # each row is the result)
  91. def uniquecounts(rows):
  92.     results = {}
  93.     for row in rows:
  94.         # The result is the last column
  95.         r = row[-1]
  96.         results.setdefault(r, 0)
  97.         results[r] += 1
  98.  
  99.     return results
  100.  
  101.  
  102. #print(uniquecounts(my_data))
  103. #print(uniquecounts(st))
  104. #print(uniquecounts(sf))
  105.  
  106.  
# Base-2 logarithm helper used by the entropy measure below.
# (NOTE(review): the original comment here described misclassification
# probability — i.e. Gini impurity — a function not present in this file.)
  109.  
  110. def log2(x):
  111.     from math import log
  112.     l2 = log(x) / log(2)
  113.     return l2
  114.  
  115.  
  116. # Entropy is the sum of p(x)log(p(x)) across all
  117. # the different possible results
  118. def entropy(rows):
  119.     results = uniquecounts(rows)
  120.     # Now calculate the entropy
  121.     # print results.items()
  122.  
  123.     ent = 0.0
  124.     for r in results.keys():
  125.         p = float(results[r]) / len(rows)
  126.         ent = ent - p * log2(p)
  127.     return ent
  128.  
  129.  
  130. #print(entropy(my_data), entropy(st), entropy(sf))
  131.  
  132.  
  133.  
  134. def buildtree(rows, scoref=entropy):
  135.     if len(rows) == 0: return decisionnode()
  136.     current_score = scoref(rows)
  137.  
  138.     # Set up some variables to track the best criteria
  139.     best_gain = 0.0
  140.     #best_column = -1
  141.     #best_value = None
  142.     #best_subsetf = None
  143.     #best_subsett = None
  144.     best_criteria = None
  145.     best_sets = None
  146.     column_count = len(rows[0]) - 1
  147.     for col in range(column_count):
  148.         # Generate the list of different values in
  149.         # this column
  150.         column_values = {}
  151.         for row in rows:
  152.             column_values[row[col]] = 1
  153.         # Now try dividing the rows up for each value
  154.         # in this column
  155.         for value in column_values:
  156.             (set1, set2) = divideset(rows, col, value)
  157.  
  158.             # Information gain
  159.             p = float(len(set1)) / len(rows)
  160.             gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  161.  
  162.  
  163.             if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  164.                 best_gain = gain
  165.                 #best_column = col
  166.                 #best_value = value
  167.                 #best_subsett = set1
  168.                 #best_subsetf = set2
  169.                 best_criteria = (col, value)
  170.                 best_sets = (set1, set2)
  171.  
  172.     # Create the subbranches
  173.     if best_gain > 0:
  174.         trueBranch = buildtree(best_sets[0], scoref)
  175.  
  176.         falseBranch = buildtree(best_sets[1], scoref)
  177.         #print trueBranch
  178.         return decisionnode(col=best_criteria[0], value=best_criteria[1],
  179.                             tb=trueBranch, fb=falseBranch)
  180.     else:
  181.         return decisionnode(results=uniquecounts(rows))
  182.  
  183. def takeSecond(elem):
  184.     return elem[1]
  185.  
  186. def printtree(tree, indent=''):
  187.     # Is this a leaf node?
  188.     if tree.results != None:
  189.         print(str(tree.results))
  190.     else:
  191.         # Print the criteria
  192.         print(str(tree.col) + ':' + str(tree.value) + '? ')
  193.         # Print the branches
  194.         print(indent + 'T->'),
  195.         printtree(tree.tb, indent + '  ')
  196.         print(indent + 'F->'),
  197.         printtree(tree.fb, indent + '  ')
  198.  
  199.  
  200.  
  201. def classify(observation, tree):
  202.     if tree.results != None:
  203.         print  tree.results
  204.         return tree.results
  205.     else:
  206.         vrednost = observation[tree.col]
  207.         branch = None
  208.         print "Sporeduvam kolona i vrednost:",(tree.col,vrednost)
  209.         print "Tekovna vrednost:",tree.value
  210.  
  211.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  212.             if vrednost >= tree.value:
  213.                 branch = tree.tb
  214.                 print "Sledna granka za izminuvanje: True branch"
  215.             else:
  216.                 branch = tree.fb
  217.                 print "Sledna granka: False branch"
  218.         else:
  219.             if vrednost == tree.value:
  220.                 branch = tree.tb
  221.                 print "Sledna granka: True branch"
  222.             else:
  223.                 branch = tree.fb
  224.                 print "Sledna granka: False branch"
  225.  
  226.         print "Preostanata granka za izminuvanje:"
  227.         printtree(branch)
  228.         return classify(observation, branch)
  229.  
  230.  
  231. if __name__ == "__main__":
  232.     #referrer = input()
  233.     #location = input()
  234.     #readFAQ = input()
  235.     #pagesVisited = input()
  236.     serviceChosen = 'Unknown'
  237.  
  238.     referrer = 'google'
  239.     location = 'UK'
  240.     readFAQ = 'no'
  241.     pagesVisited = 22
  242.  
  243.     testCase = [referrer, location, readFAQ, pagesVisited, serviceChosen]
  244.  
  245.     t = buildtree(trainingData)
  246.     printtree(t)
  247.     classify(testCase, t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement