# printGrank

"""
Modify the prediction function so that, while traversing the tree, it prints:
- which column and value the sample is being compared against
- the current value of the test sample in that column
- which branch of the tree will be followed next (True branch or False branch)
- the remaining part of the tree that still has to be traversed
- an empty line
Then print the trained tree, read an unknown sample from standard input and
classify it with the new prediction function.
"""

# Labelled samples used to train the tree.  Column layout (same order as the
# sample assembled in the script section at the bottom of the file):
#   [referrer, location, read FAQ, pages visited, service chosen]
# The last column is the class label; note that 'None' is a string label.
trainingData = [
    ['twitter', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['google', 'France', 'no', 26, 'Basic'],
    ['google', 'Macedonia', 'yes', 13, 'None'],
    ['pinterest', 'USA', 'yes', 24, 'Basic'],
    ['bing', 'France', 'yes', 23, 'Basic'],
    ['google', 'UK', 'no', 21, 'Premium'],
    ['facebook', 'New Zealand', 'no', 12, 'None'],
    ['facebook', 'UK', 'no', 21, 'Basic'],
    ['google', 'USA', 'no', 24, 'Premium'],
    ['twitter', 'France', 'yes', 19, 'None'],
    ['pinterest', 'USA', 'no', 18, 'None'],
    ['google', 'UK', 'no', 18, 'None'],
    ['bing', 'UK', 'yes', 19, 'Premium'],
    ['bing', 'Macedonia', 'no', 10, 'None'],
    ['facebook', 'Macedonia', 'no', 16, 'Basic'],
    ['bing', 'UK', 'no', 19, 'Basic'],
    ['pinterest', 'Germany', 'no', 2, 'None'],
    ['pinterest', 'USA', 'yes', 12, 'Basic'],
    ['twitter', 'UK', 'no', 21, 'None'],
    ['twitter', 'UK', 'yes', 26, 'Premium'],
    ['google', 'UK', 'yes', 18, 'Basic'],
    ['bing', 'France', 'yes', 19, 'Basic'],
]

# Samples with the same column layout whose label slot holds the
# placeholder 'Unknown'.
test_cases = [
    ['google', 'MK', 'no', 24, 'Unknown'],
    ['google', 'MK', 'no', 15, 'Unknown'],
    ['pinterest', 'UK', 'yes', 21, 'Unknown'],
    ['pinterest', 'UK', 'no', 25, 'Unknown'],
]

# trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]


class decisionnode:
    """One node of a decision tree.

    Attributes:
        col: index of the column this node tests (default -1).
        value: value the tested column is compared against.
        results: for a leaf, the class-count dict; ``None`` otherwise.
        tb: subtree followed when the test is true.
        fb: subtree followed when the test is false.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split criterion (which column, compared with what).
        self.col, self.value = col, value
        # Leaf payload; stays None on internal nodes.
        self.results = results
        # Children for the true / false outcome of the test.
        self.tb, self.fb = tb, fb


def sporedi_broj(row, column, value):
    """Numeric split test: is the row's entry in ``column`` at least ``value``?"""
    cell = row[column]
    return cell >= value


def sporedi_string(row, column, value):
    """Nominal split test: does the row's entry in ``column`` equal ``value``?"""
    cell = row[column]
    return cell == value


def divideset(rows, column, value):
    """Partition ``rows`` in two by testing one column against ``value``.

    When ``value`` is an ``int`` or ``float`` a row passes the test if
    ``row[column] >= value``; for any other ``value`` it passes if
    ``row[column] == value``.

    Args:
        rows: sequence of indexable rows.
        column: index of the column to test.
        value: the split value.

    Returns:
        A tuple ``(set_true, set_false)`` of lists holding the rows that
        passed and failed the test, each in their original order.
    """
    # The kind of comparison depends only on the split value, so decide once.
    numeric = isinstance(value, (int, float))

    set_true = []
    set_false = []
    # Single pass: every row is tested exactly once.
    for row in rows:
        cell = row[column]
        passes = cell >= value if numeric else cell == value
        if passes:
            set_true.append(row)
        else:
            set_false.append(row)
    return (set_true, set_false)


def uniquecounts(rows):
    """Count how often each result occurs in ``rows``.

    The result of a row is its last column.  Returns a dict mapping each
    distinct result to its number of occurrences.
    """
    counts = {}
    for row in rows:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts


def log2(x):
    """Return the base-2 logarithm of ``x``, computed as ``ln(x) / ln(2)``."""
    import math
    return math.log(x) / math.log(2)


def entropy(rows):
    """Return the base-2 entropy of the results found in ``rows``.

    For every distinct result with relative frequency ``p`` the term
    ``p * log2(p)`` is subtracted from a running total that starts at 0.0,
    so an empty ``rows`` yields 0.0.
    """
    counts = uniquecounts(rows)
    total = len(rows)
    ent = 0.0
    for count in counts.values():
        p = float(count) / total
        ent = ent - p * log2(p)
    return ent


def buildtree(rows, scoref=entropy):
    """Recursively build a decision tree for ``rows``.

    Every (column, value) pair occurring in the data, excluding the last
    column, is tried as a split.  The split with the largest positive
    information gain under ``scoref`` that leaves both sides non-empty is
    kept, and the two sides are built recursively.

    Args:
        rows: training rows; the last column of each row is the result.
        scoref: impurity function applied to a list of rows.

    Returns:
        A ``decisionnode``: an empty node for empty ``rows``, an internal
        node when a useful split exists, otherwise a leaf carrying the
        result counts of ``rows``.
    """
    if len(rows) == 0:
        return decisionnode()

    base_score = scoref(rows)
    total = len(rows)

    # Best split seen so far; only replaced by a strictly larger gain.
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_true_rows = None
    best_false_rows = None

    # The last column is the result, so it is never used as a split column.
    feature_count = len(rows[0]) - 1
    for col in range(feature_count):
        # Distinct values present in this column.
        candidates = set()
        for row in rows:
            candidates.add(row[col])

        for candidate in candidates:
            true_rows, false_rows = divideset(rows, col, candidate)

            # Information gain of this split.
            p = float(len(true_rows)) / total
            gain = (base_score - p * scoref(true_rows)
                    - (1 - p) * scoref(false_rows))
            if gain > best_gain and len(true_rows) > 0 and len(false_rows) > 0:
                best_gain = gain
                best_column = col
                best_value = candidate
                best_true_rows = true_rows
                best_false_rows = false_rows

    if best_gain > 0:
        # A useful split exists: grow both subtrees.
        true_branch = buildtree(best_true_rows, scoref)
        false_branch = buildtree(best_false_rows, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=true_branch, fb=false_branch)

    # No split improves the score: this is a leaf.
    return decisionnode(results=uniquecounts(rows))


def printtree(tree, indent=''):
    """Print ``tree`` to standard output, one node per line.

    A leaf is printed as the sorted list of its ``(result, count)`` pairs.
    An internal node is printed as ``col:value? `` followed by its true
    branch under ``T->`` and its false branch under ``F->``, each indented
    two more spaces than the node itself.
    """
    if tree.results is not None:
        # Leaf: show the result counts and stop.
        pairs = sorted(tree.results.items())
        print(f"{indent}{pairs!s}")
        return

    # Internal node: the test first, then both subtrees.
    print(f"{indent}{tree.col!s}:{tree.value!s}? ")
    deeper = indent + '  '
    for marker, child in (('T->', tree.tb), ('F->', tree.fb)):
        print(indent + marker)
        printtree(child, deeper)


def classify(observation, tree):
    """Walk ``tree`` for ``observation`` and return the reached leaf's results.

    At each internal node the observation's value in the node's column is
    compared with the node's value: ``>=`` when the observation's value is
    an ``int`` or ``float``, ``==`` otherwise.  A true comparison follows
    the ``tb`` child, a false one the ``fb`` child.

    Returns:
        The ``results`` attribute of the leaf that is reached.
    """
    node = tree
    while node.results is None:
        cell = observation[node.col]
        if isinstance(cell, (int, float)):
            goes_true = cell >= node.value
        else:
            goes_true = cell == node.value
        node = node.tb if goes_true else node.fb
    return node.results

def classify2(observation, tree):
    """Classify ``observation`` while printing a trace of the traversal.

    At every internal node the function prints the column and value being
    compared, the observation's value in that column, which branch is taken
    next ('True branch' or 'False branch'), the subtree that remains to be
    traversed, and an empty line.  The comparison is ``>=`` when the
    observation's value is an ``int`` or ``float`` and ``==`` otherwise.

    Args:
        observation: indexable sample laid out like the training rows.
        tree: root node of the (sub)tree to traverse.

    Returns:
        The most frequent result stored in the reached leaf.  When several
        results share the highest count, the smallest one is returned.

    Raises:
        IndexError: if the reached leaf holds an empty results dict.
    """
    if tree.results is not None:
        # Rank by descending count, then by label, so the first entry is
        # the majority class with a deterministic tie-break.
        ranked = sorted(tree.results.items(),
                        key=lambda item: (-item[1], item[0]))
        return ranked[0][0]

    vrednost = observation[tree.col]
    if isinstance(vrednost, (int, float)):
        goes_true = vrednost >= tree.value
    else:
        goes_true = vrednost == tree.value

    if goes_true:
        branch, granka = tree.tb, 'True branch'
    else:
        branch, granka = tree.fb, 'False branch'

    print('Sporeduvam kolona i vrednost', (tree.col, tree.value))
    print('Tekovna vrednost:', vrednost)
    print('Sledna granka', granka)
    print('Preostanata granka za izminuvanje:')
    printtree(branch)
    print()
    return classify2(observation, branch)


if __name__ == "__main__":
    # Hard-coded sample to classify; the last slot is a placeholder label.
    referrer, location, readFAQ = 'google', 'USA', 'no'
    pagesVisited = 25
    serviceChosen = 'Unknown'
    testCase = [referrer, location, readFAQ, pagesVisited, serviceChosen]

    # Train on the bundled data and show the resulting tree.
    t = buildtree(trainingData)
    printtree(t)

    # Trace the traversal for the sample, then print the predicted class.
    predicted = classify2(testCase, t)
    print(predicted)