# snz_lab_02_zad_02

# Training rows: [referrer, location, read FAQ, pages visited, service chosen].
# The last column is the class label that the decision tree learns to predict.
trainingData = [
    ['slashdot', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['google', 'France', 'yes', 23, 'Basic'],
    ['google', 'France', 'yes', 23, 'Basic'],
    ['digg', 'USA', 'yes', 24, 'Basic'],
    ['kiwitobes', 'France', 'yes', 23, 'Basic'],
    ['google', 'UK', 'no', 21, 'Premium'],
    ['(direct)', 'New Zealand', 'no', 12, 'None'],
    ['(direct)', 'UK', 'no', 21, 'Basic'],
    ['google', 'USA', 'no', 24, 'Premium'],
    ['slashdot', 'France', 'yes', 19, 'None'],
    ['digg', 'USA', 'no', 18, 'None'],
    ['google', 'UK', 'no', 18, 'None'],
    ['kiwitobes', 'UK', 'no', 19, 'None'],
    ['digg', 'New Zealand', 'yes', 12, 'Basic'],
    ['slashdot', 'UK', 'no', 21, 'None'],
    ['google', 'UK', 'yes', 18, 'Basic'],
    ['kiwitobes', 'France', 'yes', 19, 'Basic'],
]


class decision_node:
    """Node of a binary decision tree.

    An internal node carries the split test (``col`` / ``value``) and the
    two subtrees ``tb`` / ``fb``; a leaf carries only ``results``.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split test: index of the examined column and the value compared to.
        self.col, self.value = col, value
        # Class -> count mapping of a leaf; stays None on internal nodes.
        self.results = results
        # Subtrees followed when the split test is true / false.
        self.tb, self.fb = tb, fb


def compare_number(row, column, value):
    """Return True when the entry of ``row`` at index ``column`` is >= ``value``."""
    cell = row[column]
    return cell >= value


def compare_string(row, column, value):
    """Return True when the entry of ``row`` at index ``column`` equals ``value``."""
    cell = row[column]
    return cell == value


def divide_set(rows, column, value):
    """Partition ``rows`` by testing one column against ``value``.

    When ``value`` is an int or a float a row passes if its entry in
    ``column`` is >= ``value``; for any other type it passes if the entry
    equals ``value``.

    Returns:
        A tuple ``(set_true, set_false)`` holding the rows that passed and
        the rows that failed the test, each in their original order.
    """
    # Pick the row test from the type of the value we compare against.
    if isinstance(value, (int, float)):
        passes = compare_number
    else:
        passes = compare_string

    set_true, set_false = [], []
    for row in rows:
        bucket = set_true if passes(row, column, value) else set_false
        bucket.append(row)

    return (set_true, set_false)


def unique_counts(rows):
    """Count how many rows carry each class label.

    The class label is the last element of every row (in the training data
    of this file: 'Basic', 'None' or 'Premium').

    Returns:
        A dict mapping each label to its number of occurrences.
    """
    tally = {}

    for row in rows:
        label = row[-1]
        # First sighting starts at 0, so the stored count becomes 1.
        tally[label] = tally.get(label, 0) + 1

    return tally


def entrophy(rows):
    """Return the entropy, in bits, of the class labels found in ``rows``.

    Computed as the sum over all labels of ``-p * log2(p)``, where ``p`` is
    the fraction of rows carrying that label. Returns 0.0 when ``rows``
    contains no labels.
    """
    from math import log

    counts = unique_counts(rows)    # dict: label -> occurrences
    total = len(rows)

    ent = 0.0
    for occurrences in counts.values():
        pi = float(occurrences) / total
        # log(pi) / log(2) is the base-2 logarithm of pi
        ent -= pi * (log(pi) / log(2))

    return ent


def buildtree(rows, score_function=entrophy):
    """Recursively build a decision tree from ``rows``.

    Every row is a sequence of attribute values whose last element is the
    class label. At each level every distinct value of every attribute
    column is tried as a split; the split with the largest information gain
    is kept and both halves are built recursively.

    Args:
        rows: Training rows to build the (sub)tree from.
        score_function: Callable taking a list of rows and returning their
            impurity score. Defaults to ``entrophy``.

    Returns:
        A ``decision_node``. It is an internal node (``col``, ``value``,
        ``tb``, ``fb`` set) when some split has positive gain, otherwise a
        leaf whose ``results`` holds the class counts of ``rows``. For empty
        ``rows`` a default-constructed node is returned.
    """
    if len(rows) == 0:
        return decision_node()

    current_score = score_function(rows)

    # Track the best split criterion seen so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # The last column is the class label, so it is never a split candidate.
    column_count = len(rows[0]) - 1

    for col in range(column_count):
        # Collect the distinct values of this column. A dict is kept (rather
        # than a set) so candidates are visited in the same order as before;
        # with the strict ">" below, the first best split wins ties.
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1

        for value in column_values:
            set_true, set_false = divide_set(rows, col, value)

            # A split that leaves one side empty separates nothing, so do
            # not score it at all.
            if not set_true or not set_false:
                continue

            # Information gain: parent score minus the size-weighted scores
            # of the two halves.
            p = float(len(set_true)) / len(rows)
            gain = (current_score
                    - p * score_function(set_true)
                    - (1 - p) * score_function(set_false))

            if gain > best_gain:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set_true, set_false)

    if best_gain > 0:
        # Forward score_function so the subtrees are scored with the same
        # measure as this level instead of falling back to the default.
        true_branch = buildtree(best_sets[0], score_function)
        false_branch = buildtree(best_sets[1], score_function)
        return decision_node(col=best_criteria[0], value=best_criteria[1],
                             tb=true_branch, fb=false_branch)

    # No split improves the score: make a leaf whose results map each class
    # found in rows to the number of times it occurs.
    return decision_node(results=unique_counts(rows))


def print_tree(tree, indent=' '):
    """Print a text rendering of ``tree`` to standard output.

    A leaf is printed as the ``str()`` of its ``results`` dict. An internal
    node is printed as ``<col>:<value>? `` followed by its true branch
    (prefixed ``T-> ``) and its false branch (prefixed ``F-> ``).

    Args:
        tree: Root node of the (sub)tree to print.
        indent: Prefix written before the ``T->`` / ``F->`` markers of this
            level; it grows by three spaces per level of recursion.
    """
    import sys

    # Leaf: just show the class counts.
    if tree.results is not None:
        print(str(tree.results))
        return

    # Split condition (column index and value).
    print(str(tree.col) + ":" + str(tree.value) + "? ")

    # The branch marker and the subtree's first line share one output line.
    # sys.stdout.write emits the marker plus the separating space without a
    # newline, and is valid syntax on both Python 2 and Python 3.
    sys.stdout.write(indent + "T-> ")
    print_tree(tree.tb, indent + "   ")
    sys.stdout.write(indent + "F-> ")
    print_tree(tree.fb, indent + "   ")


def classify(observation, tree):
    """Predict the class label of ``observation`` using ``tree``.

    Walks from ``tree`` down to a leaf. At each internal node the entry of
    ``observation`` at ``node.col`` is tested against ``node.value``: with
    ``>=`` when the observed entry is an int or a float, with ``==``
    otherwise. The true branch is followed when the test holds.

    Args:
        observation: Sequence of attribute values, indexed like the
            training rows.
        tree: Root ``decision_node`` of the tree to consult.

    Returns:
        The class label with the highest count in the reached leaf. When
        several labels share the highest count, the first of them in the
        leaf dict's iteration order is returned.
    """
    node = tree

    # Descend until a leaf (a node carrying results) is reached.
    while node.results is None:
        observed = observation[node.col]

        if isinstance(observed, (int, float)):
            follow_true = observed >= node.value
        else:
            follow_true = observed == node.value

        node = node.tb if follow_true else node.fb

    # Majority vote. max() yields the first key holding the highest count,
    # so a label is returned even when the first key is already the winner.
    counts = node.results
    return max(counts, key=counts.get)


if __name__ == "__main__":
    # Script entry point: read one observation from standard input, build the
    # tree from trainingData and print the predicted class.
    #
    # Example of the five values, in the order they are read:
    # referrer='slashdot'
    # location='UK'
    # readFAQ='no'
    # pagesVisited=21
    # serviceChosen='Unknown'

    # NOTE(review): this file uses Python 2 print statements, and in Python 2
    # input() evaluates the typed text as a Python expression. Strings must
    # therefore be entered quoted, and numbers arrive as int/float. That is
    # an eval of external input -- confirm it only ever reads trusted data.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()

    # serviceChosen fills the class-label slot of the row; buildtree never
    # splits on that last column, so classify does not look at it.
    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]

    t=buildtree(trainingData)
    print classify(testCase,t)