Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- from data import Data
DATA_DIR = 'data/'
# Load the raw training CSV as strings and wrap it in the project's Data helper.
# NOTE(review): Data's interface (get_column / get_row_subset / attributes) is
# defined in data.py, not visible here.
data = np.loadtxt(DATA_DIR + 'train.csv', delimiter=',', dtype=str)
data_obj = Data(data=data)
# Sanity check: print the class distribution of the 'label' column.
print(np.unique(data_obj.get_column('label'), return_counts=True))
def entropy(column):
    """Return the Shannon entropy (base 2) of the values in ``column``.

    :param column: 1-D sequence/array of categorical values
    :return: entropy in bits; 0.0 for an empty or single-valued column
    """
    _, counts = np.unique(column, return_counts=True)
    if counts.size == 0:
        # Empty input: no values, no uncertainty.
        return 0.0
    # Vectorized form: compute all probabilities in one pass instead of
    # re-summing ``counts`` for every term as the original loop did.
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))
def infogain(data, attribute, target_name='label'):
    """Return the information gain of splitting the data on ``attribute``.

    :param data: the dataset for which the information gain is calculated.
        NOTE(review): currently unused -- the body reads the module-level
        ``data_obj`` instead, so every call measures gain on the full
        dataset; confirm intent before using this in recursive ID3 calls.
    :param attribute: name of the feature to split on
    :param target_name: name of the target column (default ``'label'``)
    :return: total entropy of the target minus the weighted entropy of
        the split (in bits)
    """
    # Entropy of the target column before the split.
    total_entropy = entropy(data_obj.get_column(target_name))

    def _weighted_entropy(attr):
        # How many rows fall into each observed value of the attribute.
        vals, counts = np.unique(data_obj.get_column(attr), return_counts=True)
        # Partition the data ONCE. The original rebuilt the entire list of
        # partitions for every term of the sum, which was O(k^2) in the
        # number of attribute values.
        partitions = [
            data_obj.get_row_subset(attr, v).raw_data
            for v in data_obj.attributes[attr].possible_vals
        ]
        total = np.sum(counts)
        # NOTE(review): entropy is taken over column 0 of each partition --
        # presumably the label column of raw_data; verify the column layout.
        return np.sum([
            (counts[i] / total) * entropy(partitions[i][:, 0])
            for i in range(len(vals))
        ])

    # BUG FIX: the original computed ``total_entropy - weighted_entropy``,
    # subtracting the *function object* instead of its value, which raises
    # TypeError at runtime. The function must be called with the attribute.
    return total_entropy - _weighted_entropy(attribute)
- def id3(data, originaldata=data_obj, features, target_attribute_name="label", parent_node_class=None):
- """
- :param data: the data for which the ID3 algorithm should be run (in the first run, the whole dataset)
- :param originaldata: the original dataset (for finding the most frequent label of the original dataset)
- :param features: the feature space of the dataset (needed for the recursive call)
- :param target_attribute_name: the name of the target attribute
- :param parent_node_class: most frequently appearing label for the direct parent node
- :return: a classification decision tree
- """
- # define the stopping criteria; if satisfied, return leaf node
- # if all target_values have the same value, return this value
- if len(np.unique(data_obj.get_column(target_attribute_name))) <= 1:
- return np.unique(data_obj.get_column(target_attribute_name))[0]
- # if the dataset is empty, return the most frequently appearing label of original dataset
- elif len(data) == 0:
- return np.unique(originaldata.get_column('label'))[np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
- # if the feature space is empty, return the most frequently occurring label of the direct parent node
- elif len(features) == 0:
- return parent_node_class
- # If none of the above holds true, grow the tree!
- else:
- # set the default value for this node (the most commonly occurring label of the current node !!!!!problem line below
- parent_node_class = np.unique(data_obj.get_column(target_attribute_name))[np.argmax(np.unique(data_obj.get_column(target_attribute_name)],return_counts=True)[1])]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement