Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- from data import Data
DATA_DIR = 'data/'
# Load the raw training CSV as strings and wrap it in the project's Data helper.
# NOTE(review): Data's interface (get_column / get_row_subset / attributes) is
# defined in data.py, not visible here.
data = np.loadtxt(DATA_DIR + 'train.csv', delimiter=',', dtype=str)
data_obj = Data(data=data)
# Sanity check: print the class distribution of the 'label' column.
print(np.unique(data_obj.get_column('label'), return_counts=True))
def entropy(column):
    """Return the Shannon entropy (base 2) of the values in ``column``.

    :param column: 1-D sequence/array of categorical values
    :return: entropy in bits; 0.0 for an empty or single-valued column
    """
    _, counts = np.unique(column, return_counts=True)
    if counts.size == 0:
        # Empty input: no values, no uncertainty.
        return 0.0
    # Vectorized form: compute all probabilities in one pass instead of
    # re-summing ``counts`` for every term as the original loop did.
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))
def infogain(data, attribute, target_name='label'):
    """Return the information gain of splitting the data on ``attribute``.

    :param data: the dataset for which the information gain is calculated.
        NOTE(review): currently unused -- the body reads the module-level
        ``data_obj`` instead, so every call measures gain on the full
        dataset; confirm intent before using this in recursive ID3 calls.
    :param attribute: name of the feature to split on
    :param target_name: name of the target column (default ``'label'``)
    :return: total entropy of the target minus the weighted entropy of
        the split (in bits)
    """
    # Entropy of the target column before the split.
    total_entropy = entropy(data_obj.get_column(target_name))

    def _weighted_entropy(attr):
        # How many rows fall into each observed value of the attribute.
        vals, counts = np.unique(data_obj.get_column(attr), return_counts=True)
        # Partition the data ONCE. The original rebuilt the entire list of
        # partitions for every term of the sum, which was O(k^2) in the
        # number of attribute values.
        partitions = [
            data_obj.get_row_subset(attr, v).raw_data
            for v in data_obj.attributes[attr].possible_vals
        ]
        total = np.sum(counts)
        # NOTE(review): entropy is taken over column 0 of each partition --
        # presumably the label column of raw_data; verify the column layout.
        return np.sum([
            (counts[i] / total) * entropy(partitions[i][:, 0])
            for i in range(len(vals))
        ])

    # BUG FIX: the original computed ``total_entropy - weighted_entropy``,
    # subtracting the *function object* instead of its value, which raises
    # TypeError at runtime. The function must be called with the attribute.
    return total_entropy - _weighted_entropy(attribute)
- def id3(data, originaldata=data_obj, features, target_attribute_name="label", parent_node_class=None):
- """
- :param data: the data for which the ID3 algorithm should be run (in the first run, the whole dataset)
- :param originaldata: the original dataset (for finding the most frequent label of the original dataset)
- :param features: the feature space of the dataset (needed for the recursive call)
- :param target_attribute_name: the name of the target attribute
- :param parent_node_class: most frequently appearing label for the direct parent node
- :return: a classification decision tree
- """
- # define the stopping criteria; if satisfied, return leaf node
- # if all target_values have the same value, return this value
- if len(np.unique(data_obj.get_column(target_attribute_name))) <= 1:
- return np.unique(data_obj.get_column(target_attribute_name))[0]
- # if the dataset is empty, return the most frequently appearing label of original dataset
- elif len(data) == 0:
- return np.unique(originaldata.get_column('label'))[np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
- # if the feature space is empty, return the most frequently occurring label of the direct parent node
- elif len(features) == 0:
- return parent_node_class
- # If none of the above holds true, grow the tree!
- else:
- # set the default value for this node (the most commonly occurring label of the current node !!!!!problem line below
- parent_node_class = np.unique(data_obj.get_column(target_attribute_name))[np.argmax(np.unique(data_obj.get_column(target_attribute_name)],return_counts=True)[1])]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement