Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Build a frequency table (value, count, percent) for the 'Age ' column of data.csv.
import pandas as pd

df = pd.read_csv("data.csv")

# NOTE(review): the column name 'Age ' carries a trailing space — presumably it
# matches the CSV header exactly; confirm against data.csv.
# value_counts() gives the number of times each value appears in the column.
table = df['Age '].value_counts()
print(table)

# Distinct ages and their counts. Series.keys() is just an alias for .index,
# so use .index directly — the idiomatic spelling.
my_keys = table.index
my_value = table.values

# A DataFrame is a 2-dimensional table of rows and columns, much like a
# spreadsheet: here, one row per distinct age with its frequency.
# (Bare-expression "cells" from the original notebook, e.g. `df.head()` and
# `freq_table` on their own lines, were no-ops in a script and are dropped.)
freq_table = pd.DataFrame({'Age': my_keys, 'Frequency': my_value})

# Reorder the rows by age and renumber the index from 0.
freq_table = freq_table.sort_values(by=['Age']).reset_index(drop=True)

# Relative frequency of each age as a percentage of all rows.
freq_table['Percent'] = freq_table['Frequency'] / freq_table['Frequency'].sum() * 100
def calc_information_gain(data, split_name, target_name):
    """
    Calculate the information gain of splitting ``data`` on ``split_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to split.
    split_name : str
        Column to split on. Every unique value in this column gets its own
        subset, so non-binary columns are handled correctly.
    target_name : str
        Column whose entropy the split is meant to reduce.

    Returns
    -------
    float
        Entropy of ``data[target_name]`` minus the weighted entropy of the
        subsets induced by ``split_name``.
    """
    # Entropy of the target before splitting.
    original_entropy = calc_entropy(data[target_name])

    total_rows = data.shape[0]

    # Weighted sum of subset entropies. The original implementation used only
    # values[0] and values[1], which silently ignored any additional
    # categories and raised IndexError when the column had a single unique
    # value; iterating over every unique value fixes both while giving the
    # identical result for binary columns.
    to_subtract = 0.0
    for value in data[split_name].unique():
        subset = data[data[split_name] == value]
        prob = subset.shape[0] / total_rows
        to_subtract += prob * calc_entropy(subset[target_name])

    return original_entropy - to_subtract
# Information gain of splitting the frequency table on 'Age' with respect to
# the 'Frequency' column. The original call passed the `table` Series as the
# third argument, but calc_information_gain indexes data[target_name], so it
# requires a *column name* string; passing a Series failed at runtime.
# NOTE(review): 'Frequency' is the presumed intended target — confirm.
calc_information_gain(freq_table, 'Age', 'Frequency')
table
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement