Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Build a frequency table (value, count, percent) for the 'Age ' column of data.csv.
import pandas as pd

df = pd.read_csv("data.csv")

# NOTE(review): the column name 'Age ' carries a trailing space — presumably it
# matches the CSV header exactly; confirm against data.csv.
# value_counts() gives the number of times each value appears in the column.
table = df['Age '].value_counts()
print(table)

# Distinct ages and their counts. Series.keys() is just an alias for .index,
# so use .index directly — the idiomatic spelling.
my_keys = table.index
my_value = table.values

# A DataFrame is a 2-dimensional table of rows and columns, much like a
# spreadsheet: here, one row per distinct age with its frequency.
# (Bare-expression "cells" from the original notebook, e.g. `df.head()` and
# `freq_table` on their own lines, were no-ops in a script and are dropped.)
freq_table = pd.DataFrame({'Age': my_keys, 'Frequency': my_value})

# Reorder the rows by age and renumber the index from 0.
freq_table = freq_table.sort_values(by=['Age']).reset_index(drop=True)

# Relative frequency of each age as a percentage of all rows.
freq_table['Percent'] = freq_table['Frequency'] / freq_table['Frequency'].sum() * 100
def calc_information_gain(data, split_name, target_name):
    """
    Calculate the information gain of splitting ``data`` on ``split_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to split.
    split_name : str
        Column to split on. Every unique value in this column gets its own
        subset, so non-binary columns are handled correctly.
    target_name : str
        Column whose entropy the split is meant to reduce.

    Returns
    -------
    float
        Entropy of ``data[target_name]`` minus the weighted entropy of the
        subsets induced by ``split_name``.
    """
    # Entropy of the target before splitting.
    original_entropy = calc_entropy(data[target_name])

    total_rows = data.shape[0]

    # Weighted sum of subset entropies. The original implementation used only
    # values[0] and values[1], which silently ignored any additional
    # categories and raised IndexError when the column had a single unique
    # value; iterating over every unique value fixes both while giving the
    # identical result for binary columns.
    to_subtract = 0.0
    for value in data[split_name].unique():
        subset = data[data[split_name] == value]
        prob = subset.shape[0] / total_rows
        to_subtract += prob * calc_entropy(subset[target_name])

    return original_entropy - to_subtract
# Information gain of splitting the frequency table on 'Age' with respect to
# the 'Frequency' column. The original call passed the `table` Series as the
# third argument, but calc_information_gain indexes data[target_name], so it
# requires a *column name* string; passing a Series failed at runtime.
# NOTE(review): 'Frequency' is the presumed intended target — confirm.
calc_information_gain(freq_table, 'Age', 'Frequency')
table
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement