Advertisement
ShrekOP

Assg23

Dec 14th, 2022
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.84 KB | None | 0 0
  1. # creating a frequency table
  2. import pandas as pd
  3.  
  4. df = pd.read_csv("data.csv")
  5. df.head()
  6.  
  7. table = df['Age '].value_counts() # will give us the frequency count ie number of times that value appeared in that column
  8. print(table)
  9.  
  10. my_keys = table.keys() # listing all the keys in table
  11. my_keys
  12.  
  13. my_value = table.values # listing all the values corresponding to keys in table
  14. my_value
  15.  
  16. # A DataFrame is a data structure that organizes data into a 2-dimensional table of rows and columns, much like a spreadsheet.
  17. freq_table = pd.DataFrame({'Age': my_keys, 'Frequency':my_value}) # creating a dataframe of keys and values
  18. freq_table
  19.  
  20. # if we want we can reorder our table
  21. freq_table = freq_table.sort_values(by=['Age'])
  22. # sorting or reordering the keys in terms of values
  23. freq_table = freq_table.reset_index(drop=True)
  24. freq_table
  25.  
  26. freq_table['Percent'] = freq_table['Frequency']/freq_table['Frequency'].sum() * 100
  27. freq_table
  28.  
  29. def calc_information_gain(data, split_name, target_name):
  30. """
  31. Calculate information gain given a data set, column to split on, and target
  32. """
  33. # Calculate the original entropy
  34. original_entropy = calc_entropy(data[target_name])
  35.  
  36. #Find the unique values in the column
  37. values = data[split_name].unique()
  38.  
  39.  
  40. # Make two subsets of the data, based on the unique values
  41. left_split = data[data[split_name] == values[0]]
  42. right_split = data[data[split_name] == values[1]]
  43.  
  44. # Loop through the splits and calculate the subset entropies
  45. to_subtract = 0
  46. for subset in [left_split, right_split]:
  47. prob = (subset.shape[0] / data.shape[0])
  48. to_subtract += prob * calc_entropy(subset[target_name])
  49.  
  50. # Return information gain
  51. return original_entropy - to_subtract
  52.  
  53. calc_information_gain(freq_table, 'Age', table)
  54. table
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement