Untitled

In [1]: import pandas as pd

In [2]: df = pd.read_csv("/home/greg/Downloads/slipknot_fans.csv")

In [3]: %edit
IPython will make a temporary file named: /tmp/ipython_edit_jgkmcag0/ipython_edit_d4b4hpdb.py
Editing... done. Executing edited code...
def merge_columns(df, column_prefix, new_aggregate_prefix):
    """Collapse a group of boolean indicator columns into one categorical column.

    Every column of ``df`` whose name contains ``column_prefix`` is treated as
    an indicator. For each row, the label of the first indicator column that
    equals ``True`` becomes that row's category.

    ``df`` is modified in place:

    * ``<new_aggregate_prefix>_category`` is added, holding the winning column
      label as a pandas ``category``.
    * ``<new_aggregate_prefix>_code`` is added, holding the integer category
      code for the same value.
    * the indicator columns themselves are dropped.

    Rows in which no indicator is ``True`` (all ``False`` and/or NaN) get a
    missing category and a code of ``-1``. They are NOT assigned to the first
    indicator column, which is what a bare ``idxmax`` would do.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the indicator columns. Mutated in place.
    column_prefix : str
        Text identifying the indicator columns. Note that this is a substring
        match (``column_prefix in name``), not a strict prefix match.
    new_aggregate_prefix : str
        Prefix for the two new columns (``_category`` and ``_code``).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no column name contains ``column_prefix``.
    """
    indicator_cols = [name for name in df.columns if column_prefix in name]
    if not indicator_cols:
        raise ValueError(
            "no columns containing {!r} found in dataframe".format(column_prefix)
        )

    # NaN is treated as False: only cells that are exactly True count as set.
    is_set = df[indicator_cols].eq(True)

    # idxmax returns the first column label even when a row is all False, so
    # blank out rows that have no True indicator instead of mislabelling them.
    labels = is_set.idxmax(axis=1).where(is_set.any(axis=1))

    # Categories are the distinct labels; missing rows stay NaN (code -1).
    category = labels.astype('category')

    df[new_aggregate_prefix + '_category'] = category
    df[new_aggregate_prefix + '_code'] = category.cat.codes

    # The indicators are now redundant with the merged column.
    df.drop(columns=indicator_cols, inplace=True)

In [4]: merge_columns(df, 'income_9_cat', 'income')

In [5]: df.income_category.cat
Out[5]: <pandas.core.arrays.categorical.CategoricalAccessor object at 0x7fc23b454358>

In [6]: df.income_category.cat.categories
Out[6]:
Index(['income_9_cat_000_015k_ind', 'income_9_cat_016_020k_ind',
       'income_9_cat_021_030k_ind', 'income_9_cat_031_040k_ind',
       'income_9_cat_041_050k_ind', 'income_9_cat_051_075k_ind',
       'income_9_cat_076_100k_ind', 'income_9_cat_101_125k_ind',
       'income_9_cat_126k_plus_ind'],
      dtype='object')

In [7]: df.income_category.cat.categories[0]
Out[7]: 'income_9_cat_000_015k_ind'

In [8]: df.income_category.cat.categories[1]
Out[8]: 'income_9_cat_016_020k_ind'

In [9]: df.income_category.cat.categories[2]
Out[9]: 'income_9_cat_021_030k_ind'

In [10]: df.income_category.cat.categories[3]
Out[10]: 'income_9_cat_031_040k_ind'

In [11]: # or

In [12]: codes_to_cat = {k: df.income_category.cat.categories[k] for k in df.income_category.cat.codes.unique().tolist() if k >= 0}  # one lookup per distinct code; -1 (missing) is skipped

In [13]: codes_to_cat
Out[13]:
{5: 'income_9_cat_051_075k_ind',
 6: 'income_9_cat_076_100k_ind',
 1: 'income_9_cat_016_020k_ind',
 0: 'income_9_cat_000_015k_ind',
 4: 'income_9_cat_041_050k_ind',
 2: 'income_9_cat_021_030k_ind',
 7: 'income_9_cat_101_125k_ind',
 3: 'income_9_cat_031_040k_ind',
 8: 'income_9_cat_126k_plus_ind'}

In [14]: