Advertisement
Guest User

Untitled

a guest
Oct 15th, 2019
139
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.43 KB | None | 0 0
  1. In [1]: import pandas as pd
  2.  
  3. In [2]: df = pd.read_csv("/home/greg/Downloads/slipknot_fans.csv")
  4.  
  5. In [3]: %edit
  6. IPython will make a temporary file named: /tmp/ipython_edit_jgkmcag0/ipython_edit_d4b4hpdb.py
  7. Editing... done. Executing edited code...
  def merge_columns(df, column_prefix, new_aggregate_prefix):
      """Collapse a group of boolean indicator columns into one categorical column.

      Every column of ``df`` whose name contains ``column_prefix`` (a substring
      match, e.g. ``income_9_cat`` matches ``income_9_cat_000_015k_ind``) is
      treated as one indicator of the same categorical variable.  For each row
      the name of the first indicator column that equals ``True`` becomes the
      row's category label.

      ``df`` is modified in place:

      * ``<new_aggregate_prefix>_category`` is added, holding the label as a
        pandas categorical (the categories are the labels that actually occur).
      * ``<new_aggregate_prefix>_code`` is added, holding the integer category
        codes (handy as a machine-learning feature).
      * the original indicator columns are dropped.

      Rows in which no indicator is ``True`` (all False / NaN) get a missing
      category and the code ``-1``; they are NOT assigned to the first
      indicator column.

      Parameters
      ----------
      df : pandas.DataFrame
          Frame containing the indicator columns.  Mutated in place.
      column_prefix : str
          Text identifying the indicator columns to merge.
      new_aggregate_prefix : str
          Prefix used for the two new column names.

      Returns
      -------
      None

      Raises
      ------
      ValueError
          If no column name contains ``column_prefix``.
      """
      # Find all the column names that belong to the category we want to merge,
      # in the motivating case `income_9_cat_xx`.  Non-string labels can never
      # contain the prefix, so skip them instead of failing on the `in` test.
      indicator_cols = [
          col for col in df.columns
          if isinstance(col, str) and column_prefix in col
      ]
      if not indicator_cols:
          raise ValueError(
              "no columns containing {!r} found to merge".format(column_prefix)
          )

      # The indicator columns contain NaN / NA as well as booleans.  We only
      # care about where the value is True, so project onto "is exactly True";
      # NaN and False both become False.
      flags = df[indicator_cols].eq(True)

      # idxmax(axis=1) gives the label of the first True column in each row,
      # but for a row with no True at all it still returns the first column.
      # Mask those rows out so "unknown" is not mistaken for the first category.
      has_flag = flags.any(axis=1)
      labels = flags.idxmax(axis=1).where(has_flag)

      # Turn the labels into categorical data: the category names are the
      # original column names, and `.cat.codes` gives the integer encoding
      # (see `.cat.categories`, or `list(enumerate(x.cat.categories))`, for the
      # code -> label mapping).
      categories = labels.astype('category')

      # Assign the labels back to the original dataframe, plus the codes for ML.
      df[new_aggregate_prefix + '_category'] = categories
      df[new_aggregate_prefix + '_code'] = categories.cat.codes

      # Finally drop the indicator columns now that they have been merged.
      df.drop(columns=indicator_cols, inplace=True)
  9.  
  10. In [4]: merge_columns(df, 'income_9_cat', 'income')
  11.  
  12. In [5]: df.income_category.cat
  13. Out[5]: <pandas.core.arrays.categorical.CategoricalAccessor object at 0x7fc23b454358>
  14.  
  15. In [6]: df.income_category.cat.categories
  16. Out[6]:
  17. Index(['income_9_cat_000_015k_ind', 'income_9_cat_016_020k_ind',
  18. 'income_9_cat_021_030k_ind', 'income_9_cat_031_040k_ind',
  19. 'income_9_cat_041_050k_ind', 'income_9_cat_051_075k_ind',
  20. 'income_9_cat_076_100k_ind', 'income_9_cat_101_125k_ind',
  21. 'income_9_cat_126k_plus_ind'],
  22. dtype='object')
  23.  
  24. In [7]: df.income_category.cat.categories[0]
  25. Out[7]: 'income_9_cat_000_015k_ind'
  26.  
  27. In [8]: df.income_category.cat.categories[1]
  28. Out[8]: 'income_9_cat_016_020k_ind'
  29.  
  30. In [9]: df.income_category.cat.categories[2]
  31. Out[9]: 'income_9_cat_021_030k_ind'
  32.  
  33. In [10]: df.income_category.cat.categories[3]
  34. Out[10]: 'income_9_cat_031_040k_ind'
  35.  
  36. In [11]: # or
  37.  
  38. In [12]: codes_to_cat = {k: df.income_category.cat.categories[k] for k in df.income_category.cat.codes}
  39.  
  40. In [13]: codes_to_cat
  41. Out[13]:
  42. {5: 'income_9_cat_051_075k_ind',
  43. 6: 'income_9_cat_076_100k_ind',
  44. 1: 'income_9_cat_016_020k_ind',
  45. 0: 'income_9_cat_000_015k_ind',
  46. 4: 'income_9_cat_041_050k_ind',
  47. 2: 'income_9_cat_021_030k_ind',
  48. 7: 'income_9_cat_101_125k_ind',
  49. 3: 'income_9_cat_031_040k_ind',
  50. 8: 'income_9_cat_126k_plus_ind'}
  51.  
  52. In [14]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement