Guest User

Untitled

a guest
Oct 18th, 2017
238
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.37 KB | None | 0 0
  1. import pandas as pd
  2. data = pd.read_excel("./data/Healthy Minds 2015-2016 Survey Data.xlsx")
  3. race_columns = ["race_white", "race_black", "race_asian", "race_ainaan", "race_mides", "race_pi", "race_haw", "race_other"]
  4. fill_to_zero = dict(zip(race_columns, [0.0]*len(race_columns)))
  5. data.loc[:,race_columns] = data.loc[:,race_columns].fillna(fill_to_zero) # Could just do fillna(0)
  6.  
  7. age_by_race = data.loc[:,race_columns + ["age"]].dropna() ## Drop anyone who didn't report their age
  8. # Tall meaning tall data aka a biracial person appears twice
  9. tall_age_by_race = pd.melt(age_by_race, id_vars="age")
  10. tall_age_by_race = tall_age_by_race[tall_age_by_race["value"] == 1.0]
  11. del tall_age_by_race["value"]
  12. tall_age_by_race.columns = ["age", "race"]
  13.  
  14. # What is the average age of each race?
  15. tall_age_by_race.groupby("race").mean()
  16.  
  17.  
  18. ### Let's make a biracial column
  19. reported_num_of_races = data.loc[:,race_columns].sum(axis=1) # Sum across rows
  20. # 1181 people did not report their race at all
  21. reported_num_of_races.value_counts()
  22.  
  23. race = pd.Series(index=data.index)
  24. race[reported_num_of_races == 0.0] = "Unknown"
  25. race[reported_num_of_races > 1.0] = "Biracial"
  26. single_race = reported_num_of_races == 1.0
  27. x = data.loc[single_race, race_columns]
  28. def getColumnIndex(row):
  29. return list(filter(lambda tup: tup[1] == 1.0, enumerate(row)))[0][0]
  30. race[single_race] = x.columns[x.apply(getColumnIndex, axis=1)].tolist()
Add Comment
Please, Sign In to add comment