Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2019
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.50 KB | None | 0 0
  1. def corr_ratio(cont_vec, cat_vec):
  2.  
  3. """
  4. Calculates the correlation ratio between a continuous variable and a categorical variable
  5.  
  6. Parameters
  7. ----------
  8. cont_vec : numpy 1-d ndarray, pandas Series
  9. Vector for continuous data
  10.  
  11. cat_vec : numpy 1-d ndarray, pandas Series
  12. Vector for categorical data
  13.  
  14. Returns
  15. -------
  16. eta : float
  17. Correlation ratio. Eta is the greek letter eta, not an acronym
  18. """
  19. # make sure to drop rows with NaN values in any colum
  20. cat_cont_df = pd.concat([cat_vec, cont_vec], axis=1)
  21. cat_cont_df = cat_cont_df.dropna()
  22.  
  23. # unpack to Series
  24. cat_vec = cat_cont_df.iloc[:,0]
  25. cont_vec = cat_cont_df.iloc[:,1]
  26.  
  27. # for each category store its average and count of the continuous values
  28. cat_count = []
  29. cat_avg = []
  30.  
  31. for cat in cat_vec.unique():
  32. cont_vec_label = cont_vec[cat_vec == cat] # store category values in this variable
  33. cat_count.append(len(cont_vec_label))
  34. cat_avg.append(np.average(cont_vec_label))
  35.  
  36. # cast to numpy array so vectorized operation can be done
  37. cat_count = np.array(cat_count, dtype=np.float)
  38. cat_avg = np.array(cat_avg, dtype=np.float)
  39.  
  40. # calculate eta - the correlation ratio
  41. total_avg = np.sum(cat_count * cat_avg) / np.sum(cat_count) # cont_vec.mean()
  42. numerator = np.sum(cat_count*((cat_avg - total_avg)**2))
  43. denominator = np.sum((cont_vec - total_avg)**2)
  44. eta = np.sqrt(numerator / denominator)
  45.  
  46. return eta
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement