Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def corr_ratio(cont_vec, cat_vec):
    """
    Calculate the correlation ratio between a continuous and a categorical variable.

    The correlation ratio (eta) is the square root of the between-category
    sum of squares divided by the total sum of squares of the continuous
    variable. It lies in [0, 1]: 0 when every category has the same mean,
    1 when the category fully determines the continuous value.

    Parameters
    ----------
    cont_vec : numpy 1-d ndarray, pandas Series
        Vector for continuous data
    cat_vec : numpy 1-d ndarray, pandas Series
        Vector for categorical data

    Returns
    -------
    eta : float
        Correlation ratio. Eta is the greek letter eta, not an acronym.
        ``nan`` is returned when eta is undefined: no complete rows remain
        after dropping missing values, or the continuous variable has zero
        variance.

    Notes
    -----
    Two pandas Series are aligned on their index (``pd.concat`` semantics).
    A non-Series input is paired with the other vector by position.
    """
    # pd.concat only accepts pandas objects, so wrap plain arrays/lists first.
    # A wrapped vector borrows the other vector's index so that rows are
    # paired positionally instead of being misaligned by label.
    if not isinstance(cat_vec, pd.Series):
        cat_index = cont_vec.index if isinstance(cont_vec, pd.Series) else None
        cat_vec = pd.Series(cat_vec, index=cat_index)
    if not isinstance(cont_vec, pd.Series):
        cont_vec = pd.Series(cont_vec, index=cat_vec.index)

    # Drop every row that has a missing value in either column.
    paired = pd.concat([cat_vec, cont_vec], axis=1).dropna()
    cat_vec = paired.iloc[:, 0]
    cont_values = np.asarray(paired.iloc[:, 1], dtype=float)

    if cont_values.size == 0:
        # Nothing left to measure; avoid a 0/0 RuntimeWarning.
        return np.nan

    # Collect the continuous values belonging to each category.
    grouped = [
        cont_values[np.asarray(cat_vec == cat)] for cat in cat_vec.unique()
    ]
    # Builtin float instead of the removed np.float alias.
    cat_count = np.array([len(values) for values in grouped], dtype=float)
    cat_avg = np.array([np.average(values) for values in grouped], dtype=float)

    # Weighted mean of the category means equals the overall mean.
    total_avg = np.sum(cat_count * cat_avg) / np.sum(cat_count)
    numerator = np.sum(cat_count * (cat_avg - total_avg) ** 2)
    denominator = np.sum((cont_values - total_avg) ** 2)

    if denominator == 0:
        # Constant continuous variable: eta is undefined.
        return np.nan

    return np.sqrt(numerator / denominator)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement