Untitled

def corr_ratio(cont_vec, cat_vec):
    """
    Calculate the correlation ratio (eta) between a continuous variable and
    a categorical variable.

    Eta is the square root of the between-category sum of squares divided by
    the total sum of squares of the continuous variable. It is 0 when every
    category has the same mean and 1 when the category fully determines the
    continuous value.

    Parameters
    ----------
    cont_vec : numpy 1-d ndarray, pandas Series or other 1-d array-like
        Vector for continuous data

    cat_vec : numpy 1-d ndarray, pandas Series or other 1-d array-like
        Vector for categorical data

    Returns
    -------
    eta : float
        Correlation ratio. Eta is the greek letter eta, not an acronym.
        NaN is returned when the ratio is undefined, i.e. when no complete
        rows remain or the continuous values have zero variance.

    Notes
    -----
    Rows where either value is missing (NaN/None) are dropped. Two Series
    are paired by index label (as ``pd.concat`` does); any input that is not
    a Series is paired positionally with the other one.
    """
    # pd.concat only accepts pandas objects, so wrap ndarrays / lists first.
    # A non-Series input borrows the index of the Series it is paired with so
    # that the two line up position by position.
    cat_index = cat_vec.index if isinstance(cat_vec, pd.Series) else None
    cont_index = cont_vec.index if isinstance(cont_vec, pd.Series) else None
    if cat_index is None:
        cat_vec = pd.Series(cat_vec, index=cont_index)
    if cont_index is None:
        cont_vec = pd.Series(cont_vec, index=cat_index)

    # make sure to drop rows with NaN values in any column
    paired = pd.concat([cat_vec, cont_vec], axis=1).dropna()

    # unpack to Series (by position: the two inputs may share a name)
    categories = paired.iloc[:, 0]
    values = paired.iloc[:, 1].astype(float)

    total_avg = values.mean()
    denominator = ((values - total_avg) ** 2).sum()

    # Zero total variance (constant or empty data) makes eta undefined;
    # return NaN explicitly instead of relying on a 0/0 RuntimeWarning.
    if not denominator > 0:
        return float("nan")

    # Broadcast each category's mean back onto its rows; summing the squared
    # deviations from the grand mean then weights every category by its row
    # count. sort=False avoids comparing mixed-type labels, observed=True
    # ignores unused levels of a categorical dtype.
    category_avg = values.groupby(
        categories, sort=False, observed=True
    ).transform("mean")
    numerator = ((category_avg - total_avg) ** 2).sum()

    # Clamp tiny floating-point overshoot so eta stays within [0, 1].
    eta = float(np.sqrt(min(numerator / denominator, 1.0)))

    return eta