Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def cdf_diff(df, var, grp='label', col=None, rm_outlier=None, hard_lim=None, ax=None, xlim=None):
    '''Plot cumulative distributions of multiple groups for comparison.

    When ``var`` takes exactly two distinct values, a bar chart of the
    per-group mean is drawn instead of CDF curves.

    Arguments:
        df: DataFrame
        var: string, name of column to be plotted
        grp: string, grouping variable
        col: list, colors to use for each group. Defaults to
            ['green', 'red']. A group label that converts to a valid integer
            index selects ``col[int(label)]``; any other label falls back to
            the group's position (in order of first appearance), cycling
            through ``col``.
        rm_outlier: None|float, remove datapoints beyond this many sigma.
        hard_lim: None|(low, high), keep only datapoints strictly between
            ``low`` and ``high``.
        ax: axis on which to plot. Default None creates a new figure.
        xlim: None|(left, right), forwarded to ``ax.set_xlim``.

    Returns:
        The axis that was drawn on.

    Examples:
        cdf_diff(feats_labeled,var='Creatinine' ,rm_outlier=4.0)
        fig, ax = plt.subplots(1, 2)
        psLearn.cdf_diff(feats_labeled,var='Creatinine' ,ax=ax[0],rm_outlier=4.0)
        psLearn.cdf_diff(feats_labeled,var='Sodium Level',ax=ax[1])
    '''
    if col is None:
        col = ['green', 'red']
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # Two-valued variables are summarised as per-group means.
    if len(df[var].unique()) == 2:
        df.groupby(grp)[var].mean().plot(ax=ax, kind='bar', color=col)
        ax.set_title(var)
        return ax

    for position, g in enumerate(df[grp].unique()):
        sample = df[df[grp] == g][var]
        sample = sample[np.isfinite(sample.values)]
        if rm_outlier is not None:
            sigma = sample.std()
            mu = sample.mean()
            sample = sample[(sample > mu - rm_outlier * sigma)
                            & (sample < mu + rm_outlier * sigma)]
        # NOTE(review): the pasted source lost its indentation; hard_lim is
        # assumed to be independent of rm_outlier -- confirm.
        if hard_lim is not None:
            sample = sample[(sample > hard_lim[0]) & (sample < hard_lim[1])]
        if len(sample) == 0:
            # Nothing finite left for this group (all-NaN data, NaN label,
            # or filters removed everything): there is no curve to draw.
            continue
        x, y = _ecdf_curve(sample.values)
        ax.step(x, y, label='%s = %s' % (grp, str(g)),
                c=_group_color(col, g, position))

    # NOTE(review): assumed to belong to the CDF branch only (after the loop);
    # the original indentation is not recoverable from the paste -- confirm.
    ax.set_title(var)
    ax.set_ylim([0, 1])
    if xlim is not None and len(xlim) > 0:
        ax.set_xlim(xlim)
    return ax


def _ecdf_curve(values, upper_quantile=0.99, n_points=1000):
    '''Return (x, y) samples of the empirical CDF of a non-empty 1-D array.

    The x range spans the observations whose cumulative probability is below
    ``upper_quantile`` so a long upper tail does not stretch the plot; the
    probabilities are still computed against the full sample.
    '''
    ordered = np.sort(np.asarray(values, dtype=float))
    total = float(ordered.size)
    # P(X <= v) for every observation: same right-continuous step function
    # as statsmodels' ECDF with its default side='right'.
    cum_prob = np.searchsorted(ordered, ordered, side='right') / total
    kept = ordered[cum_prob < upper_quantile]
    if kept.size == 0:
        # Single-observation or constant samples have every point at
        # probability 1.0; keep them rather than failing on an empty range.
        kept = ordered
    x = np.linspace(kept[0], kept[-1], n_points)
    y = np.searchsorted(ordered, x, side='right') / total
    return x, y


def _group_color(col, g, position):
    '''Pick the color for group label ``g``, the ``position``-th group seen.'''
    try:
        # Original behaviour: integer-like labels index straight into col.
        return col[int(g)]
    except (TypeError, ValueError, IndexError):
        # Non-numeric or out-of-range labels: assign by order of appearance.
        return col[position % len(col)]
Add Comment
Please, Sign In to add comment