Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # coding: utf-8
- ### Imports ###
- ## Requires to install matplotlib, numpy, pandas, statsmodels and palettable ##
- import matplotlib.pyplot as plt # Plotting
- import numpy as np # Linear algebra and random data generation
- import pandas as pd # I/O and data manipulation
- import statsmodels.api as sm # Regressions and summaries
- import statsmodels.formula.api as smf # Wrapper for nice formulas
- from palettable.wesanderson import GrandBudapest1_4 # Pretty color schemes
- cmap = GrandBudapest1_4.mpl_colors
- from numpy.random import default_rng
- rng = default_rng()
- import matplotlib
- matplotlib.rcParams['mathtext.fontset'] = 'stix'
- matplotlib.rcParams['font.family'] = 'STIXGeneral'
- def gen_dep_data(N=300, outcome_pos_ratio=0.5):
- """Generate random data related to the outcome variable
- with different strengths of relationships. To do that, I generate a
- uniformly distributed array [0, 1) of size N. I then sample from the two same
- distributions with different parameters depending on the outcome and the
- strength of the relationship.
- """
- rand_arr = rng.uniform(size=N)
- y = rng.choice([0, 1],
- size=N,
- p=[1 - outcome_pos_ratio, outcome_pos_ratio])
- λ1, λ2 = 30, 20
- exp1 = rng.exponential(λ1, size=N)
- exp2 = rng.exponential(λ2, size=N)
- weak_rel = 0.2
- weak_rel_exp = np.where((rand_arr < weak_rel) & (y == 1), exp1, exp2)
- k1, θ1 = 2, 5
- k2, θ2 = 1, 6
- med_rel = 0.5
- gamma1 = rng.gamma(k1, θ1, size=N)
- gamma2 = rng.gamma(k2, θ2, size=N)
- med_rel_gamma = np.where((rand_arr < med_rel) & (y == 1), gamma1, gamma2)
- μ1, σ1 = 10, 10
- μ2, σ2 = 5, 20
- gauss1 = rng.normal(μ1, σ1, size=N)
- gauss2 = rng.normal(μ2, σ2, size=N)
- med_rel_gauss = np.where((rand_arr < med_rel) & (y == 1), gauss1, gauss2)
- d1, d2 = 3, 6
- d3, d4 = 5, 8
- strong_rel = 0.8
- f_dist1 = rng.f(d1, d2, size=N)
- f_dist2 = rng.f(d3, d3, size=N)
- strong_rel_fdist = np.where((rand_arr < strong_rel) & (y == 1), f_dist1,
- f_dist2)
- unrelated = rng.choice([0, 1], size=N)
- data = {
- "y": y,
- "weak_related_exp": weak_rel_exp,
- "med_related_gamma": med_rel_gamma,
- "med_related_gauss": med_rel_gauss,
- "strong_related_fdist": strong_rel_fdist,
- "unrelated": unrelated
- }
- df = pd.DataFrame(data)
- return df
- N = 600
- df = gen_dep_data(N)
- def gen_formula(df, dep_var_name="y"):
- "Create formula for a given dataframe because I'm lazy"
- rhs = "".join(f"{c} + " for c in df.columns if c != "y").rstrip('+ ')
- formula = dep_var_name + " ~ " + rhs
- return formula
- formula = gen_formula(df)
- lpm = smf.ols(formula, data=df).fit()
- logreg = smf.logit(formula, data=df).fit(disp=False) # no convergence message
- lpm_params = lpm.params
- logreg_params = logreg.params
- col_names = {0: "Linear Probability Model", 1: "Logit Model"}
- results = pd.DataFrame((lpm_params, logreg_params)).T.rename(columns=col_names)
- m1 = lpm.predict()
- m2 = sm.Logit(df["y"].values, m1).fit(disp=False).predict()
- m3 = logreg.predict()
- # Make m1 behave
- m1[m1 < 0], m1[m1 > 1] = 0, 1
- random = rng.uniform(size=N)
- def pretty_plots(save=False):
- fig, ax = plt.subplots(nrows=1,
- ncols=3,
- sharey=True,
- figsize=(12, 4),
- dpi=300)
- for idx, m in enumerate([m1, m2, m3]):
- ax[idx].hist(m, bins=40, color=cmap[idx])
- ax[idx].yaxis.set_tick_params(labelbottom=True)
- for i, j in enumerate(
- ["LPM predictions", "y ~ LPM preds", "Logit predictions"]):
- ax[i].set_title(j, pad=10)
- fig.suptitle("Distributions of predicted probabilities",
- fontsize=16,
- y=1.05)
- if save:
- plt.savefig("img/lpm-vs-logit.png", dpi=300, bbox_inches="tight")
- plt.show()
- pretty_plots()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement