lpm-vs-logit

#!/usr/bin/env python
# coding: utf-8

### Imports ###

## Requires to install matplotlib, numpy, pandas, statsmodels and palettable ##

import matplotlib.pyplot as plt  # Plotting
import numpy as np  # Linear algebra and random data generation
import pandas as pd  # I/O and data manipulation
import statsmodels.api as sm  # Regressions and summaries
import statsmodels.formula.api as smf  # Wrapper for nice formulas
from palettable.wesanderson import GrandBudapest1_4  # Pretty color schemes
cmap = GrandBudapest1_4.mpl_colors

from numpy.random import default_rng
rng = default_rng()

import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'


def gen_dep_data(N=300, outcome_pos_ratio=0.5):
    """Generate random data related to the outcome variable
    with different strengths of relationships. To do that, I generate a
    uniformly distributed array [0, 1) of size N. I then sample from the two same
    distributions with different parameters depending on the outcome and the
    strength of the relationship.
    """
    rand_arr = rng.uniform(size=N)
    y = rng.choice([0, 1],
                   size=N,
                   p=[1 - outcome_pos_ratio, outcome_pos_ratio])

    λ1, λ2 = 30, 20
    exp1 = rng.exponential(λ1, size=N)
    exp2 = rng.exponential(λ2, size=N)
    weak_rel = 0.2

    weak_rel_exp = np.where((rand_arr < weak_rel) & (y == 1), exp1, exp2)
    k1, θ1 = 2, 5
    k2, θ2 = 1, 6
    med_rel = 0.5
    gamma1 = rng.gamma(k1, θ1, size=N)
    gamma2 = rng.gamma(k2, θ2, size=N)
    med_rel_gamma = np.where((rand_arr < med_rel) & (y == 1), gamma1, gamma2)

    μ1, σ1 = 10, 10
    μ2, σ2 = 5, 20
    gauss1 = rng.normal(μ1, σ1, size=N)
    gauss2 = rng.normal(μ2, σ2, size=N)
    med_rel_gauss = np.where((rand_arr < med_rel) & (y == 1), gauss1, gauss2)
    d1, d2 = 3, 6
    d3, d4 = 5, 8
    strong_rel = 0.8
    f_dist1 = rng.f(d1, d2, size=N)
    f_dist2 = rng.f(d3, d3, size=N)
    strong_rel_fdist = np.where((rand_arr < strong_rel) & (y == 1), f_dist1,
                                f_dist2)

    unrelated = rng.choice([0, 1], size=N)

    data = {
        "y": y,
        "weak_related_exp": weak_rel_exp,
        "med_related_gamma": med_rel_gamma,
        "med_related_gauss": med_rel_gauss,
        "strong_related_fdist": strong_rel_fdist,
        "unrelated": unrelated
    }
    df = pd.DataFrame(data)

    return df


N = 600
df = gen_dep_data(N)


def gen_formula(df, dep_var_name="y"):
    "Create formula for a given dataframe because I'm lazy"
    rhs = "".join(f"{c} + " for c in df.columns if c != "y").rstrip('+ ')
    formula = dep_var_name + " ~ " + rhs
    return formula


formula = gen_formula(df)

lpm = smf.ols(formula, data=df).fit()

logreg = smf.logit(formula, data=df).fit(disp=False)  # no convergence message

lpm_params = lpm.params
logreg_params = logreg.params

col_names = {0: "Linear Probability Model", 1: "Logit Model"}
results = pd.DataFrame((lpm_params, logreg_params)).T.rename(columns=col_names)

m1 = lpm.predict()
m2 = sm.Logit(df["y"].values, m1).fit(disp=False).predict()
m3 = logreg.predict()

# Make m1 behave

m1[m1 < 0], m1[m1 > 1] = 0, 1

random = rng.uniform(size=N)


def pretty_plots(save=False):
    fig, ax = plt.subplots(nrows=1,
                           ncols=3,
                           sharey=True,
                           figsize=(12, 4),
                           dpi=300)
    for idx, m in enumerate([m1, m2, m3]):
        ax[idx].hist(m, bins=40, color=cmap[idx])
        ax[idx].yaxis.set_tick_params(labelbottom=True)
    for i, j in enumerate(
        ["LPM predictions", "y ~ LPM preds", "Logit predictions"]):
        ax[i].set_title(j, pad=10)
    fig.suptitle("Distributions of predicted probabilities",
                 fontsize=16,
                 y=1.05)
    if save:
        plt.savefig("img/lpm-vs-logit.png", dpi=300, bbox_inches="tight")
    plt.show()


pretty_plots()