Someone is wrong on the internet!

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d434c129",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "from patsy import dmatrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "2249d109",
   "metadata": {},
   "outputs": [],
   "source": [
    "#def file_reader(fname):\n",
    "#    d = defaultdict(float)\n",
    "#    with open(fname, 'r') as f:\n",
    "#        for line in f:\n",
    "#            bits = line.strip().split()\n",
    "#            if len(bits) < 2:\n",
    "#                continue\n",
    "#            d[bits[0]] = float(bits[-1])\n",
    "#    return d\n",
    "\n",
    "LETTERS = [chr(i) for i in range(ord('A'), ord('Z') + 1)]\n",
    "#uk = file_reader('uk.txt')\n",
    "#usa = file_reader('usa.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "d7df9b9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://one-name.org/modern-british-surnames/teaching/your-surname/micro-scale/\n",
    "# Data from UK 1957\n",
    "\n",
    "uk =    {'A': 3.0,\n",
    "         'B': 10.5,\n",
    "         'C': 7.9,\n",
    "         'D': 4.5,\n",
    "         'E': 2.2,\n",
    "         'F': 3.4,\n",
    "         'G': 4.9,\n",
    "         'H': 8.8,\n",
    "         'I': 0.4,\n",
    "         'J': 3.2,\n",
    "         'K': 2.1,\n",
    "         'L': 4.1,\n",
    "         'M': 8.9,\n",
    "         'N': 1.6,\n",
    "         'O': 1.3,\n",
    "         'P': 5.5,\n",
    "         'Q': 0.1,\n",
    "         'R': 5.1,\n",
    "         'S': 8.9,\n",
    "         'T': 4.1,\n",
    "         'U': 0.2,\n",
    "         'V': 0.4,\n",
    "         'W': 8.5,\n",
    "         'X': 0.0,\n",
    "         'Y': 0.0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "5e926dec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://www.dataminingdna.com/most-common-first-letters-of-last-names/\n",
    "# Data from USA 2010 (whole population)\n",
    "\n",
    "usa =   {'M': 9.6,\n",
    "         'S': 9.4,\n",
    "         'B': 8.5,\n",
    "         'C': 7.7,\n",
    "         'H': 7.1,\n",
    "         'R': 5.9,\n",
    "         'G': 5.6,\n",
    "         'W': 5.5,\n",
    "         'P': 5.0,\n",
    "         'L': 4.9,\n",
    "         'D': 4.5,\n",
    "         'A': 3.8,\n",
    "         'T': 3.5,\n",
    "         'F': 3.4,\n",
    "         'K': 3.3,\n",
    "         'J': 3.0,\n",
    "         'N': 1.9,\n",
    "         'E': 1.9,\n",
    "         'V': 1.8,\n",
    "         'O': 1.5,\n",
    "         'Y': 0.6,\n",
    "         'Z': 0.6,\n",
    "         'I': 0.4,\n",
    "         'Q': 0.2,\n",
    "         'U': 0.2,\n",
    "         'X': 0.04}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb178ed8",
   "metadata": {},
   "source": [
    "### Rationale\n",
    "\n",
    "My assumption is that difference between \"UK distribution in 1957\" and \"USA distribution in 2010\" is either similar in magnitude to or less than the difference between \"Hispanic distribution in 2010\" and \"Non-Hispanic distribution in 2010\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "51147989",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = sum(uk.values())\n",
    "uk = defaultdict(float, {k: v/s for k, v in uk.items()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "9b3d0267",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = sum(usa.values())\n",
    "usa = defaultdict(float, {k: v/s for k, v in usa.items()})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0007b84e",
   "metadata": {},
   "source": [
    "### Parameters\n",
    "\n",
    "* `DELTA` represents the difference (in units of sigma) between our two subpopulations (Hispanic and Non-Hispanic, or UK and USA)\n",
    "* `WEIGHT` represents the proportion of people in the former subpopulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c49ff62d",
   "metadata": {},
   "outputs": [],
   "source": [
    "DELTA = 0.5\n",
    "WEIGHT = 0.19"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3fc8e87d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A': 0.036552218489341974,\n",
       " 'B': 0.08899045702038924,\n",
       " 'C': 0.0775402330475749,\n",
       " 'D': 0.04509275081093604,\n",
       " 'E': 0.019611450610132836,\n",
       " 'F': 0.03407007839048501,\n",
       " 'G': 0.05478008186592523,\n",
       " 'H': 0.07438931205591595,\n",
       " 'I': 0.004008244516527648,\n",
       " 'J': 0.030443359978375033,\n",
       " 'K': 0.030778860634847072,\n",
       " 'L': 0.047574890909793006,\n",
       " 'M': 0.09486252703120171,\n",
       " 'N': 0.01846687229687982,\n",
       " 'O': 0.01464939083256101,\n",
       " 'P': 0.05105687171763978,\n",
       " 'Q': 0.0018133592060549889,\n",
       " 'R': 0.057595502201112134,\n",
       " 'S': 0.09323993087735556,\n",
       " 'T': 0.03621671783286993,\n",
       " 'U': 0.002004122258263824,\n",
       " 'V': 0.015366417593450723,\n",
       " 'W': 0.06083625366852022,\n",
       " 'X': 0.0003245192307692307,\n",
       " 'Y': 0.00486778846153846,\n",
       " 'Z': 0.00486778846153846}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def combine(d1, d2, wt, alpha, beta):\n",
    "    return {k: alpha*wt*d1[k] + beta*(1 - wt)*d2[k] for k in d1 | d2}\n",
    "\n",
    "combined = combine(uk, usa, WEIGHT, 1, 1)\n",
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "80a3015a",
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(combined.values())\n",
    "assert(abs(sum(combined.values()) - 1) < 1e-9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ebad4268",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A': 0.07828377869777947,\n",
       " 'B': 0.11254083388592143,\n",
       " 'C': 0.0971771719802003,\n",
       " 'D': 0.09518533683373878,\n",
       " 'E': 0.1069983866064957,\n",
       " 'F': 0.0951853368337388,\n",
       " 'G': 0.08531741136414106,\n",
       " 'H': 0.11283306788587572,\n",
       " 'I': 0.0951853368337388,\n",
       " 'J': 0.1002586060641616,\n",
       " 'K': 0.06507752421234884,\n",
       " 'L': 0.0821997170250188,\n",
       " 'M': 0.08948692480541899,\n",
       " 'N': 0.08264011323284749,\n",
       " 'O': 0.08464241643423065,\n",
       " 'P': 0.10274785272303556,\n",
       " 'Q': 0.05259935581760589,\n",
       " 'R': 0.0844589880358986,\n",
       " 'S': 0.09104420974377642,\n",
       " 'T': 0.10797893360540431,\n",
       " 'U': 0.0951853368337388,\n",
       " 'V': 0.024828565415291062,\n",
       " 'W': 0.13326641977414694,\n",
       " 'X': 0.0,\n",
       " 'Y': 0.0,\n",
       " 'Z': 0.0}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "means = {k: DELTA*WEIGHT*uk[k]/(WEIGHT*uk[k] + (1 - WEIGHT)*usa[k]) for k in LETTERS}\n",
    "means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c2712813",
   "metadata": {},
   "outputs": [],
   "source": [
    "grand_mean = DELTA*WEIGHT\n",
    "grand_mean\n",
    "\n",
    "assert(abs( sum(means[k]*combined[k] for k in LETTERS) - grand_mean ) < 1e-9)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b3b77df",
   "metadata": {},
   "source": [
    "### Now we pass over to numpy\n",
    "\n",
    "The plan is to do an OLS regression of Height on Letter and compute the F-statistic and its p-value. If F is larger than expected, that will reject the null hypothesis that Letter has no effect on Height."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "8e7ea653",
   "metadata": {},
   "outputs": [],
   "source": [
    "v_ltrs = np.array(LETTERS)\n",
    "v_probs_uk = np.array([uk[k] for k in LETTERS])\n",
    "v_probs_usa = np.array([usa[k] for k in LETTERS])\n",
    "\n",
    "def gen_sample(N):\n",
    "    idents = np.random.binomial(1, 0.19, N)\n",
    "    ltrs_uk = np.random.choice(v_ltrs, N, p=v_probs_uk)\n",
    "    ltrs_usa = np.random.choice(v_ltrs, N, p=v_probs_usa)\n",
    "    ltrs = np.where(idents == 0, ltrs_usa, ltrs_uk)\n",
    "    heights = idents*DELTA + np.random.normal(0, 1, N)\n",
    "    df = pd.DataFrame(data=np.column_stack((idents, ltrs, heights)))\n",
    "    df.columns = ('Ident', 'Letter', 'Height')\n",
    "    df = df.astype({'Height': 'float'})\n",
    "    return df\n",
    "\n",
    "#df = gen_sample(10000)\n",
    "\n",
    "def fit(df):\n",
    "    y, X = dmatrices('Height ~ Letter', data=df, return_type='dataframe')\n",
    "    mod = sm.OLS(y, X)\n",
    "    res = mod.fit()\n",
    "    return res\n",
    "\n",
    "#f = fit(df)\n",
    "\n",
    "#print(f.f_pvalue, f.f_test, f.fvalue)\n",
    "#print(f.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "28dac3c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def trials(K, N):\n",
    "    return np.array([fit(gen_sample(N)).f_pvalue for _ in range(K)])\n",
    "\n",
    "DELTA = 0.5\n",
    "\n",
    "gold = trials(1000, 100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "8cd063d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "978"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(1 if p < 0.05 else 0 for p in gold)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8052b17a",
   "metadata": {},
   "source": [
    "The previous result tells us that 98% of the time, a sample of 100,000 is sufficient to reject the null hypothesis, assuming `DELTA = 0.5` (which is unrealistically low)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b6fd3904",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "475"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DELTA = 1\n",
    "\n",
    "silver = trials(500, 25000)\n",
    "sum(1 if p < 0.05 else 0 for p in silver)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b58a5afa",
   "metadata": {},
   "source": [
    "The previous result tells us that when `DELTA = 1`, a sample of 25,000 is sufficient to reject the null hypothesis 95% of the tme."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}