
Someone is wrong on the internet!

a guest
Jan 21st, 2023
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d434c129",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "from patsy import dmatrices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "2249d109",
   "metadata": {},
   "outputs": [],
   "source": [
    "#def file_reader(fname):\n",
    "#    d = defaultdict(float)\n",
    "#    with open(fname, 'r') as f:\n",
    "#        for line in f:\n",
    "#            bits = line.strip().split()\n",
    "#            if len(bits) < 2:\n",
    "#                continue\n",
    "#            d[bits[0]] = float(bits[-1])\n",
    "#    return d\n",
    "\n",
    "LETTERS = [chr(i) for i in range(ord('A'), ord('Z') + 1)]\n",
    "#uk = file_reader('uk.txt')\n",
    "#usa = file_reader('usa.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "d7df9b9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://one-name.org/modern-british-surnames/teaching/your-surname/micro-scale/\n",
    "# Data from UK 1957\n",
    "# Note: 'Z' is absent from this source; the defaultdict wrapper below treats it as 0.0\n",
    "\n",
    "uk =    {'A': 3.0,\n",
    "         'B': 10.5,\n",
    "         'C': 7.9,\n",
    "         'D': 4.5,\n",
    "         'E': 2.2,\n",
    "         'F': 3.4,\n",
    "         'G': 4.9,\n",
    "         'H': 8.8,\n",
    "         'I': 0.4,\n",
    "         'J': 3.2,\n",
    "         'K': 2.1,\n",
    "         'L': 4.1,\n",
    "         'M': 8.9,\n",
    "         'N': 1.6,\n",
    "         'O': 1.3,\n",
    "         'P': 5.5,\n",
    "         'Q': 0.1,\n",
    "         'R': 5.1,\n",
    "         'S': 8.9,\n",
    "         'T': 4.1,\n",
    "         'U': 0.2,\n",
    "         'V': 0.4,\n",
    "         'W': 8.5,\n",
    "         'X': 0.0,\n",
    "         'Y': 0.0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "5e926dec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# From https://www.dataminingdna.com/most-common-first-letters-of-last-names/\n",
    "# Data from USA 2010 (whole population)\n",
    "\n",
    "usa =   {'M': 9.6,\n",
    "         'S': 9.4,\n",
    "         'B': 8.5,\n",
    "         'C': 7.7,\n",
    "         'H': 7.1,\n",
    "         'R': 5.9,\n",
    "         'G': 5.6,\n",
    "         'W': 5.5,\n",
    "         'P': 5.0,\n",
    "         'L': 4.9,\n",
    "         'D': 4.5,\n",
    "         'A': 3.8,\n",
    "         'T': 3.5,\n",
    "         'F': 3.4,\n",
    "         'K': 3.3,\n",
    "         'J': 3.0,\n",
    "         'N': 1.9,\n",
    "         'E': 1.9,\n",
    "         'V': 1.8,\n",
    "         'O': 1.5,\n",
    "         'Y': 0.6,\n",
    "         'Z': 0.6,\n",
    "         'I': 0.4,\n",
    "         'Q': 0.2,\n",
    "         'U': 0.2,\n",
    "         'X': 0.04}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb178ed8",
   "metadata": {},
   "source": [
    "### Rationale\n",
    "\n",
    "My assumption is that the difference between the \"UK distribution in 1957\" and the \"USA distribution in 2010\" is either similar in magnitude to, or smaller than, the difference between the \"Hispanic distribution in 2010\" and the \"Non-Hispanic distribution in 2010\". (Both distributions are normalized in the next two cells, and their gap is quantified as a rough check below.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "51147989",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = sum(uk.values())\n",
    "uk = defaultdict(float, {k: v/s for k, v in uk.items()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "9b3d0267",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = sum(usa.values())\n",
    "usa = defaultdict(float, {k: v/s for k, v in usa.items()})"
   ]
  },
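  {
   "cell_type": "markdown",
   "id": "a1f0c2d4",
   "metadata": {},
   "source": [
    "As a rough check on the rationale above, the cell below computes the total variation distance between the two normalized letter distributions, $\\mathrm{TV}(p, q) = \\tfrac{1}{2}\\sum_k |p_k - q_k|$. This is only a sketch of how far apart the UK and USA distributions are; we have no comparable figure for the Hispanic/Non-Hispanic gap."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2e1d3f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total variation distance: half the L1 distance between the two distributions.\n",
    "# Relies on uk and usa having been normalized in the cells above.\n",
    "0.5 * sum(abs(uk[k] - usa[k]) for k in LETTERS)"
   ]
  },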
  {
   "cell_type": "markdown",
   "id": "0007b84e",
   "metadata": {},
   "source": [
    "### Parameters\n",
    "\n",
    "* `DELTA` is the difference (in units of sigma) between the mean heights of our two subpopulations (Hispanic and Non-Hispanic, or UK and USA)\n",
    "* `WEIGHT` is the proportion of people in the first subpopulation (Hispanic, or UK)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c49ff62d",
   "metadata": {},
   "outputs": [],
   "source": [
    "DELTA = 0.5\n",
    "WEIGHT = 0.19"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3fc8e87d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A': 0.036552218489341974,\n",
       " 'B': 0.08899045702038924,\n",
       " 'C': 0.0775402330475749,\n",
       " 'D': 0.04509275081093604,\n",
       " 'E': 0.019611450610132836,\n",
       " 'F': 0.03407007839048501,\n",
       " 'G': 0.05478008186592523,\n",
       " 'H': 0.07438931205591595,\n",
       " 'I': 0.004008244516527648,\n",
       " 'J': 0.030443359978375033,\n",
       " 'K': 0.030778860634847072,\n",
       " 'L': 0.047574890909793006,\n",
       " 'M': 0.09486252703120171,\n",
       " 'N': 0.01846687229687982,\n",
       " 'O': 0.01464939083256101,\n",
       " 'P': 0.05105687171763978,\n",
       " 'Q': 0.0018133592060549889,\n",
       " 'R': 0.057595502201112134,\n",
       " 'S': 0.09323993087735556,\n",
       " 'T': 0.03621671783286993,\n",
       " 'U': 0.002004122258263824,\n",
       " 'V': 0.015366417593450723,\n",
       " 'W': 0.06083625366852022,\n",
       " 'X': 0.0003245192307692307,\n",
       " 'Y': 0.00486778846153846,\n",
       " 'Z': 0.00486778846153846}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def combine(d1, d2, wt, alpha, beta):\n",
    "    # Mixture of the two letter distributions: weight wt on d1, (1 - wt) on d2;\n",
    "    # alpha and beta allow optional rescaling of each component.\n",
    "    return {k: alpha*wt*d1[k] + beta*(1 - wt)*d2[k] for k in d1 | d2}\n",
    "\n",
    "combined = combine(uk, usa, WEIGHT, 1, 1)\n",
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "80a3015a",
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(combined.values())\n",
    "assert abs(sum(combined.values()) - 1) < 1e-9"
   ]
  },
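  {
   "cell_type": "markdown",
   "id": "c3f2e4a6",
   "metadata": {},
   "source": [
    "A quick derivation for the next cell: conditional on a first letter $k$, the expected standardized height is $\\mathbb{E}[H \\mid k] = \\Delta \\cdot P(\\mathrm{UK} \\mid k)$, and by Bayes' rule\n",
    "\n",
    "$$P(\\mathrm{UK} \\mid k) = \\frac{w \\, p_{\\mathrm{UK}}(k)}{w \\, p_{\\mathrm{UK}}(k) + (1 - w) \\, p_{\\mathrm{USA}}(k)},$$\n",
    "\n",
    "where $w$ is `WEIGHT` and $\\Delta$ is `DELTA`."
   ]
  },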
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ebad4268",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'A': 0.07828377869777947,\n",
       " 'B': 0.11254083388592143,\n",
       " 'C': 0.0971771719802003,\n",
       " 'D': 0.09518533683373878,\n",
       " 'E': 0.1069983866064957,\n",
       " 'F': 0.0951853368337388,\n",
       " 'G': 0.08531741136414106,\n",
       " 'H': 0.11283306788587572,\n",
       " 'I': 0.0951853368337388,\n",
       " 'J': 0.1002586060641616,\n",
       " 'K': 0.06507752421234884,\n",
       " 'L': 0.0821997170250188,\n",
       " 'M': 0.08948692480541899,\n",
       " 'N': 0.08264011323284749,\n",
       " 'O': 0.08464241643423065,\n",
       " 'P': 0.10274785272303556,\n",
       " 'Q': 0.05259935581760589,\n",
       " 'R': 0.0844589880358986,\n",
       " 'S': 0.09104420974377642,\n",
       " 'T': 0.10797893360540431,\n",
       " 'U': 0.0951853368337388,\n",
       " 'V': 0.024828565415291062,\n",
       " 'W': 0.13326641977414694,\n",
       " 'X': 0.0,\n",
       " 'Y': 0.0,\n",
       " 'Z': 0.0}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "means = {k: DELTA*WEIGHT*uk[k]/(WEIGHT*uk[k] + (1 - WEIGHT)*usa[k]) for k in LETTERS}\n",
    "means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c2712813",
   "metadata": {},
   "outputs": [],
   "source": [
    "grand_mean = DELTA*WEIGHT\n",
    "grand_mean\n",
    "\n",
    "assert abs(sum(means[k]*combined[k] for k in LETTERS) - grand_mean) < 1e-9"
   ]
  },
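  {
   "cell_type": "markdown",
   "id": "d4a3f5b7",
   "metadata": {},
   "source": [
    "The assertion above is the law of total expectation: the letter-conditional means, averaged under the combined letter distribution, must recover the grand mean, $\\sum_k P(k) \\, \\mathbb{E}[H \\mid k] = \\mathbb{E}[H] = \\Delta w$."
   ]
  },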
  {
   "cell_type": "markdown",
   "id": "0b3b77df",
   "metadata": {},
   "source": [
    "### Now we pass over to numpy\n",
    "\n",
    "The plan is to run an OLS regression of Height on Letter and compute the F-statistic and its p-value. If the F-statistic is larger than we would expect under the null, we reject the null hypothesis that Letter has no effect on Height."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "8e7ea653",
   "metadata": {},
   "outputs": [],
   "source": [
    "v_ltrs = np.array(LETTERS)\n",
    "v_probs_uk = np.array([uk[k] for k in LETTERS])\n",
    "v_probs_usa = np.array([usa[k] for k in LETTERS])\n",
    "\n",
    "def gen_sample(N):\n",
    "    # Subpopulation labels: 1 = UK-type with probability WEIGHT, else 0 = USA-type.\n",
    "    idents = np.random.binomial(1, WEIGHT, N)\n",
    "    ltrs_uk = np.random.choice(v_ltrs, N, p=v_probs_uk)\n",
    "    ltrs_usa = np.random.choice(v_ltrs, N, p=v_probs_usa)\n",
    "    ltrs = np.where(idents == 0, ltrs_usa, ltrs_uk)\n",
    "    # Heights: a DELTA-sigma shift for the UK-type subpopulation plus unit-variance noise.\n",
    "    heights = idents*DELTA + np.random.normal(0, 1, N)\n",
    "    return pd.DataFrame({'Ident': idents, 'Letter': ltrs, 'Height': heights})\n",
    "\n",
    "#df = gen_sample(10000)\n",
    "\n",
    "def fit(df):\n",
    "    y, X = dmatrices('Height ~ Letter', data=df, return_type='dataframe')\n",
    "    mod = sm.OLS(y, X)\n",
    "    res = mod.fit()\n",
    "    return res\n",
    "\n",
    "#f = fit(df)\n",
    "\n",
    "#print(f.f_pvalue, f.f_test, f.fvalue)\n",
    "#print(f.summary())"
   ]
  },
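  {
   "cell_type": "markdown",
   "id": "e5b4a6c8",
   "metadata": {},
   "source": [
    "A quick smoke test of the machinery above (the commented-out lines in the previous cell sketch the same usage at N = 10,000):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6c5b7d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One simulated dataset and one OLS fit. At N = 2,000 the test is\n",
    "# underpowered for DELTA = 0.5, so a large p-value here is unsurprising.\n",
    "res = fit(gen_sample(2000))\n",
    "print(res.fvalue, res.f_pvalue)"
   ]
  },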
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "28dac3c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def trials(K, N):\n",
    "    # p-values from K independent simulated datasets of size N.\n",
    "    return np.array([fit(gen_sample(N)).f_pvalue for _ in range(K)])\n",
    "\n",
    "DELTA = 0.5\n",
    "\n",
    "gold = trials(1000, 100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "8cd063d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "978"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(1 if p < 0.05 else 0 for p in gold)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8052b17a",
   "metadata": {},
   "source": [
    "The previous result tells us that, assuming `DELTA = 0.5` (which is unrealistically low), a sample of 100,000 is sufficient to reject the null hypothesis about 98% of the time (978 of 1,000 trials)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b6fd3904",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "475"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DELTA = 1\n",
    "\n",
    "silver = trials(500, 25000)\n",
    "sum(1 if p < 0.05 else 0 for p in silver)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b58a5afa",
   "metadata": {},
   "source": [
    "The previous result tells us that when `DELTA = 1`, a sample of 25,000 is sufficient to reject the null hypothesis 95% of the time (475 of 500 trials)."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}