Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Sep 18 12:54:05 PDT 2019\r\n"
     ]
    }
   ],
   "source": [
    "import numpy as np, matplotlib.pyplot as plt, pandas as pd\n",
    "pd.set_option('display.max_rows', 8)\n",
    "!date\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How to calculate the probability an individual lives in a household with an Active TB case\n",
    "\n",
    "(for a given year and location, say 2017 South Africa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>hh_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>241</th>\n",
       "      <td>72.145127</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4435</th>\n",
       "      <td>23.869853</td>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>624</th>\n",
       "      <td>54.463637</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9979</th>\n",
       "      <td>6.070569</td>\n",
       "      <td>female</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4546</th>\n",
       "      <td>81.487253</td>\n",
       "      <td>male</td>\n",
       "      <td>2999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8625</th>\n",
       "      <td>79.135925</td>\n",
       "      <td>male</td>\n",
       "      <td>2999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>266</th>\n",
       "      <td>72.139917</td>\n",
       "      <td>male</td>\n",
       "      <td>2999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8846</th>\n",
       "      <td>34.395619</td>\n",
       "      <td>female</td>\n",
       "      <td>2999</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            age     sex  hh_id\n",
       "241   72.145127  female      1\n",
       "4435  23.869853    male      1\n",
       "624   54.463637  female      1\n",
       "9979   6.070569  female      2\n",
       "...         ...     ...    ...\n",
       "4546  81.487253    male   2999\n",
       "8625  79.135925    male   2999\n",
       "266   72.139917    male   2999\n",
       "8846  34.395619  female   2999\n",
       "\n",
       "[10000 rows x 3 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# first load data on individuals, including their age, sex, and household ID\n",
    "\n",
    "# I'll just simulate it for now\n",
    "N = 10_000\n",
    "\n",
    "# set random seed for reproducibility\n",
    "np.random.seed(12345)\n",
    "\n",
    "# simulate data (to be replaced with real data, e.g. from DHS, eventually)\n",
    "df = pd.DataFrame(index=range(N))\n",
    "df['age'] = np.random.uniform(0, 100, size=N)\n",
    "df['sex'] = np.random.choice(['male', 'female'], size=N)\n",
    "df['hh_id'] = np.random.choice(range(3_000), size=N)\n",
    "df.sort_values('hh_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# then for a given age_group/sex combo\n",
    "age_start = 1\n",
    "age_end = 5\n",
    "sex = 'male'\n",
    "\n",
    "# find all the households with such a person\n",
    "hh_with = df.query(f'age >= {age_start} and age < {age_end} and sex == \"{sex}\"').hh_id.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "________________________________________________________________________________\n",
      "[Memory] Calling vivarium_gbd_access.gbd.get_incidence_prevalence...\n",
      "get_incidence_prevalence(entity_id=cid(954), location_id=196, entity_type='cause')\n",
      "_______________________________________get_incidence_prevalence - 177.5s, 3.0min\n"
     ]
    }
   ],
   "source": [
    "# then for each of those households, compute the\n",
    "# probability that there is at least one person with active tb\n",
    "\n",
    "import vivarium_inputs, gbd_mapping\n",
    "prev_ltbi = vivarium_inputs.interface.get_measure(\n",
    "    gbd_mapping.causes.latent_tuberculosis_infection, 'prevalence', 'South Africa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "________________________________________________________________________________\n",
      "[Memory] Calling vivarium_gbd_access.gbd.get_incidence_prevalence...\n",
      "get_incidence_prevalence(entity_id=cid(297), location_id=196, entity_type='cause')\n",
      "_______________________________________get_incidence_prevalence - 103.1s, 1.7min\n"
     ]
    }
   ],
   "source": [
    "prev_any_tb = vivarium_inputs.interface.get_measure(\n",
    "    gbd_mapping.causes.tuberculosis, 'prevalence', 'South Africa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>draw</th>\n",
       "      <th>location</th>\n",
       "      <th>sex</th>\n",
       "      <th>age_group_start</th>\n",
       "      <th>age_group_end</th>\n",
       "      <th>year_start</th>\n",
       "      <th>year_end</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">0</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">South Africa</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">Female</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">0.0</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">0.019178</th>\n",
       "      <th>1990</th>\n",
       "      <th>1991</th>\n",
       "      <td>0.000047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1991</th>\n",
       "      <th>1992</th>\n",
       "      <td>0.000038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1992</th>\n",
       "      <th>1993</th>\n",
       "      <td>0.000030</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1993</th>\n",
       "      <th>1994</th>\n",
       "      <td>0.000022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">999</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">South Africa</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">Male</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">95.0</th>\n",
       "      <th rowspan=\"4\" valign=\"top\">125.000000</th>\n",
       "      <th>2014</th>\n",
       "      <th>2015</th>\n",
       "      <td>0.009573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2015</th>\n",
       "      <th>2016</th>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016</th>\n",
       "      <th>2017</th>\n",
       "      <td>0.008744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2017</th>\n",
       "      <th>2018</th>\n",
       "      <td>0.008260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1288000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                               value\n",
       "draw location     sex    age_group_start age_group_end year_start year_end          \n",
       "0    South Africa Female 0.0             0.019178      1990       1991      0.000047\n",
       "                                                       1991       1992      0.000038\n",
       "                                                       1992       1993      0.000030\n",
       "                                                       1993       1994      0.000022\n",
       "...                                                                              ...\n",
       "999  South Africa Male   95.0            125.000000    2014       2015      0.009573\n",
       "                                                       2015       2016      0.009185\n",
       "                                                       2016       2017      0.008744\n",
       "                                                       2017       2018      0.008260\n",
       "\n",
       "[1288000 rows x 1 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index_cols = ['draw', 'location', 'sex', 'age_group_start', 'age_group_end', 'year_start', 'year_end']\n",
    "prev_active_tb = prev_any_tb.set_index(index_cols) - prev_ltbi.set_index(index_cols)\n",
    "prev_active_tb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# there is probably a way to use vivarium to add a column to df\n",
    "# that includes the probability of active tb for each simulant!\n",
    "\n",
    "# I'll just simulate this for now, too\n",
    "df['pr_active_tb'] = np.random.uniform(0, .01, size=N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.009567952684606862"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def pr_ac_tb_in_hh(df_hh):\n",
    "    pr_no_tb = 1 - df_hh.pr_active_tb\n",
    "    pr_no_tb_in_hh = np.prod(pr_no_tb)\n",
    "    return 1 - pr_no_tb_in_hh\n",
    "pr_ac_tb_in_hh(df[df.hh_id == 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>hh_id</th>\n",
       "      <th>pr_active_tb</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>241</th>\n",
       "      <td>72.145127</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>0.001116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>624</th>\n",
       "      <td>54.463637</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>0.004184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4435</th>\n",
       "      <td>23.869853</td>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "      <td>0.004295</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            age     sex  hh_id  pr_active_tb\n",
       "241   72.145127  female      1      0.001116\n",
       "624   54.463637  female      1      0.004184\n",
       "4435  23.869853    male      1      0.004295"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.hh_id==1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "hh_id\n",
       "5       0.003730\n",
       "58      0.028011\n",
       "68      0.010399\n",
       "72      0.011235\n",
       "          ...   \n",
       "2939    0.014072\n",
       "2953    0.015080\n",
       "2958    0.028687\n",
       "2963    0.022994\n",
       "Length: 175, dtype: float64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pr_ac_tb = df[df.hh_id.isin(hh_with)].groupby('hh_id').apply(pr_ac_tb_in_hh)\n",
    "pr_ac_tb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.01976982850644556"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pr_ac_tb_age_sex = pr_ac_tb.mean()\n",
    "pr_ac_tb_age_sex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now repeat that for each age and sex group, and you\n",
    "# will get the probability of an active TB case in the household\n",
    "# as a function of age and sex for the location and year\n",
    "\n",
    "age_bins = [0, 1, 5, 10] + list(range(15, 101, 5))\n",
    "for i, age_start in enumerate(age_bins[:-1]):\n",
    "    age_end = age_bins[i+1]\n",
    "    for sex in ['male', 'female']:\n",
    "        # find and store pr_ac_tb_age_sex\n",
    "        pr_ac_tb_age_sex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# and do that for each draw to propagate through the uncertainty\n",
    "# probably good to do a bootstrap resampling of the person data, too"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vivarium_conic_sqlns",
   "language": "python",
   "name": "vivarium_conic_sqlns"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}