Advertisement
Guest User

Untitled

a guest
Oct 15th, 2019
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.40 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stdout",
  10. "output_type": "stream",
  11. "text": [
  12. "Wed Sep 18 12:54:05 PDT 2019\r\n"
  13. ]
  14. }
  15. ],
  16. "source": [
  17. "import numpy as np, matplotlib.pyplot as plt, pandas as pd\n",
  18. "pd.set_option('display.max_rows', 8)\n",
  19. "!date\n",
  20. "\n",
  21. "%load_ext autoreload\n",
  22. "%autoreload 2"
  23. ]
  24. },
  25. {
  26. "cell_type": "markdown",
  27. "metadata": {},
  28. "source": [
  29. "# How to calculate the probability an individual lives in a household with an Active TB case\n",
  30. "\n",
  31. "(for a given year and location, say 2017 South Africa)"
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": 2,
  37. "metadata": {},
  38. "outputs": [
  39. {
  40. "data": {
  41. "text/html": [
  42. "<div>\n",
  43. "<style scoped>\n",
  44. " .dataframe tbody tr th:only-of-type {\n",
  45. " vertical-align: middle;\n",
  46. " }\n",
  47. "\n",
  48. " .dataframe tbody tr th {\n",
  49. " vertical-align: top;\n",
  50. " }\n",
  51. "\n",
  52. " .dataframe thead th {\n",
  53. " text-align: right;\n",
  54. " }\n",
  55. "</style>\n",
  56. "<table border=\"1\" class=\"dataframe\">\n",
  57. " <thead>\n",
  58. " <tr style=\"text-align: right;\">\n",
  59. " <th></th>\n",
  60. " <th>age</th>\n",
  61. " <th>sex</th>\n",
  62. " <th>hh_id</th>\n",
  63. " </tr>\n",
  64. " </thead>\n",
  65. " <tbody>\n",
  66. " <tr>\n",
  67. " <th>241</th>\n",
  68. " <td>72.145127</td>\n",
  69. " <td>female</td>\n",
  70. " <td>1</td>\n",
  71. " </tr>\n",
  72. " <tr>\n",
  73. " <th>4435</th>\n",
  74. " <td>23.869853</td>\n",
  75. " <td>male</td>\n",
  76. " <td>1</td>\n",
  77. " </tr>\n",
  78. " <tr>\n",
  79. " <th>624</th>\n",
  80. " <td>54.463637</td>\n",
  81. " <td>female</td>\n",
  82. " <td>1</td>\n",
  83. " </tr>\n",
  84. " <tr>\n",
  85. " <th>9979</th>\n",
  86. " <td>6.070569</td>\n",
  87. " <td>female</td>\n",
  88. " <td>2</td>\n",
  89. " </tr>\n",
  90. " <tr>\n",
  91. " <th>...</th>\n",
  92. " <td>...</td>\n",
  93. " <td>...</td>\n",
  94. " <td>...</td>\n",
  95. " </tr>\n",
  96. " <tr>\n",
  97. " <th>4546</th>\n",
  98. " <td>81.487253</td>\n",
  99. " <td>male</td>\n",
  100. " <td>2999</td>\n",
  101. " </tr>\n",
  102. " <tr>\n",
  103. " <th>8625</th>\n",
  104. " <td>79.135925</td>\n",
  105. " <td>male</td>\n",
  106. " <td>2999</td>\n",
  107. " </tr>\n",
  108. " <tr>\n",
  109. " <th>266</th>\n",
  110. " <td>72.139917</td>\n",
  111. " <td>male</td>\n",
  112. " <td>2999</td>\n",
  113. " </tr>\n",
  114. " <tr>\n",
  115. " <th>8846</th>\n",
  116. " <td>34.395619</td>\n",
  117. " <td>female</td>\n",
  118. " <td>2999</td>\n",
  119. " </tr>\n",
  120. " </tbody>\n",
  121. "</table>\n",
  122. "<p>10000 rows × 3 columns</p>\n",
  123. "</div>"
  124. ],
  125. "text/plain": [
  126. " age sex hh_id\n",
  127. "241 72.145127 female 1\n",
  128. "4435 23.869853 male 1\n",
  129. "624 54.463637 female 1\n",
  130. "9979 6.070569 female 2\n",
  131. "... ... ... ...\n",
  132. "4546 81.487253 male 2999\n",
  133. "8625 79.135925 male 2999\n",
  134. "266 72.139917 male 2999\n",
  135. "8846 34.395619 female 2999\n",
  136. "\n",
  137. "[10000 rows x 3 columns]"
  138. ]
  139. },
  140. "execution_count": 2,
  141. "metadata": {},
  142. "output_type": "execute_result"
  143. }
  144. ],
  145. "source": [
  146. "# first load data on individuals, including their age, sex, and household ID\n",
  147. "\n",
  148. "# I'll just simulate it for now\n",
  149. "N = 10_000\n",
  150. "\n",
  151. "# set random seed for reproducibility\n",
  152. "np.random.seed(12345)\n",
  153. "\n",
  154. "# simulate data (to be replaced with real data, e.g. from DHS, eventually)\n",
  155. "df = pd.DataFrame(index=range(N))\n",
  156. "df['age'] = np.random.uniform(0, 100, size=N)\n",
  157. "df['sex'] = np.random.choice(['male', 'female'], size=N)\n",
  158. "df['hh_id'] = np.random.choice(range(3_000), size=N)\n",
  159. "df.sort_values('hh_id')"
  160. ]
  161. },
  162. {
  163. "cell_type": "code",
  164. "execution_count": 3,
  165. "metadata": {},
  166. "outputs": [],
  167. "source": [
  168. "# then for a given age_group/sex combo\n",
  169. "age_start = 1\n",
  170. "age_end = 5\n",
  171. "sex = 'male'\n",
  172. "\n",
  173. "# find all the households with such a person\n",
  174. "hh_with = df.query(f'age >= {age_start} and age < {age_end} and sex == \"{sex}\"').hh_id.unique()"
  175. ]
  176. },
  177. {
  178. "cell_type": "code",
  179. "execution_count": 4,
  180. "metadata": {},
  181. "outputs": [
  182. {
  183. "name": "stdout",
  184. "output_type": "stream",
  185. "text": [
  186. "________________________________________________________________________________\n",
  187. "[Memory] Calling vivarium_gbd_access.gbd.get_incidence_prevalence...\n",
  188. "get_incidence_prevalence(entity_id=cid(954), location_id=196, entity_type='cause')\n",
  189. "_______________________________________get_incidence_prevalence - 177.5s, 3.0min\n"
  190. ]
  191. }
  192. ],
  193. "source": [
  194. "# then for each of those households, compute the\n",
  195. "# probability that there is at least one person with active tb\n",
  196. "\n",
  197. "import vivarium_inputs, gbd_mapping\n",
  198. "prev_ltbi = vivarium_inputs.interface.get_measure(\n",
  199. " gbd_mapping.causes.latent_tuberculosis_infection, 'prevalence', 'South Africa')"
  200. ]
  201. },
  202. {
  203. "cell_type": "code",
  204. "execution_count": 5,
  205. "metadata": {},
  206. "outputs": [
  207. {
  208. "name": "stdout",
  209. "output_type": "stream",
  210. "text": [
  211. "________________________________________________________________________________\n",
  212. "[Memory] Calling vivarium_gbd_access.gbd.get_incidence_prevalence...\n",
  213. "get_incidence_prevalence(entity_id=cid(297), location_id=196, entity_type='cause')\n",
  214. "_______________________________________get_incidence_prevalence - 103.1s, 1.7min\n"
  215. ]
  216. }
  217. ],
  218. "source": [
  219. "prev_any_tb = vivarium_inputs.interface.get_measure(\n",
  220. " gbd_mapping.causes.tuberculosis, 'prevalence', 'South Africa')"
  221. ]
  222. },
  223. {
  224. "cell_type": "code",
  225. "execution_count": 6,
  226. "metadata": {},
  227. "outputs": [
  228. {
  229. "data": {
  230. "text/html": [
  231. "<div>\n",
  232. "<style scoped>\n",
  233. " .dataframe tbody tr th:only-of-type {\n",
  234. " vertical-align: middle;\n",
  235. " }\n",
  236. "\n",
  237. " .dataframe tbody tr th {\n",
  238. " vertical-align: top;\n",
  239. " }\n",
  240. "\n",
  241. " .dataframe thead th {\n",
  242. " text-align: right;\n",
  243. " }\n",
  244. "</style>\n",
  245. "<table border=\"1\" class=\"dataframe\">\n",
  246. " <thead>\n",
  247. " <tr style=\"text-align: right;\">\n",
  248. " <th></th>\n",
  249. " <th></th>\n",
  250. " <th></th>\n",
  251. " <th></th>\n",
  252. " <th></th>\n",
  253. " <th></th>\n",
  254. " <th></th>\n",
  255. " <th>value</th>\n",
  256. " </tr>\n",
  257. " <tr>\n",
  258. " <th>draw</th>\n",
  259. " <th>location</th>\n",
  260. " <th>sex</th>\n",
  261. " <th>age_group_start</th>\n",
  262. " <th>age_group_end</th>\n",
  263. " <th>year_start</th>\n",
  264. " <th>year_end</th>\n",
  265. " <th></th>\n",
  266. " </tr>\n",
  267. " </thead>\n",
  268. " <tbody>\n",
  269. " <tr>\n",
  270. " <th rowspan=\"4\" valign=\"top\">0</th>\n",
  271. " <th rowspan=\"4\" valign=\"top\">South Africa</th>\n",
  272. " <th rowspan=\"4\" valign=\"top\">Female</th>\n",
  273. " <th rowspan=\"4\" valign=\"top\">0.0</th>\n",
  274. " <th rowspan=\"4\" valign=\"top\">0.019178</th>\n",
  275. " <th>1990</th>\n",
  276. " <th>1991</th>\n",
  277. " <td>0.000047</td>\n",
  278. " </tr>\n",
  279. " <tr>\n",
  280. " <th>1991</th>\n",
  281. " <th>1992</th>\n",
  282. " <td>0.000038</td>\n",
  283. " </tr>\n",
  284. " <tr>\n",
  285. " <th>1992</th>\n",
  286. " <th>1993</th>\n",
  287. " <td>0.000030</td>\n",
  288. " </tr>\n",
  289. " <tr>\n",
  290. " <th>1993</th>\n",
  291. " <th>1994</th>\n",
  292. " <td>0.000022</td>\n",
  293. " </tr>\n",
  294. " <tr>\n",
  295. " <th>...</th>\n",
  296. " <th>...</th>\n",
  297. " <th>...</th>\n",
  298. " <th>...</th>\n",
  299. " <th>...</th>\n",
  300. " <th>...</th>\n",
  301. " <th>...</th>\n",
  302. " <td>...</td>\n",
  303. " </tr>\n",
  304. " <tr>\n",
  305. " <th rowspan=\"4\" valign=\"top\">999</th>\n",
  306. " <th rowspan=\"4\" valign=\"top\">South Africa</th>\n",
  307. " <th rowspan=\"4\" valign=\"top\">Male</th>\n",
  308. " <th rowspan=\"4\" valign=\"top\">95.0</th>\n",
  309. " <th rowspan=\"4\" valign=\"top\">125.000000</th>\n",
  310. " <th>2014</th>\n",
  311. " <th>2015</th>\n",
  312. " <td>0.009573</td>\n",
  313. " </tr>\n",
  314. " <tr>\n",
  315. " <th>2015</th>\n",
  316. " <th>2016</th>\n",
  317. " <td>0.009185</td>\n",
  318. " </tr>\n",
  319. " <tr>\n",
  320. " <th>2016</th>\n",
  321. " <th>2017</th>\n",
  322. " <td>0.008744</td>\n",
  323. " </tr>\n",
  324. " <tr>\n",
  325. " <th>2017</th>\n",
  326. " <th>2018</th>\n",
  327. " <td>0.008260</td>\n",
  328. " </tr>\n",
  329. " </tbody>\n",
  330. "</table>\n",
  331. "<p>1288000 rows × 1 columns</p>\n",
  332. "</div>"
  333. ],
  334. "text/plain": [
  335. " value\n",
  336. "draw location sex age_group_start age_group_end year_start year_end \n",
  337. "0 South Africa Female 0.0 0.019178 1990 1991 0.000047\n",
  338. " 1991 1992 0.000038\n",
  339. " 1992 1993 0.000030\n",
  340. " 1993 1994 0.000022\n",
  341. "... ...\n",
  342. "999 South Africa Male 95.0 125.000000 2014 2015 0.009573\n",
  343. " 2015 2016 0.009185\n",
  344. " 2016 2017 0.008744\n",
  345. " 2017 2018 0.008260\n",
  346. "\n",
  347. "[1288000 rows x 1 columns]"
  348. ]
  349. },
  350. "execution_count": 6,
  351. "metadata": {},
  352. "output_type": "execute_result"
  353. }
  354. ],
  355. "source": [
  356. "index_cols = ['draw', 'location', 'sex', 'age_group_start', 'age_group_end', 'year_start', 'year_end']\n",
  357. "prev_active_tb = prev_any_tb.set_index(index_cols) - prev_ltbi.set_index(index_cols)\n",
  358. "prev_active_tb"
  359. ]
  360. },
  361. {
  362. "cell_type": "code",
  363. "execution_count": 15,
  364. "metadata": {},
  365. "outputs": [],
  366. "source": [
  367. "# there is probably a way to use vivarium to add a column to df\n",
  368. "# that includes the probability of active tb for each simulant!\n",
  369. "\n",
  370. "# I'll just simulate this for now, too\n",
  371. "df['pr_active_tb'] = np.random.uniform(0, .01, size=N)"
  372. ]
  373. },
  374. {
  375. "cell_type": "code",
  376. "execution_count": 16,
  377. "metadata": {},
  378. "outputs": [
  379. {
  380. "data": {
  381. "text/plain": [
  382. "0.009567952684606862"
  383. ]
  384. },
  385. "execution_count": 16,
  386. "metadata": {},
  387. "output_type": "execute_result"
  388. }
  389. ],
  390. "source": [
  391. "def pr_ac_tb_in_hh(df_hh):\n",
  392. " pr_no_tb = 1 - df_hh.pr_active_tb\n",
  393. " pr_no_tb_in_hh = np.prod(pr_no_tb)\n",
  394. " return 1 - pr_no_tb_in_hh\n",
  395. "pr_ac_tb_in_hh(df[df.hh_id == 1])"
  396. ]
  397. },
  398. {
  399. "cell_type": "code",
  400. "execution_count": 17,
  401. "metadata": {},
  402. "outputs": [
  403. {
  404. "data": {
  405. "text/html": [
  406. "<div>\n",
  407. "<style scoped>\n",
  408. " .dataframe tbody tr th:only-of-type {\n",
  409. " vertical-align: middle;\n",
  410. " }\n",
  411. "\n",
  412. " .dataframe tbody tr th {\n",
  413. " vertical-align: top;\n",
  414. " }\n",
  415. "\n",
  416. " .dataframe thead th {\n",
  417. " text-align: right;\n",
  418. " }\n",
  419. "</style>\n",
  420. "<table border=\"1\" class=\"dataframe\">\n",
  421. " <thead>\n",
  422. " <tr style=\"text-align: right;\">\n",
  423. " <th></th>\n",
  424. " <th>age</th>\n",
  425. " <th>sex</th>\n",
  426. " <th>hh_id</th>\n",
  427. " <th>pr_active_tb</th>\n",
  428. " </tr>\n",
  429. " </thead>\n",
  430. " <tbody>\n",
  431. " <tr>\n",
  432. " <th>241</th>\n",
  433. " <td>72.145127</td>\n",
  434. " <td>female</td>\n",
  435. " <td>1</td>\n",
  436. " <td>0.001116</td>\n",
  437. " </tr>\n",
  438. " <tr>\n",
  439. " <th>624</th>\n",
  440. " <td>54.463637</td>\n",
  441. " <td>female</td>\n",
  442. " <td>1</td>\n",
  443. " <td>0.004184</td>\n",
  444. " </tr>\n",
  445. " <tr>\n",
  446. " <th>4435</th>\n",
  447. " <td>23.869853</td>\n",
  448. " <td>male</td>\n",
  449. " <td>1</td>\n",
  450. " <td>0.004295</td>\n",
  451. " </tr>\n",
  452. " </tbody>\n",
  453. "</table>\n",
  454. "</div>"
  455. ],
  456. "text/plain": [
  457. " age sex hh_id pr_active_tb\n",
  458. "241 72.145127 female 1 0.001116\n",
  459. "624 54.463637 female 1 0.004184\n",
  460. "4435 23.869853 male 1 0.004295"
  461. ]
  462. },
  463. "execution_count": 17,
  464. "metadata": {},
  465. "output_type": "execute_result"
  466. }
  467. ],
  468. "source": [
  469. "df[df.hh_id==1]"
  470. ]
  471. },
  472. {
  473. "cell_type": "code",
  474. "execution_count": 23,
  475. "metadata": {},
  476. "outputs": [
  477. {
  478. "data": {
  479. "text/plain": [
  480. "hh_id\n",
  481. "5 0.003730\n",
  482. "58 0.028011\n",
  483. "68 0.010399\n",
  484. "72 0.011235\n",
  485. " ... \n",
  486. "2939 0.014072\n",
  487. "2953 0.015080\n",
  488. "2958 0.028687\n",
  489. "2963 0.022994\n",
  490. "Length: 175, dtype: float64"
  491. ]
  492. },
  493. "execution_count": 23,
  494. "metadata": {},
  495. "output_type": "execute_result"
  496. }
  497. ],
  498. "source": [
  499. "pr_ac_tb = df[df.hh_id.isin(hh_with)].groupby('hh_id').apply(pr_ac_tb_in_hh)\n",
  500. "pr_ac_tb"
  501. ]
  502. },
  503. {
  504. "cell_type": "code",
  505. "execution_count": 24,
  506. "metadata": {},
  507. "outputs": [
  508. {
  509. "data": {
  510. "text/plain": [
  511. "0.01976982850644556"
  512. ]
  513. },
  514. "execution_count": 24,
  515. "metadata": {},
  516. "output_type": "execute_result"
  517. }
  518. ],
  519. "source": [
  520. "pr_ac_tb_age_sex = pr_ac_tb.mean()\n",
  521. "pr_ac_tb_age_sex"
  522. ]
  523. },
  524. {
  525. "cell_type": "code",
  526. "execution_count": 25,
  527. "metadata": {},
  528. "outputs": [],
  529. "source": [
  530. "# Now repeat that for each age and sex group, and you\n",
  531. "# will get the probability of an active TB case in the household\n",
  532. "# as a function of age and sex for the location and year\n",
  533. "\n",
  534. "age_bins = [0, 1, 5, 10] + list(range(15, 101, 5))\n",
  535. "for i, age_start in enumerate(age_bins[:-1]):\n",
  536. " age_end = age_bins[i+1]\n",
  537. " for sex in ['male', 'female']:\n",
  538. " # find and store pr_ac_tb_age_sex\n",
  539. " pr_ac_tb_age_sex"
  540. ]
  541. },
  542. {
  543. "cell_type": "code",
  544. "execution_count": 26,
  545. "metadata": {},
  546. "outputs": [],
  547. "source": [
  548. "# and do that for each draw to propagate through the uncertainty\n",
  549. "# probably good to do a bootstrap resampling of the person data, too"
  550. ]
  551. },
  552. {
  553. "cell_type": "code",
  554. "execution_count": null,
  555. "metadata": {},
  556. "outputs": [],
  557. "source": []
  558. }
  559. ],
  560. "metadata": {
  561. "kernelspec": {
  562. "display_name": "vivarium_conic_sqlns",
  563. "language": "python",
  564. "name": "vivarium_conic_sqlns"
  565. },
  566. "language_info": {
  567. "codemirror_mode": {
  568. "name": "ipython",
  569. "version": 3
  570. },
  571. "file_extension": ".py",
  572. "mimetype": "text/x-python",
  573. "name": "python",
  574. "nbconvert_exporter": "python",
  575. "pygments_lexer": "ipython3",
  576. "version": "3.6.8"
  577. }
  578. },
  579. "nbformat": 4,
  580. "nbformat_minor": 2
  581. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement