Advertisement
Guest User

Untitled

a guest
Aug 17th, 2019
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 17.46 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 6,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stdout",
  10. "output_type": "stream",
  11. "text": [
  12. "# cases \n",
  13. "\n",
  14. "wood\n",
  15. "(-0.027365794640202726, 0.6329200219533746)\n",
  16. "metal\n",
  17. "(0.02705067814646989, 0.6368417634645179)\n",
  18. "\n",
  19. "# controls\n",
  20. "wood\n",
  21. "(0.09576879095559523, 0.09393193478421226)\n",
  22. "metal\n",
  23. "(-0.039643310289723156, 0.4889087597244508)\n"
  24. ]
  25. }
  26. ],
  27. "source": [
  28. "import pandas as pd\n",
  29. "import numpy as np\n",
  30. "import scipy.stats as stats\n",
  31. "import statsmodels.formula.api as smf\n",
  32. "stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n",
  33. "\n",
  34. "df = pd.read_csv('genotyped_subset_audrey.csv') # read Audrey data\n",
  35. "\n",
  36. "genotype_transforms = {\"(G;G)\":0, \"(G;T)\":1, \"(T;T)\":2}\n",
  37. "df.genotype = df.genotype.map(genotype_transforms) # change genotype to 0,1,2 (assumes additive model)\n",
  38. "\n",
  39. "def correlations(df):\n",
  40. " \"\"\"\n",
  41. " check for correlations between genotype and wood or metal exposure\n",
  42. " using scipy stats.pearsonr and print the result (coefficient, p-value)\n",
  43. " \"\"\"\n",
  44. " print('wood')\n",
  45. " print(stats.pearsonr(df['genotype'], df['exposed_wood']))\n",
  46. " print('metal')\n",
  47. " print(stats.pearsonr(df['genotype'], df['exposed_metal']))\n",
  48. "\n",
  49. "print(\"# cases \\n\")\n",
  50. "correlations(df[df['case'] == 1])\n",
  51. "\n",
  52. "print(\"\\n# controls\")\n",
  53. "correlations(df[df['case'] == 0])"
  54. ]
  55. },
  56. {
  57. "cell_type": "code",
  58. "execution_count": 7,
  59. "metadata": {},
  60. "outputs": [
  61. {
  62. "name": "stdout",
  63. "output_type": "stream",
  64. "text": [
  65. "Optimization terminated successfully.\n",
  66. " Current function value: 0.690774\n",
  67. " Iterations 4\n",
  68. " Logit Regression Results \n",
  69. "==============================================================================\n",
  70. "Dep. Variable: case No. Observations: 614\n",
  71. "Model: Logit Df Residuals: 612\n",
  72. "Method: MLE Df Model: 1\n",
  73. "Date: Thu, 08 Aug 2019 Pseudo R-squ.: 0.003424\n",
  74. "Time: 11:25:46 Log-Likelihood: -424.14\n",
  75. "converged: True LL-Null: -425.59\n",
  76. " LLR p-value: 0.08781\n",
  77. "=================================================================================\n",
  78. " coef std err z P>|z| [0.025 0.975]\n",
  79. "---------------------------------------------------------------------------------\n",
  80. "Intercept -0.0574 0.087 -0.656 0.512 -0.229 0.114\n",
  81. "exposed_metal 0.3901 0.230 1.697 0.090 -0.060 0.841\n",
  82. "=================================================================================\n",
  83. "Odds Ratios\n",
  84. "======================\n",
  85. " 2.5% 97.5% OR\n",
  86. "Intercept 0.795445 1.120864 0.944238\n",
  87. "exposed_metal 0.941345 2.317783 1.477103\n"
  88. ]
  89. }
  90. ],
  91. "source": [
  92. "model = smf.logit('case ~ exposed_metal', data=df) \n",
  93. "result = model.fit()\n",
  94. "print(result.summary())\n",
  95. "print (\"Odds Ratios\")\n",
  96. "print (\"======================\")\n",
  97. "params = result.params\n",
  98. "conf = result.conf_int()\n",
  99. "conf['OR'] = params\n",
  100. "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  101. "print(np.exp(conf))"
  102. ]
  103. },
  104. {
  105. "cell_type": "code",
  106. "execution_count": 8,
  107. "metadata": {},
  108. "outputs": [
  109. {
  110. "name": "stdout",
  111. "output_type": "stream",
  112. "text": [
  113. "Optimization terminated successfully.\n",
  114. " Current function value: 0.692971\n",
  115. " Iterations 3\n",
  116. " Logit Regression Results \n",
  117. "==============================================================================\n",
  118. "Dep. Variable: case No. Observations: 614\n",
  119. "Model: Logit Df Residuals: 612\n",
  120. "Method: MLE Df Model: 1\n",
  121. "Date: Thu, 08 Aug 2019 Pseudo R-squ.: 0.0002537\n",
  122. "Time: 11:26:27 Log-Likelihood: -425.48\n",
  123. "converged: True LL-Null: -425.59\n",
  124. " LLR p-value: 0.6421\n",
  125. "================================================================================\n",
  126. " coef std err z P>|z| [0.025 0.975]\n",
  127. "--------------------------------------------------------------------------------\n",
  128. "Intercept 0.0105 0.084 0.126 0.900 -0.154 0.175\n",
  129. "exposed_wood -0.1441 0.310 -0.464 0.642 -0.752 0.464\n",
  130. "================================================================================\n",
  131. "Odds Ratios\n",
  132. "======================\n",
  133. " 2.5% 97.5% OR\n",
  134. "Intercept 0.857453 1.191102 1.010601\n",
  135. "exposed_wood 0.471259 1.590732 0.865822\n"
  136. ]
  137. }
  138. ],
  139. "source": [
  140. "model = smf.logit('case ~ exposed_wood', data=df) \n",
  141. "result = model.fit()\n",
  142. "print(result.summary())\n",
  143. "print (\"Odds Ratios\")\n",
  144. "print (\"======================\")\n",
  145. "params = result.params\n",
  146. "conf = result.conf_int()\n",
  147. "conf['OR'] = params\n",
  148. "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  149. "print(np.exp(conf))"
  150. ]
  151. },
  152. {
  153. "cell_type": "code",
  154. "execution_count": 9,
  155. "metadata": {},
  156. "outputs": [
  157. {
  158. "name": "stdout",
  159. "output_type": "stream",
  160. "text": [
  161. "Optimization terminated successfully.\n",
  162. " Current function value: 0.625035\n",
  163. " Iterations 5\n",
  164. " Logit Regression Results \n",
  165. "==============================================================================\n",
  166. "Dep. Variable: case No. Observations: 614\n",
  167. "Model: Logit Df Residuals: 612\n",
  168. "Method: MLE Df Model: 1\n",
  169. "Date: Thu, 08 Aug 2019 Pseudo R-squ.: 0.09826\n",
  170. "Time: 11:26:46 Log-Likelihood: -383.77\n",
  171. "converged: True LL-Null: -425.59\n",
  172. " LLR p-value: 5.932e-20\n",
  173. "==============================================================================\n",
  174. " coef std err z P>|z| [0.025 0.975]\n",
  175. "------------------------------------------------------------------------------\n",
  176. "Intercept -0.6188 0.110 -5.631 0.000 -0.834 -0.403\n",
  177. "genotype 1.3742 0.162 8.469 0.000 1.056 1.692\n",
  178. "==============================================================================\n",
  179. "Odds Ratios\n",
  180. "======================\n",
  181. " 2.5% 97.5% OR\n",
  182. "Intercept 0.434221 0.668037 0.538587\n",
  183. "genotype 2.875396 5.431683 3.951992\n"
  184. ]
  185. }
  186. ],
  187. "source": [
  188. "model = smf.logit('case ~ genotype', data=df) \n",
  189. "result = model.fit()\n",
  190. "print(result.summary())\n",
  191. "print (\"Odds Ratios\")\n",
  192. "print (\"======================\")\n",
  193. "params = result.params\n",
  194. "conf = result.conf_int()\n",
  195. "conf['OR'] = params\n",
  196. "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  197. "print(np.exp(conf))"
  198. ]
  199. },
  200. {
  201. "cell_type": "code",
  202. "execution_count": 11,
  203. "metadata": {},
  204. "outputs": [
  205. {
  206. "name": "stdout",
  207. "output_type": "stream",
  208. "text": [
  209. "Optimization terminated successfully.\n",
  210. " Current function value: 0.681147\n",
  211. " Iterations 5\n",
  212. " Logit Regression Results \n",
  213. "==============================================================================\n",
  214. "Dep. Variable: case No. Observations: 614\n",
  215. "Model: Logit Df Residuals: 610\n",
  216. "Method: MLE Df Model: 3\n",
  217. "Date: Thu, 08 Aug 2019 Pseudo R-squ.: 0.01731\n",
  218. "Time: 11:27:46 Log-Likelihood: -418.22\n",
  219. "converged: True LL-Null: -425.59\n",
  220. " LLR p-value: 0.002057\n",
  221. "=================================================================================\n",
  222. " coef std err z P>|z| [0.025 0.975]\n",
  223. "---------------------------------------------------------------------------------\n",
  224. "Intercept -1.8501 0.776 -2.383 0.017 -3.372 -0.328\n",
  225. "exposed_metal 0.4286 0.232 1.849 0.065 -0.026 0.883\n",
  226. "age 0.0217 0.010 2.116 0.034 0.002 0.042\n",
  227. "packyrs 0.0100 0.004 2.772 0.006 0.003 0.017\n",
  228. "=================================================================================\n",
  229. "Odds Ratios\n",
  230. "======================\n",
  231. " 2.5% 97.5% OR\n",
  232. "Intercept 0.034321 0.720217 0.157221\n",
  233. "exposed_metal 0.974511 2.418135 1.535090\n",
  234. "age 1.001600 1.042627 1.021908\n",
  235. "packyrs 1.002933 1.017220 1.010051\n"
  236. ]
  237. }
  238. ],
  239. "source": [
  240. "model = smf.logit('case ~ exposed_metal + age + packyrs', data=df) \n",
  241. "result = model.fit()\n",
  242. "print(result.summary())\n",
  243. "print (\"Odds Ratios\")\n",
  244. "print (\"======================\")\n",
  245. "params = result.params\n",
  246. "conf = result.conf_int()\n",
  247. "conf['OR'] = params\n",
  248. "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  249. "print(np.exp(conf))"
  250. ]
  251. },
  252. {
  253. "cell_type": "code",
  254. "execution_count": 12,
  255. "metadata": {},
  256. "outputs": [
  257. {
  258. "name": "stdout",
  259. "output_type": "stream",
  260. "text": [
  261. "Optimization terminated successfully.\n",
  262. " Current function value: 0.614869\n",
  263. " Iterations 6\n",
  264. " Logit Regression Results \n",
  265. "==============================================================================\n",
  266. "Dep. Variable: case No. Observations: 614\n",
  267. "Model: Logit Df Residuals: 608\n",
  268. "Method: MLE Df Model: 5\n",
  269. "Date: Thu, 08 Aug 2019 Pseudo R-squ.: 0.1129\n",
  270. "Time: 11:28:03 Log-Likelihood: -377.53\n",
  271. "converged: True LL-Null: -425.59\n",
  272. " LLR p-value: 3.461e-19\n",
  273. "==========================================================================================\n",
  274. " coef std err z P>|z| [0.025 0.975]\n",
  275. "------------------------------------------------------------------------------------------\n",
  276. "Intercept -2.3354 0.842 -2.773 0.006 -3.986 -0.684\n",
  277. "exposed_metal 0.3302 0.309 1.067 0.286 -0.276 0.936\n",
  278. "genotype 1.3246 0.175 7.559 0.000 0.981 1.668\n",
  279. "exposed_metal:genotype 0.3011 0.494 0.610 0.542 -0.667 1.269\n",
  280. "age 0.0202 0.011 1.831 0.067 -0.001 0.042\n",
  281. "packyrs 0.0095 0.004 2.523 0.012 0.002 0.017\n",
  282. "==========================================================================================\n",
  283. "Odds Ratios\n",
  284. "======================\n",
  285. " 2.5% 97.5% OR\n",
  286. "Intercept 0.018568 0.504364 0.096773\n",
  287. "exposed_metal 0.758763 2.550909 1.391235\n",
  288. "genotype 2.667556 5.301737 3.760676\n",
  289. "exposed_metal:genotype 0.513502 3.556446 1.351385\n",
  290. "age 0.998572 1.042785 1.020439\n",
  291. "packyrs 1.002120 1.017013 1.009539\n"
  292. ]
  293. }
  294. ],
  295. "source": [
  296. "model = smf.logit('case ~ exposed_metal*genotype + age + packyrs', data=df) \n",
  297. "result = model.fit()\n",
  298. "print(result.summary())\n",
  299. "print (\"Odds Ratios\")\n",
  300. "print (\"======================\")\n",
  301. "params = result.params\n",
  302. "conf = result.conf_int()\n",
  303. "conf['OR'] = params\n",
  304. "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  305. "print(np.exp(conf))"
  306. ]
  307. },
  308. {
  309. "cell_type": "code",
  310. "execution_count": 13,
  311. "metadata": {},
  312. "outputs": [
  313. {
  314. "name": "stdout",
  315. "output_type": "stream",
  316. "text": [
  317. "Optimization terminated successfully.\n",
  318. " Current function value: 0.615811\n",
  319. " Iterations 5\n",
  320. " Logit Regression Results \n",
  321. "==============================================================================\n",
  322. "Dep. Variable: case No. Observations: 614\n",
  323. "Model: Logit Df Residuals: 608\n",
  324. "Method: MLE Df Model: 5\n",
  325. "Date: Thu, 08 Aug 2019 Pseudo R-squ.: 0.1116\n",
  326. "Time: 11:28:29 Log-Likelihood: -378.11\n",
  327. "converged: True LL-Null: -425.59\n",
  328. " LLR p-value: 6.061e-19\n",
  329. "=========================================================================================\n",
  330. " coef std err z P>|z| [0.025 0.975]\n",
  331. "-----------------------------------------------------------------------------------------\n",
  332. "Intercept -2.2892 0.838 -2.731 0.006 -3.932 -0.646\n",
  333. "exposed_wood 0.2269 0.442 0.513 0.608 -0.640 1.094\n",
  334. "genotype 1.4316 0.171 8.355 0.000 1.096 1.767\n",
  335. "exposed_wood:genotype -0.8695 0.601 -1.447 0.148 -2.047 0.308\n",
  336. "age 0.0201 0.011 1.823 0.068 -0.002 0.042\n",
  337. "packyrs 0.0095 0.004 2.516 0.012 0.002 0.017\n",
  338. "=========================================================================================\n",
  339. "Odds Ratios\n",
  340. "======================\n",
  341. " 2.5% 97.5% OR\n",
  342. "Intercept 0.019603 0.523984 0.101349\n",
  343. "exposed_wood 0.527371 2.984926 1.254656\n",
  344. "genotype 2.991419 5.855585 4.185273\n",
  345. "exposed_wood:genotype 0.129108 1.360924 0.419173\n",
  346. "age 0.998487 1.042558 1.020285\n",
  347. "packyrs 1.002109 1.017109 1.009581\n"
  348. ]
  349. }
  350. ],
  351. "source": [
  352. "model = smf.logit('case ~ exposed_wood*genotype + age + packyrs', data=df) \n",
  353. "result = model.fit()\n",
  354. "print(result.summary())\n",
  355. "print (\"Odds Ratios\")\n",
  356. "print (\"======================\")\n",
  357. "params = result.params\n",
  358. "conf = result.conf_int()\n",
  359. "conf['OR'] = params\n",
  360. "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  361. "print(np.exp(conf))"
  362. ]
  363. },
  364. {
  365. "cell_type": "code",
  366. "execution_count": null,
  367. "metadata": {},
  368. "outputs": [],
  369. "source": []
  370. }
  371. ],
  372. "metadata": {
  373. "kernelspec": {
  374. "display_name": "Python 3",
  375. "language": "python",
  376. "name": "python3"
  377. },
  378. "language_info": {
  379. "codemirror_mode": {
  380. "name": "ipython",
  381. "version": 3
  382. },
  383. "file_extension": ".py",
  384. "mimetype": "text/x-python",
  385. "name": "python",
  386. "nbconvert_exporter": "python",
  387. "pygments_lexer": "ipython3",
  388. "version": "3.5.4"
  389. }
  390. },
  391. "nbformat": 4,
  392. "nbformat_minor": 2
  393. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement