Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# cases \n",
      "\n",
      "wood\n",
      "(-0.027365794640202726, 0.6329200219533746)\n",
      "metal\n",
      "(0.02705067814646989, 0.6368417634645179)\n",
      "\n",
      "# controls\n",
      "wood\n",
      "(0.09576879095559523, 0.09393193478421226)\n",
      "metal\n",
      "(-0.039643310289723156, 0.4889087597244508)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "import statsmodels.formula.api as smf\n",
    "stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n",
    "\n",
    "df = pd.read_csv('genotyped_subset_audrey.csv') # read Audrey data\n",
    "\n",
    "genotype_transforms = {\"(G;G)\":0, \"(G;T)\":1, \"(T;T)\":2}\n",
    "df.genotype = df.genotype.map(genotype_transforms) # change genotype to 0,1,2 (assumes additive model)\n",
    "\n",
    "def correlations(df):\n",
    "    \"\"\"\n",
    "    check for correlations between genotype and wood or metal exposure\n",
    "    using scipy stats.pearsonr and print the result (coefficient, p-value)\n",
    "    \"\"\"\n",
    "    print('wood')\n",
    "    print(stats.pearsonr(df['genotype'], df['exposed_wood']))\n",
    "    print('metal')\n",
    "    print(stats.pearsonr(df['genotype'], df['exposed_metal']))\n",
    "\n",
    "print(\"# cases \\n\")\n",
    "correlations(df[df['case'] == 1])\n",
    "\n",
    "print(\"\\n# controls\")\n",
    "correlations(df[df['case'] == 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.690774\n",
      "         Iterations 4\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  614\n",
      "Model:                          Logit   Df Residuals:                      612\n",
      "Method:                           MLE   Df Model:                            1\n",
      "Date:                Thu, 08 Aug 2019   Pseudo R-squ.:                0.003424\n",
      "Time:                        11:25:46   Log-Likelihood:                -424.14\n",
      "converged:                       True   LL-Null:                       -425.59\n",
      "                                        LLR p-value:                   0.08781\n",
      "=================================================================================\n",
      "                    coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------\n",
      "Intercept        -0.0574      0.087     -0.656      0.512      -0.229       0.114\n",
      "exposed_metal     0.3901      0.230      1.697      0.090      -0.060       0.841\n",
      "=================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                   2.5%     97.5%        OR\n",
      "Intercept      0.795445  1.120864  0.944238\n",
      "exposed_metal  0.941345  2.317783  1.477103\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ exposed_metal', data=df)              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.692971\n",
      "         Iterations 3\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  614\n",
      "Model:                          Logit   Df Residuals:                      612\n",
      "Method:                           MLE   Df Model:                            1\n",
      "Date:                Thu, 08 Aug 2019   Pseudo R-squ.:               0.0002537\n",
      "Time:                        11:26:27   Log-Likelihood:                -425.48\n",
      "converged:                       True   LL-Null:                       -425.59\n",
      "                                        LLR p-value:                    0.6421\n",
      "================================================================================\n",
      "                   coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------\n",
      "Intercept        0.0105      0.084      0.126      0.900      -0.154       0.175\n",
      "exposed_wood    -0.1441      0.310     -0.464      0.642      -0.752       0.464\n",
      "================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                  2.5%     97.5%        OR\n",
      "Intercept     0.857453  1.191102  1.010601\n",
      "exposed_wood  0.471259  1.590732  0.865822\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ exposed_wood', data=df)              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.625035\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  614\n",
      "Model:                          Logit   Df Residuals:                      612\n",
      "Method:                           MLE   Df Model:                            1\n",
      "Date:                Thu, 08 Aug 2019   Pseudo R-squ.:                 0.09826\n",
      "Time:                        11:26:46   Log-Likelihood:                -383.77\n",
      "converged:                       True   LL-Null:                       -425.59\n",
      "                                        LLR p-value:                 5.932e-20\n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Intercept     -0.6188      0.110     -5.631      0.000      -0.834      -0.403\n",
      "genotype       1.3742      0.162      8.469      0.000       1.056       1.692\n",
      "==============================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "               2.5%     97.5%        OR\n",
      "Intercept  0.434221  0.668037  0.538587\n",
      "genotype   2.875396  5.431683  3.951992\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ genotype', data=df)              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.681147\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  614\n",
      "Model:                          Logit   Df Residuals:                      610\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Thu, 08 Aug 2019   Pseudo R-squ.:                 0.01731\n",
      "Time:                        11:27:46   Log-Likelihood:                -418.22\n",
      "converged:                       True   LL-Null:                       -425.59\n",
      "                                        LLR p-value:                  0.002057\n",
      "=================================================================================\n",
      "                    coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------\n",
      "Intercept        -1.8501      0.776     -2.383      0.017      -3.372      -0.328\n",
      "exposed_metal     0.4286      0.232      1.849      0.065      -0.026       0.883\n",
      "age               0.0217      0.010      2.116      0.034       0.002       0.042\n",
      "packyrs           0.0100      0.004      2.772      0.006       0.003       0.017\n",
      "=================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                   2.5%     97.5%        OR\n",
      "Intercept      0.034321  0.720217  0.157221\n",
      "exposed_metal  0.974511  2.418135  1.535090\n",
      "age            1.001600  1.042627  1.021908\n",
      "packyrs        1.002933  1.017220  1.010051\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ exposed_metal + age + packyrs', data=df)              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.614869\n",
      "         Iterations 6\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  614\n",
      "Model:                          Logit   Df Residuals:                      608\n",
      "Method:                           MLE   Df Model:                            5\n",
      "Date:                Thu, 08 Aug 2019   Pseudo R-squ.:                  0.1129\n",
      "Time:                        11:28:03   Log-Likelihood:                -377.53\n",
      "converged:                       True   LL-Null:                       -425.59\n",
      "                                        LLR p-value:                 3.461e-19\n",
      "==========================================================================================\n",
      "                             coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------------------\n",
      "Intercept                 -2.3354      0.842     -2.773      0.006      -3.986      -0.684\n",
      "exposed_metal              0.3302      0.309      1.067      0.286      -0.276       0.936\n",
      "genotype                   1.3246      0.175      7.559      0.000       0.981       1.668\n",
      "exposed_metal:genotype     0.3011      0.494      0.610      0.542      -0.667       1.269\n",
      "age                        0.0202      0.011      1.831      0.067      -0.001       0.042\n",
      "packyrs                    0.0095      0.004      2.523      0.012       0.002       0.017\n",
      "==========================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                            2.5%     97.5%        OR\n",
      "Intercept               0.018568  0.504364  0.096773\n",
      "exposed_metal           0.758763  2.550909  1.391235\n",
      "genotype                2.667556  5.301737  3.760676\n",
      "exposed_metal:genotype  0.513502  3.556446  1.351385\n",
      "age                     0.998572  1.042785  1.020439\n",
      "packyrs                 1.002120  1.017013  1.009539\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ exposed_metal*genotype + age + packyrs', data=df)              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.615811\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  614\n",
      "Model:                          Logit   Df Residuals:                      608\n",
      "Method:                           MLE   Df Model:                            5\n",
      "Date:                Thu, 08 Aug 2019   Pseudo R-squ.:                  0.1116\n",
      "Time:                        11:28:29   Log-Likelihood:                -378.11\n",
      "converged:                       True   LL-Null:                       -425.59\n",
      "                                        LLR p-value:                 6.061e-19\n",
      "=========================================================================================\n",
      "                            coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-----------------------------------------------------------------------------------------\n",
      "Intercept                -2.2892      0.838     -2.731      0.006      -3.932      -0.646\n",
      "exposed_wood              0.2269      0.442      0.513      0.608      -0.640       1.094\n",
      "genotype                  1.4316      0.171      8.355      0.000       1.096       1.767\n",
      "exposed_wood:genotype    -0.8695      0.601     -1.447      0.148      -2.047       0.308\n",
      "age                       0.0201      0.011      1.823      0.068      -0.002       0.042\n",
      "packyrs                   0.0095      0.004      2.516      0.012       0.002       0.017\n",
      "=========================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                           2.5%     97.5%        OR\n",
      "Intercept              0.019603  0.523984  0.101349\n",
      "exposed_wood           0.527371  2.984926  1.254656\n",
      "genotype               2.991419  5.855585  4.185273\n",
      "exposed_wood:genotype  0.129108  1.360924  0.419173\n",
      "age                    0.998487  1.042558  1.020285\n",
      "packyrs                1.002109  1.017109  1.009581\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ exposed_wood*genotype + age + packyrs', data=df)              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}