daily pastebin goal
11%
SHARE
TWEET

Untitled

a guest Jul 12th, 2018 64 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. {
  2.  "cells": [
  3.   {
  4.    "cell_type": "code",
  5.    "execution_count": 102,
  6.    "metadata": {},
  7.    "outputs": [],
  8.    "source": [
  9.     "# to add t-tests\n",
  10.     "# to add categories of exposure and for centre"
  11.    ]
  12.   },
  13.   {
  14.    "cell_type": "code",
  15.    "execution_count": 1,
  16.    "metadata": {},
  17.    "outputs": [
  18.     {
  19.      "name": "stderr",
  20.      "output_type": "stream",
  21.      "text": [
  22.       "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n",
  23.       "  from pandas.core import datetools\n"
  24.      ]
  25.     }
  26.    ],
  27.    "source": [
  28.     "import pandas as pd\n",
  29.     "import statsmodels.api as sm\n",
  30.     "import seaborn as sns; sns.set(color_codes=True)"
  31.    ]
  32.   },
  33.   {
  34.    "cell_type": "code",
  35.    "execution_count": 78,
  36.    "metadata": {},
  37.    "outputs": [],
  38.    "source": [
  39.     "df = pd.read_csv('for_cosetta.csv')"
  40.    ]
  41.   },
  42.   {
  43.    "cell_type": "code",
  44.    "execution_count": 79,
  45.    "metadata": {},
  46.    "outputs": [
  47.     {
  48.      "data": {
  49.       "text/html": [
  50.        "<div>\n",
  51.        "<style scoped>\n",
  52.        "    .dataframe tbody tr th:only-of-type {\n",
  53.        "        vertical-align: middle;\n",
  54.        "    }\n",
  55.        "\n",
  56.        "    .dataframe tbody tr th {\n",
  57.        "        vertical-align: top;\n",
  58.        "    }\n",
  59.        "\n",
  60.        "    .dataframe thead th {\n",
  61.        "        text-align: right;\n",
  62.        "    }\n",
  63.        "</style>\n",
  64.        "<table border=\"1\" class=\"dataframe\">\n",
  65.        "  <thead>\n",
  66.        "    <tr style=\"text-align: right;\">\n",
  67.        "      <th></th>\n",
  68.        "      <th>participant_id</th>\n",
  69.        "      <th>soc_job_ft</th>\n",
  70.        "      <th>job_tasks</th>\n",
  71.        "      <th>start_year</th>\n",
  72.        "      <th>end_year</th>\n",
  73.        "      <th>pt</th>\n",
  74.        "      <th>soc90</th>\n",
  75.        "      <th>soc2000</th>\n",
  76.        "      <th>ssec</th>\n",
  77.        "      <th>years</th>\n",
  78.        "      <th>...</th>\n",
  79.        "      <th>es</th>\n",
  80.        "      <th>cs</th>\n",
  81.        "      <th>centre</th>\n",
  82.        "      <th>gp_coords</th>\n",
  83.        "      <th>centre_coords</th>\n",
  84.        "      <th>distfromcentre</th>\n",
  85.        "      <th>exposed</th>\n",
  86.        "      <th>duration</th>\n",
  87.        "      <th>risk</th>\n",
  88.        "      <th>jobcat</th>\n",
  89.        "    </tr>\n",
  90.        "  </thead>\n",
  91.        "  <tbody>\n",
  92.        "    <tr>\n",
  93.        "      <th>0</th>\n",
  94.        "      <td>80024</td>\n",
  95.        "      <td>Assistant, laboratory</td>\n",
  96.        "      <td>tiles\\nglaziers\\n\\njust colouring \\n\\ntiles fo...</td>\n",
  97.        "      <td>1962</td>\n",
  98.        "      <td>1982</td>\n",
  99.        "      <td>case</td>\n",
  100.        "      <td>864</td>\n",
  101.        "      <td>8138</td>\n",
  102.        "      <td>3.0</td>\n",
  103.        "      <td>72</td>\n",
  104.        "      <td>...</td>\n",
  105.        "      <td>Yes</td>\n",
  106.        "      <td>No</td>\n",
  107.        "      <td>8</td>\n",
  108.        "      <td>NaN</td>\n",
  109.        "      <td>(53.46633849590757, -2.9311029200152485)</td>\n",
  110.        "      <td>5932.734447</td>\n",
  111.        "      <td>False</td>\n",
  112.        "      <td>20</td>\n",
  113.        "      <td>0</td>\n",
  114.        "      <td>3.0</td>\n",
  115.        "    </tr>\n",
  116.        "    <tr>\n",
  117.        "      <th>1</th>\n",
  118.        "      <td>80024</td>\n",
  119.        "      <td>Joiner</td>\n",
  120.        "      <td>1st and 2nd fix\\nincluding fitting kitchems</td>\n",
  121.        "      <td>1982</td>\n",
  122.        "      <td>1992</td>\n",
  123.        "      <td>case</td>\n",
  124.        "      <td>859</td>\n",
  125.        "      <td>8139</td>\n",
  126.        "      <td>7.0</td>\n",
  127.        "      <td>72</td>\n",
  128.        "      <td>...</td>\n",
  129.        "      <td>Yes</td>\n",
  130.        "      <td>No</td>\n",
  131.        "      <td>8</td>\n",
  132.        "      <td>NaN</td>\n",
  133.        "      <td>(53.46633849590757, -2.9311029200152485)</td>\n",
  134.        "      <td>5932.734447</td>\n",
  135.        "      <td>False</td>\n",
  136.        "      <td>10</td>\n",
  137.        "      <td>0</td>\n",
  138.        "      <td>3.0</td>\n",
  139.        "    </tr>\n",
  140.        "    <tr>\n",
  141.        "      <th>2</th>\n",
  142.        "      <td>100009</td>\n",
  143.        "      <td>Soldier</td>\n",
  144.        "      <td>Infantry Soldier in the army: trained in comba...</td>\n",
  145.        "      <td>1955</td>\n",
  146.        "      <td>1958</td>\n",
  147.        "      <td>case</td>\n",
  148.        "      <td>600</td>\n",
  149.        "      <td>3311</td>\n",
  150.        "      <td>3.0</td>\n",
  151.        "      <td>81</td>\n",
  152.        "      <td>...</td>\n",
  153.        "      <td>No</td>\n",
  154.        "      <td>NaN</td>\n",
  155.        "      <td>10</td>\n",
  156.        "      <td>(51.410693661901455, -0.161964476739319)</td>\n",
  157.        "      <td>(51.516971016915214, -0.173544641038758)</td>\n",
  158.        "      <td>11.851470</td>\n",
  159.        "      <td>False</td>\n",
  160.        "      <td>3</td>\n",
  161.        "      <td>0</td>\n",
  162.        "      <td>4.0</td>\n",
  163.        "    </tr>\n",
  164.        "    <tr>\n",
  165.        "      <th>3</th>\n",
  166.        "      <td>80003</td>\n",
  167.        "      <td>Assistant, catering</td>\n",
  168.        "      <td>working in the kitchen, restaurants etc. on th...</td>\n",
  169.        "      <td>1970</td>\n",
  170.        "      <td>1972</td>\n",
  171.        "      <td>case</td>\n",
  172.        "      <td>953</td>\n",
  173.        "      <td>9223</td>\n",
  174.        "      <td>6.0</td>\n",
  175.        "      <td>63</td>\n",
  176.        "      <td>...</td>\n",
  177.        "      <td>Yes</td>\n",
  178.        "      <td>No</td>\n",
  179.        "      <td>8</td>\n",
  180.        "      <td>NaN</td>\n",
  181.        "      <td>(53.46633849590757, -2.9311029200152485)</td>\n",
  182.        "      <td>5932.734447</td>\n",
  183.        "      <td>False</td>\n",
  184.        "      <td>2</td>\n",
  185.        "      <td>0</td>\n",
  186.        "      <td>1.0</td>\n",
  187.        "    </tr>\n",
  188.        "    <tr>\n",
  189.        "      <th>4</th>\n",
  190.        "      <td>100009</td>\n",
  191.        "      <td>Hand, bacon</td>\n",
  192.        "      <td>Bacon Hand - working in the supermarket; cutti...</td>\n",
  193.        "      <td>1958</td>\n",
  194.        "      <td>1972</td>\n",
  195.        "      <td>case</td>\n",
  196.        "      <td>720</td>\n",
  197.        "      <td>7111</td>\n",
  198.        "      <td>6.0</td>\n",
  199.        "      <td>81</td>\n",
  200.        "      <td>...</td>\n",
  201.        "      <td>No</td>\n",
  202.        "      <td>NaN</td>\n",
  203.        "      <td>10</td>\n",
  204.        "      <td>(51.410693661901455, -0.161964476739319)</td>\n",
  205.        "      <td>(51.516971016915214, -0.173544641038758)</td>\n",
  206.        "      <td>11.851470</td>\n",
  207.        "      <td>False</td>\n",
  208.        "      <td>14</td>\n",
  209.        "      <td>0</td>\n",
  210.        "      <td>5.0</td>\n",
  211.        "    </tr>\n",
  212.        "  </tbody>\n",
  213.        "</table>\n",
  214.        "<p>5 rows × 22 columns</p>\n",
  215.        "</div>"
  216.       ],
  217.       "text/plain": [
  218.        "   participant_id             soc_job_ft  \\\n",
  219.        "0           80024  Assistant, laboratory   \n",
  220.        "1           80024                 Joiner   \n",
  221.        "2          100009                Soldier   \n",
  222.        "3           80003    Assistant, catering   \n",
  223.        "4          100009            Hand, bacon   \n",
  224.        "\n",
  225.        "                                           job_tasks  start_year  end_year  \\\n",
  226.        "0  tiles\\nglaziers\\n\\njust colouring \\n\\ntiles fo...        1962      1982   \n",
  227.        "1        1st and 2nd fix\\nincluding fitting kitchems        1982      1992   \n",
  228.        "2  Infantry Soldier in the army: trained in comba...        1955      1958   \n",
  229.        "3  working in the kitchen, restaurants etc. on th...        1970      1972   \n",
  230.        "4  Bacon Hand - working in the supermarket; cutti...        1958      1972   \n",
  231.        "\n",
  232.        "     pt  soc90  soc2000  ssec  years   ...     es   cs centre  \\\n",
  233.        "0  case    864     8138   3.0     72   ...    Yes   No      8   \n",
  234.        "1  case    859     8139   7.0     72   ...    Yes   No      8   \n",
  235.        "2  case    600     3311   3.0     81   ...     No  NaN     10   \n",
  236.        "3  case    953     9223   6.0     63   ...    Yes   No      8   \n",
  237.        "4  case    720     7111   6.0     81   ...     No  NaN     10   \n",
  238.        "\n",
  239.        "                                  gp_coords  \\\n",
  240.        "0                                       NaN   \n",
  241.        "1                                       NaN   \n",
  242.        "2  (51.410693661901455, -0.161964476739319)   \n",
  243.        "3                                       NaN   \n",
  244.        "4  (51.410693661901455, -0.161964476739319)   \n",
  245.        "\n",
  246.        "                              centre_coords distfromcentre exposed  duration  \\\n",
  247.        "0  (53.46633849590757, -2.9311029200152485)    5932.734447   False        20   \n",
  248.        "1  (53.46633849590757, -2.9311029200152485)    5932.734447   False        10   \n",
  249.        "2  (51.516971016915214, -0.173544641038758)      11.851470   False         3   \n",
  250.        "3  (53.46633849590757, -2.9311029200152485)    5932.734447   False         2   \n",
  251.        "4  (51.516971016915214, -0.173544641038758)      11.851470   False        14   \n",
  252.        "\n",
  253.        "   risk  jobcat  \n",
  254.        "0     0     3.0  \n",
  255.        "1     0     3.0  \n",
  256.        "2     0     4.0  \n",
  257.        "3     0     1.0  \n",
  258.        "4     0     5.0  \n",
  259.        "\n",
  260.        "[5 rows x 22 columns]"
  261.       ]
  262.      },
  263.      "execution_count": 79,
  264.      "metadata": {},
  265.      "output_type": "execute_result"
  266.     }
  267.    ],
  268.    "source": [
  269.     "df.head()"
  270.    ]
  271.   },
  272.   {
  273.    "cell_type": "code",
  274.    "execution_count": 80,
  275.    "metadata": {},
  276.    "outputs": [],
  277.    "source": [
  278.     "df = df[df.gp_coords.notna()] # get rid of rows which lack gp coords"
  279.    ]
  280.   },
  281.   {
  282.    "cell_type": "code",
  283.    "execution_count": 81,
  284.    "metadata": {},
  285.    "outputs": [],
  286.    "source": [
  287.     "cumrisk = df.groupby('participant_id').risk.sum().reset_index() "
  288.    ]
  289.   },
  290.   {
  291.    "cell_type": "code",
  292.    "execution_count": 82,
  293.    "metadata": {},
  294.    "outputs": [],
  295.    "source": [
  296.     "maxcat = df.groupby('participant_id').jobcat.min().reset_index() "
  297.    ]
  298.   },
  299.   {
  300.    "cell_type": "code",
  301.    "execution_count": 83,
  302.    "metadata": {},
  303.    "outputs": [],
  304.    "source": [
  305.     "df = df[['participant_id', 'pt', 'years', 'agegroup', 'ethnicity', 'es', 'cs', 'distfromcentre', 'centre']]"
  306.    ]
  307.   },
  308.   {
  309.    "cell_type": "code",
  310.    "execution_count": 84,
  311.    "metadata": {},
  312.    "outputs": [
  313.     {
  314.      "data": {
  315.       "text/html": [
  316.        "<div>\n",
  317.        "<style scoped>\n",
  318.        "    .dataframe tbody tr th:only-of-type {\n",
  319.        "        vertical-align: middle;\n",
  320.        "    }\n",
  321.        "\n",
  322.        "    .dataframe tbody tr th {\n",
  323.        "        vertical-align: top;\n",
  324.        "    }\n",
  325.        "\n",
  326.        "    .dataframe thead th {\n",
  327.        "        text-align: right;\n",
  328.        "    }\n",
  329.        "</style>\n",
  330.        "<table border=\"1\" class=\"dataframe\">\n",
  331.        "  <thead>\n",
  332.        "    <tr style=\"text-align: right;\">\n",
  333.        "      <th></th>\n",
  334.        "      <th>participant_id</th>\n",
  335.        "      <th>pt</th>\n",
  336.        "      <th>years</th>\n",
  337.        "      <th>agegroup</th>\n",
  338.        "      <th>ethnicity</th>\n",
  339.        "      <th>es</th>\n",
  340.        "      <th>cs</th>\n",
  341.        "      <th>distfromcentre</th>\n",
  342.        "      <th>centre</th>\n",
  343.        "    </tr>\n",
  344.        "  </thead>\n",
  345.        "  <tbody>\n",
  346.        "    <tr>\n",
  347.        "      <th>2</th>\n",
  348.        "      <td>100009</td>\n",
  349.        "      <td>case</td>\n",
  350.        "      <td>81</td>\n",
  351.        "      <td>80 to 84</td>\n",
  352.        "      <td>White</td>\n",
  353.        "      <td>No</td>\n",
  354.        "      <td>NaN</td>\n",
  355.        "      <td>11.851470</td>\n",
  356.        "      <td>10</td>\n",
  357.        "    </tr>\n",
  358.        "    <tr>\n",
  359.        "      <th>4</th>\n",
  360.        "      <td>100009</td>\n",
  361.        "      <td>case</td>\n",
  362.        "      <td>81</td>\n",
  363.        "      <td>80 to 84</td>\n",
  364.        "      <td>White</td>\n",
  365.        "      <td>No</td>\n",
  366.        "      <td>NaN</td>\n",
  367.        "      <td>11.851470</td>\n",
  368.        "      <td>10</td>\n",
  369.        "    </tr>\n",
  370.        "    <tr>\n",
  371.        "      <th>5</th>\n",
  372.        "      <td>70003</td>\n",
  373.        "      <td>case</td>\n",
  374.        "      <td>55</td>\n",
  375.        "      <td>55 to 59</td>\n",
  376.        "      <td>White</td>\n",
  377.        "      <td>Yes</td>\n",
  378.        "      <td>Yes</td>\n",
  379.        "      <td>30.403741</td>\n",
  380.        "      <td>7</td>\n",
  381.        "    </tr>\n",
  382.        "    <tr>\n",
  383.        "      <th>6</th>\n",
  384.        "      <td>80001</td>\n",
  385.        "      <td>case</td>\n",
  386.        "      <td>87</td>\n",
  387.        "      <td>85 to 90</td>\n",
  388.        "      <td>White</td>\n",
  389.        "      <td>Yes</td>\n",
  390.        "      <td>No</td>\n",
  391.        "      <td>5.549752</td>\n",
  392.        "      <td>8</td>\n",
  393.        "    </tr>\n",
  394.        "    <tr>\n",
  395.        "      <th>7</th>\n",
  396.        "      <td>10002</td>\n",
  397.        "      <td>case</td>\n",
  398.        "      <td>75</td>\n",
  399.        "      <td>75 to 79</td>\n",
  400.        "      <td>White</td>\n",
  401.        "      <td>Yes</td>\n",
  402.        "      <td>No</td>\n",
  403.        "      <td>6.779659</td>\n",
  404.        "      <td>1</td>\n",
  405.        "    </tr>\n",
  406.        "  </tbody>\n",
  407.        "</table>\n",
  408.        "</div>"
  409.       ],
  410.       "text/plain": [
  411.        "   participant_id    pt  years  agegroup ethnicity   es   cs  distfromcentre  \\\n",
  412.        "2          100009  case     81  80 to 84     White   No  NaN       11.851470   \n",
  413.        "4          100009  case     81  80 to 84     White   No  NaN       11.851470   \n",
  414.        "5           70003  case     55  55 to 59     White  Yes  Yes       30.403741   \n",
  415.        "6           80001  case     87  85 to 90     White  Yes   No        5.549752   \n",
  416.        "7           10002  case     75  75 to 79     White  Yes   No        6.779659   \n",
  417.        "\n",
  418.        "   centre  \n",
  419.        "2      10  \n",
  420.        "4      10  \n",
  421.        "5       7  \n",
  422.        "6       8  \n",
  423.        "7       1  "
  424.       ]
  425.      },
  426.      "execution_count": 84,
  427.      "metadata": {},
  428.      "output_type": "execute_result"
  429.     }
  430.    ],
  431.    "source": [
  432.     "df.head()"
  433.    ]
  434.   },
  435.   {
  436.    "cell_type": "code",
  437.    "execution_count": 85,
  438.    "metadata": {},
  439.    "outputs": [],
  440.    "source": [
  441.     "df = pd.merge(df, cumrisk, on='participant_id')\n",
  442.     "df = pd.merge(df, maxcat, on='participant_id')\n"
  443.    ]
  444.   },
  445.   {
  446.    "cell_type": "code",
  447.    "execution_count": 86,
  448.    "metadata": {},
  449.    "outputs": [
  450.     {
  451.      "data": {
  452.       "text/html": [
  453.        "<div>\n",
  454.        "<style scoped>\n",
  455.        "    .dataframe tbody tr th:only-of-type {\n",
  456.        "        vertical-align: middle;\n",
  457.        "    }\n",
  458.        "\n",
  459.        "    .dataframe tbody tr th {\n",
  460.        "        vertical-align: top;\n",
  461.        "    }\n",
  462.        "\n",
  463.        "    .dataframe thead th {\n",
  464.        "        text-align: right;\n",
  465.        "    }\n",
  466.        "</style>\n",
  467.        "<table border=\"1\" class=\"dataframe\">\n",
  468.        "  <thead>\n",
  469.        "    <tr style=\"text-align: right;\">\n",
  470.        "      <th></th>\n",
  471.        "      <th>participant_id</th>\n",
  472.        "      <th>pt</th>\n",
  473.        "      <th>years</th>\n",
  474.        "      <th>agegroup</th>\n",
  475.        "      <th>ethnicity</th>\n",
  476.        "      <th>es</th>\n",
  477.        "      <th>cs</th>\n",
  478.        "      <th>distfromcentre</th>\n",
  479.        "      <th>centre</th>\n",
  480.        "      <th>risk</th>\n",
  481.        "      <th>jobcat</th>\n",
  482.        "    </tr>\n",
  483.        "  </thead>\n",
  484.        "  <tbody>\n",
  485.        "    <tr>\n",
  486.        "      <th>0</th>\n",
  487.        "      <td>100009</td>\n",
  488.        "      <td>case</td>\n",
  489.        "      <td>81</td>\n",
  490.        "      <td>80 to 84</td>\n",
  491.        "      <td>White</td>\n",
  492.        "      <td>No</td>\n",
  493.        "      <td>NaN</td>\n",
  494.        "      <td>11.85147</td>\n",
  495.        "      <td>10</td>\n",
  496.        "      <td>0</td>\n",
  497.        "      <td>1.0</td>\n",
  498.        "    </tr>\n",
  499.        "    <tr>\n",
  500.        "      <th>1</th>\n",
  501.        "      <td>100009</td>\n",
  502.        "      <td>case</td>\n",
  503.        "      <td>81</td>\n",
  504.        "      <td>80 to 84</td>\n",
  505.        "      <td>White</td>\n",
  506.        "      <td>No</td>\n",
  507.        "      <td>NaN</td>\n",
  508.        "      <td>11.85147</td>\n",
  509.        "      <td>10</td>\n",
  510.        "      <td>0</td>\n",
  511.        "      <td>1.0</td>\n",
  512.        "    </tr>\n",
  513.        "    <tr>\n",
  514.        "      <th>2</th>\n",
  515.        "      <td>100009</td>\n",
  516.        "      <td>case</td>\n",
  517.        "      <td>81</td>\n",
  518.        "      <td>80 to 84</td>\n",
  519.        "      <td>White</td>\n",
  520.        "      <td>No</td>\n",
  521.        "      <td>NaN</td>\n",
  522.        "      <td>11.85147</td>\n",
  523.        "      <td>10</td>\n",
  524.        "      <td>0</td>\n",
  525.        "      <td>1.0</td>\n",
  526.        "    </tr>\n",
  527.        "    <tr>\n",
  528.        "      <th>3</th>\n",
  529.        "      <td>100009</td>\n",
  530.        "      <td>case</td>\n",
  531.        "      <td>81</td>\n",
  532.        "      <td>80 to 84</td>\n",
  533.        "      <td>White</td>\n",
  534.        "      <td>No</td>\n",
  535.        "      <td>NaN</td>\n",
  536.        "      <td>11.85147</td>\n",
  537.        "      <td>10</td>\n",
  538.        "      <td>0</td>\n",
  539.        "      <td>1.0</td>\n",
  540.        "    </tr>\n",
  541.        "    <tr>\n",
  542.        "      <th>4</th>\n",
  543.        "      <td>100009</td>\n",
  544.        "      <td>case</td>\n",
  545.        "      <td>81</td>\n",
  546.        "      <td>80 to 84</td>\n",
  547.        "      <td>White</td>\n",
  548.        "      <td>No</td>\n",
  549.        "      <td>NaN</td>\n",
  550.        "      <td>11.85147</td>\n",
  551.        "      <td>10</td>\n",
  552.        "      <td>0</td>\n",
  553.        "      <td>1.0</td>\n",
  554.        "    </tr>\n",
  555.        "  </tbody>\n",
  556.        "</table>\n",
  557.        "</div>"
  558.       ],
  559.       "text/plain": [
  560.        "   participant_id    pt  years  agegroup ethnicity  es   cs  distfromcentre  \\\n",
  561.        "0          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  562.        "1          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  563.        "2          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  564.        "3          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  565.        "4          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  566.        "\n",
  567.        "   centre  risk  jobcat  \n",
  568.        "0      10     0     1.0  \n",
  569.        "1      10     0     1.0  \n",
  570.        "2      10     0     1.0  \n",
  571.        "3      10     0     1.0  \n",
  572.        "4      10     0     1.0  "
  573.       ]
  574.      },
  575.      "execution_count": 86,
  576.      "metadata": {},
  577.      "output_type": "execute_result"
  578.     }
  579.    ],
  580.    "source": [
  581.     "df.head()"
  582.    ]
  583.   },
  584.   {
  585.    "cell_type": "code",
  586.    "execution_count": 87,
  587.    "metadata": {},
  588.    "outputs": [],
  589.    "source": [
  590.     "# defining anything less than cat 5 as exposed\n",
  591.     "\n",
  592.     "ev_exposed = df.groupby('participant_id').jobcat.min() < 5\n",
  593.     "ev_exposed = ev_exposed.reset_index() \n",
  594.     "ev_exposed.columns = ['participant_id', 'exposed']"
  595.    ]
  596.   },
  597.   {
  598.    "cell_type": "code",
  599.    "execution_count": 88,
  600.    "metadata": {},
  601.    "outputs": [],
  602.    "source": [
  603.     "df = pd.merge(df, ev_exposed, on='participant_id')"
  604.    ]
  605.   },
  606.   {
  607.    "cell_type": "code",
  608.    "execution_count": 89,
  609.    "metadata": {},
  610.    "outputs": [
  611.     {
  612.      "data": {
  613.       "text/html": [
  614.        "<div>\n",
  615.        "<style scoped>\n",
  616.        "    .dataframe tbody tr th:only-of-type {\n",
  617.        "        vertical-align: middle;\n",
  618.        "    }\n",
  619.        "\n",
  620.        "    .dataframe tbody tr th {\n",
  621.        "        vertical-align: top;\n",
  622.        "    }\n",
  623.        "\n",
  624.        "    .dataframe thead th {\n",
  625.        "        text-align: right;\n",
  626.        "    }\n",
  627.        "</style>\n",
  628.        "<table border=\"1\" class=\"dataframe\">\n",
  629.        "  <thead>\n",
  630.        "    <tr style=\"text-align: right;\">\n",
  631.        "      <th></th>\n",
  632.        "      <th>participant_id</th>\n",
  633.        "      <th>pt</th>\n",
  634.        "      <th>years</th>\n",
  635.        "      <th>agegroup</th>\n",
  636.        "      <th>ethnicity</th>\n",
  637.        "      <th>es</th>\n",
  638.        "      <th>cs</th>\n",
  639.        "      <th>distfromcentre</th>\n",
  640.        "      <th>centre</th>\n",
  641.        "      <th>risk</th>\n",
  642.        "      <th>jobcat</th>\n",
  643.        "      <th>exposed</th>\n",
  644.        "    </tr>\n",
  645.        "  </thead>\n",
  646.        "  <tbody>\n",
  647.        "    <tr>\n",
  648.        "      <th>0</th>\n",
  649.        "      <td>100009</td>\n",
  650.        "      <td>case</td>\n",
  651.        "      <td>81</td>\n",
  652.        "      <td>80 to 84</td>\n",
  653.        "      <td>White</td>\n",
  654.        "      <td>No</td>\n",
  655.        "      <td>NaN</td>\n",
  656.        "      <td>11.85147</td>\n",
  657.        "      <td>10</td>\n",
  658.        "      <td>0</td>\n",
  659.        "      <td>1.0</td>\n",
  660.        "      <td>True</td>\n",
  661.        "    </tr>\n",
  662.        "    <tr>\n",
  663.        "      <th>1</th>\n",
  664.        "      <td>100009</td>\n",
  665.        "      <td>case</td>\n",
  666.        "      <td>81</td>\n",
  667.        "      <td>80 to 84</td>\n",
  668.        "      <td>White</td>\n",
  669.        "      <td>No</td>\n",
  670.        "      <td>NaN</td>\n",
  671.        "      <td>11.85147</td>\n",
  672.        "      <td>10</td>\n",
  673.        "      <td>0</td>\n",
  674.        "      <td>1.0</td>\n",
  675.        "      <td>True</td>\n",
  676.        "    </tr>\n",
  677.        "    <tr>\n",
  678.        "      <th>2</th>\n",
  679.        "      <td>100009</td>\n",
  680.        "      <td>case</td>\n",
  681.        "      <td>81</td>\n",
  682.        "      <td>80 to 84</td>\n",
  683.        "      <td>White</td>\n",
  684.        "      <td>No</td>\n",
  685.        "      <td>NaN</td>\n",
  686.        "      <td>11.85147</td>\n",
  687.        "      <td>10</td>\n",
  688.        "      <td>0</td>\n",
  689.        "      <td>1.0</td>\n",
  690.        "      <td>True</td>\n",
  691.        "    </tr>\n",
  692.        "    <tr>\n",
  693.        "      <th>3</th>\n",
  694.        "      <td>100009</td>\n",
  695.        "      <td>case</td>\n",
  696.        "      <td>81</td>\n",
  697.        "      <td>80 to 84</td>\n",
  698.        "      <td>White</td>\n",
  699.        "      <td>No</td>\n",
  700.        "      <td>NaN</td>\n",
  701.        "      <td>11.85147</td>\n",
  702.        "      <td>10</td>\n",
  703.        "      <td>0</td>\n",
  704.        "      <td>1.0</td>\n",
  705.        "      <td>True</td>\n",
  706.        "    </tr>\n",
  707.        "    <tr>\n",
  708.        "      <th>4</th>\n",
  709.        "      <td>100009</td>\n",
  710.        "      <td>case</td>\n",
  711.        "      <td>81</td>\n",
  712.        "      <td>80 to 84</td>\n",
  713.        "      <td>White</td>\n",
  714.        "      <td>No</td>\n",
  715.        "      <td>NaN</td>\n",
  716.        "      <td>11.85147</td>\n",
  717.        "      <td>10</td>\n",
  718.        "      <td>0</td>\n",
  719.        "      <td>1.0</td>\n",
  720.        "      <td>True</td>\n",
  721.        "    </tr>\n",
  722.        "  </tbody>\n",
  723.        "</table>\n",
  724.        "</div>"
  725.       ],
  726.       "text/plain": [
  727.        "   participant_id    pt  years  agegroup ethnicity  es   cs  distfromcentre  \\\n",
  728.        "0          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  729.        "1          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  730.        "2          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  731.        "3          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  732.        "4          100009  case     81  80 to 84     White  No  NaN        11.85147   \n",
  733.        "\n",
  734.        "   centre  risk  jobcat  exposed  \n",
  735.        "0      10     0     1.0     True  \n",
  736.        "1      10     0     1.0     True  \n",
  737.        "2      10     0     1.0     True  \n",
  738.        "3      10     0     1.0     True  \n",
  739.        "4      10     0     1.0     True  "
  740.       ]
  741.      },
  742.      "execution_count": 89,
  743.      "metadata": {},
  744.      "output_type": "execute_result"
  745.     }
  746.    ],
  747.    "source": [
  748.     "df.head()"
  749.    ]
  750.   },
  751.   {
  752.    "cell_type": "code",
  753.    "execution_count": 90,
  754.    "metadata": {},
  755.    "outputs": [],
  756.    "source": [
  757.     "df.cs = df.cs.fillna('No')"
  758.    ]
  759.   },
  760.   {
  761.    "cell_type": "code",
  762.    "execution_count": 91,
  763.    "metadata": {},
  764.    "outputs": [],
  765.    "source": [
  766.     "# pandas has a get dummies function for this which would be more elegant\n",
  767.     "df['pt'] = df['pt'].str.replace('case', '1')\n",
  768.     "df['pt'] = df['pt'].str.replace('control', '0')\n",
  769.     "df['pt'] = df['pt'].astype(int)\n",
  770.     "\n",
  771.     "df['es'] = df['es'].str.replace('Yes', '1')\n",
  772.     "df['es'] = df['es'].str.replace('No', '0')\n",
  773.     "df['es'] = df['es'].astype(int)\n",
  774.     "\n",
  775.     "df['exposed'] = df['exposed'].astype(str)\n",
  776.     "df['exposed'] = df['exposed'].str.replace('True', '1')\n",
  777.     "df['exposed'] = df['exposed'].str.replace('False', '0')\n",
  778.     "df['exposed'] = df['exposed'].astype(int)"
  779.    ]
  780.   },
  781.   {
  782.    "cell_type": "code",
  783.    "execution_count": 92,
  784.    "metadata": {},
  785.    "outputs": [
  786.     {
  787.      "data": {
  788.       "text/html": [
  789.        "<div>\n",
  790.        "<style scoped>\n",
  791.        "    .dataframe tbody tr th:only-of-type {\n",
  792.        "        vertical-align: middle;\n",
  793.        "    }\n",
  794.        "\n",
  795.        "    .dataframe tbody tr th {\n",
  796.        "        vertical-align: top;\n",
  797.        "    }\n",
  798.        "\n",
  799.        "    .dataframe thead th {\n",
  800.        "        text-align: right;\n",
  801.        "    }\n",
  802.        "</style>\n",
  803.        "<table border=\"1\" class=\"dataframe\">\n",
  804.        "  <thead>\n",
  805.        "    <tr style=\"text-align: right;\">\n",
  806.        "      <th></th>\n",
  807.        "      <th>participant_id</th>\n",
  808.        "      <th>pt</th>\n",
  809.        "      <th>years</th>\n",
  810.        "      <th>agegroup</th>\n",
  811.        "      <th>ethnicity</th>\n",
  812.        "      <th>es</th>\n",
  813.        "      <th>cs</th>\n",
  814.        "      <th>distfromcentre</th>\n",
  815.        "      <th>centre</th>\n",
  816.        "      <th>risk</th>\n",
  817.        "      <th>jobcat</th>\n",
  818.        "      <th>exposed</th>\n",
  819.        "    </tr>\n",
  820.        "  </thead>\n",
  821.        "  <tbody>\n",
  822.        "    <tr>\n",
  823.        "      <th>0</th>\n",
  824.        "      <td>100009</td>\n",
  825.        "      <td>1</td>\n",
  826.        "      <td>81</td>\n",
  827.        "      <td>80 to 84</td>\n",
  828.        "      <td>White</td>\n",
  829.        "      <td>0</td>\n",
  830.        "      <td>No</td>\n",
  831.        "      <td>11.85147</td>\n",
  832.        "      <td>10</td>\n",
  833.        "      <td>0</td>\n",
  834.        "      <td>1.0</td>\n",
  835.        "      <td>1</td>\n",
  836.        "    </tr>\n",
  837.        "    <tr>\n",
  838.        "      <th>1</th>\n",
  839.        "      <td>100009</td>\n",
  840.        "      <td>1</td>\n",
  841.        "      <td>81</td>\n",
  842.        "      <td>80 to 84</td>\n",
  843.        "      <td>White</td>\n",
  844.        "      <td>0</td>\n",
  845.        "      <td>No</td>\n",
  846.        "      <td>11.85147</td>\n",
  847.        "      <td>10</td>\n",
  848.        "      <td>0</td>\n",
  849.        "      <td>1.0</td>\n",
  850.        "      <td>1</td>\n",
  851.        "    </tr>\n",
  852.        "    <tr>\n",
  853.        "      <th>2</th>\n",
  854.        "      <td>100009</td>\n",
  855.        "      <td>1</td>\n",
  856.        "      <td>81</td>\n",
  857.        "      <td>80 to 84</td>\n",
  858.        "      <td>White</td>\n",
  859.        "      <td>0</td>\n",
  860.        "      <td>No</td>\n",
  861.        "      <td>11.85147</td>\n",
  862.        "      <td>10</td>\n",
  863.        "      <td>0</td>\n",
  864.        "      <td>1.0</td>\n",
  865.        "      <td>1</td>\n",
  866.        "    </tr>\n",
  867.        "    <tr>\n",
  868.        "      <th>3</th>\n",
  869.        "      <td>100009</td>\n",
  870.        "      <td>1</td>\n",
  871.        "      <td>81</td>\n",
  872.        "      <td>80 to 84</td>\n",
  873.        "      <td>White</td>\n",
  874.        "      <td>0</td>\n",
  875.        "      <td>No</td>\n",
  876.        "      <td>11.85147</td>\n",
  877.        "      <td>10</td>\n",
  878.        "      <td>0</td>\n",
  879.        "      <td>1.0</td>\n",
  880.        "      <td>1</td>\n",
  881.        "    </tr>\n",
  882.        "    <tr>\n",
  883.        "      <th>4</th>\n",
  884.        "      <td>100009</td>\n",
  885.        "      <td>1</td>\n",
  886.        "      <td>81</td>\n",
  887.        "      <td>80 to 84</td>\n",
  888.        "      <td>White</td>\n",
  889.        "      <td>0</td>\n",
  890.        "      <td>No</td>\n",
  891.        "      <td>11.85147</td>\n",
  892.        "      <td>10</td>\n",
  893.        "      <td>0</td>\n",
  894.        "      <td>1.0</td>\n",
  895.        "      <td>1</td>\n",
  896.        "    </tr>\n",
  897.        "  </tbody>\n",
  898.        "</table>\n",
  899.        "</div>"
  900.       ],
  901.       "text/plain": [
  902.        "   participant_id  pt  years  agegroup ethnicity  es  cs  distfromcentre  \\\n",
  903.        "0          100009   1     81  80 to 84     White   0  No        11.85147   \n",
  904.        "1          100009   1     81  80 to 84     White   0  No        11.85147   \n",
  905.        "2          100009   1     81  80 to 84     White   0  No        11.85147   \n",
  906.        "3          100009   1     81  80 to 84     White   0  No        11.85147   \n",
  907.        "4          100009   1     81  80 to 84     White   0  No        11.85147   \n",
  908.        "\n",
  909.        "   centre  risk  jobcat  exposed  \n",
  910.        "0      10     0     1.0        1  \n",
  911.        "1      10     0     1.0        1  \n",
  912.        "2      10     0     1.0        1  \n",
  913.        "3      10     0     1.0        1  \n",
  914.        "4      10     0     1.0        1  "
  915.       ]
  916.      },
  917.      "execution_count": 92,
  918.      "metadata": {},
  919.      "output_type": "execute_result"
  920.     }
  921.    ],
  922.    "source": [
  923.     "df.head()"
  924.    ]
  925.   },
  926.   {
  927.    "cell_type": "code",
  928.    "execution_count": 96,
  929.    "metadata": {},
  930.    "outputs": [],
  931.    "source": [
  932.     "df.columns = ['participant_id', 'case', 'age', 'agegroup', 'ethnicity', 'ever_smoked',  'current_smoker', 'distfromcentre', 'centre', 'exp_cum', 'jobcat', 'exp_bin']"
  933.    ]
  934.   },
  935.   {
  936.    "cell_type": "code",
  937.    "execution_count": 97,
  938.    "metadata": {},
  939.    "outputs": [],
  940.    "source": [
  941.     "df = df.drop_duplicates(subset='participant_id')"
  942.    ]
  943.   },
  944.   {
  945.    "cell_type": "code",
  946.    "execution_count": 98,
  947.    "metadata": {},
  948.    "outputs": [
  949.     {
  950.      "data": {
  951.       "text/html": [
  952.        "<div>\n",
  953.        "<style scoped>\n",
  954.        "    .dataframe tbody tr th:only-of-type {\n",
  955.        "        vertical-align: middle;\n",
  956.        "    }\n",
  957.        "\n",
  958.        "    .dataframe tbody tr th {\n",
  959.        "        vertical-align: top;\n",
  960.        "    }\n",
  961.        "\n",
  962.        "    .dataframe thead th {\n",
  963.        "        text-align: right;\n",
  964.        "    }\n",
  965.        "</style>\n",
  966.        "<table border=\"1\" class=\"dataframe\">\n",
  967.        "  <thead>\n",
  968.        "    <tr style=\"text-align: right;\">\n",
  969.        "      <th></th>\n",
  970.        "      <th>participant_id</th>\n",
  971.        "      <th>case</th>\n",
  972.        "      <th>age</th>\n",
  973.        "      <th>agegroup</th>\n",
  974.        "      <th>ethnicity</th>\n",
  975.        "      <th>ever_smoked</th>\n",
  976.        "      <th>current_smoker</th>\n",
  977.        "      <th>distfromcentre</th>\n",
  978.        "      <th>centre</th>\n",
  979.        "      <th>exp_cum</th>\n",
  980.        "      <th>jobcat</th>\n",
  981.        "      <th>exp_bin</th>\n",
  982.        "    </tr>\n",
  983.        "  </thead>\n",
  984.        "  <tbody>\n",
  985.        "    <tr>\n",
  986.        "      <th>0</th>\n",
  987.        "      <td>100009</td>\n",
  988.        "      <td>1</td>\n",
  989.        "      <td>81</td>\n",
  990.        "      <td>80 to 84</td>\n",
  991.        "      <td>White</td>\n",
  992.        "      <td>0</td>\n",
  993.        "      <td>No</td>\n",
  994.        "      <td>11.851470</td>\n",
  995.        "      <td>10</td>\n",
  996.        "      <td>0</td>\n",
  997.        "      <td>1.0</td>\n",
  998.        "      <td>1</td>\n",
  999.        "    </tr>\n",
  1000.        "    <tr>\n",
  1001.        "      <th>5</th>\n",
  1002.        "      <td>70003</td>\n",
  1003.        "      <td>1</td>\n",
  1004.        "      <td>55</td>\n",
  1005.        "      <td>55 to 59</td>\n",
  1006.        "      <td>White</td>\n",
  1007.        "      <td>1</td>\n",
  1008.        "      <td>Yes</td>\n",
  1009.        "      <td>30.403741</td>\n",
  1010.        "      <td>7</td>\n",
  1011.        "      <td>47</td>\n",
  1012.        "      <td>1.0</td>\n",
  1013.        "      <td>1</td>\n",
  1014.        "    </tr>\n",
  1015.        "    <tr>\n",
  1016.        "      <th>8</th>\n",
  1017.        "      <td>80001</td>\n",
  1018.        "      <td>1</td>\n",
  1019.        "      <td>87</td>\n",
  1020.        "      <td>85 to 90</td>\n",
  1021.        "      <td>White</td>\n",
  1022.        "      <td>1</td>\n",
  1023.        "      <td>No</td>\n",
  1024.        "      <td>5.549752</td>\n",
  1025.        "      <td>8</td>\n",
  1026.        "      <td>43</td>\n",
  1027.        "      <td>2.2</td>\n",
  1028.        "      <td>1</td>\n",
  1029.        "    </tr>\n",
  1030.        "    <tr>\n",
  1031.        "      <th>13</th>\n",
  1032.        "      <td>10002</td>\n",
  1033.        "      <td>1</td>\n",
  1034.        "      <td>75</td>\n",
  1035.        "      <td>75 to 79</td>\n",
  1036.        "      <td>White</td>\n",
  1037.        "      <td>1</td>\n",
  1038.        "      <td>No</td>\n",
  1039.        "      <td>6.779659</td>\n",
  1040.        "      <td>1</td>\n",
  1041.        "      <td>44</td>\n",
  1042.        "      <td>3.0</td>\n",
  1043.        "      <td>1</td>\n",
  1044.        "    </tr>\n",
  1045.        "    <tr>\n",
  1046.        "      <th>16</th>\n",
  1047.        "      <td>80010</td>\n",
  1048.        "      <td>1</td>\n",
  1049.        "      <td>77</td>\n",
  1050.        "      <td>75 to 79</td>\n",
  1051.        "      <td>White</td>\n",
  1052.        "      <td>1</td>\n",
  1053.        "      <td>No</td>\n",
  1054.        "      <td>33.762778</td>\n",
  1055.        "      <td>8</td>\n",
  1056.        "      <td>23</td>\n",
  1057.        "      <td>2.1</td>\n",
  1058.        "      <td>1</td>\n",
  1059.        "    </tr>\n",
  1060.        "  </tbody>\n",
  1061.        "</table>\n",
  1062.        "</div>"
  1063.       ],
  1064.       "text/plain": [
  1065.        "    participant_id  case  age  agegroup ethnicity  ever_smoked current_smoker  \\\n",
  1066.        "0           100009     1   81  80 to 84     White            0             No   \n",
  1067.        "5            70003     1   55  55 to 59     White            1            Yes   \n",
  1068.        "8            80001     1   87  85 to 90     White            1             No   \n",
  1069.        "13           10002     1   75  75 to 79     White            1             No   \n",
  1070.        "16           80010     1   77  75 to 79     White            1             No   \n",
  1071.        "\n",
  1072.        "    distfromcentre  centre  exp_cum  jobcat  exp_bin  \n",
  1073.        "0        11.851470      10        0     1.0        1  \n",
  1074.        "5        30.403741       7       47     1.0        1  \n",
  1075.        "8         5.549752       8       43     2.2        1  \n",
  1076.        "13        6.779659       1       44     3.0        1  \n",
  1077.        "16       33.762778       8       23     2.1        1  "
  1078.       ]
  1079.      },
  1080.      "execution_count": 98,
  1081.      "metadata": {},
  1082.      "output_type": "execute_result"
  1083.     }
  1084.    ],
  1085.    "source": [
  1086.     "df.head() # note exp_cum is buggy currently"
  1087.    ]
  1088.   },
  1089.   {
  1090.    "cell_type": "code",
  1091.    "execution_count": 47,
  1092.    "metadata": {},
  1093.    "outputs": [
  1094.     {
  1095.      "data": {
  1096.       "text/plain": [
  1097.        "<matplotlib.axes._subplots.AxesSubplot at 0x7f97790adc50>"
  1098.       ]
  1099.      },
  1100.      "execution_count": 47,
  1101.      "metadata": {},
  1102.      "output_type": "execute_result"
  1103.     },
  1104.     {
  1105.      "data": {
  1106.       "image/png": "\n",
  1107.       "text/plain": [
  1108.        "<matplotlib.figure.Figure at 0x7f977909c630>"
  1109.       ]
  1110.      },
  1111.      "metadata": {},
  1112.      "output_type": "display_data"
  1113.     }
  1114.    ],
  1115.    "source": [
  1116.     "df.plot(x='distfromcentre', y='case', kind='scatter') # being a case is assoc with being further from centre"
  1117.    ]
  1118.   },
  1119.   {
  1120.    "cell_type": "code",
  1121.    "execution_count": 44,
  1122.    "metadata": {},
  1123.    "outputs": [
  1124.     {
  1125.      "data": {
  1126.       "text/plain": [
  1127.        "<matplotlib.axes._subplots.AxesSubplot at 0x7f97791cf080>"
  1128.       ]
  1129.      },
  1130.      "execution_count": 44,
  1131.      "metadata": {},
  1132.      "output_type": "execute_result"
  1133.     },
  1134.     {
  1135.      "data": {
  1136.       "image/png": "\n",
  1137.       "text/plain": [
  1138.        "<matplotlib.figure.Figure at 0x7f97791cdbe0>"
  1139.       ]
  1140.      },
  1141.      "metadata": {},
  1142.      "output_type": "display_data"
  1143.     }
  1144.    ],
  1145.    "source": [
  1146.     "df.boxplot(column='distfromcentre', by='case', vert=False, figsize=(5,5))"
  1147.    ]
  1148.   },
  1149.   {
  1150.    "cell_type": "code",
  1151.    "execution_count": 101,
  1152.    "metadata": {},
  1153.    "outputs": [
  1154.     {
  1155.      "data": {
  1156.       "text/plain": [
  1157.        "case\n",
  1158.        "0    13.149346\n",
  1159.        "1    27.014471\n",
  1160.        "Name: distfromcentre, dtype: float64"
  1161.       ]
  1162.      },
  1163.      "execution_count": 101,
  1164.      "metadata": {},
  1165.      "output_type": "execute_result"
  1166.     }
  1167.    ],
  1168.    "source": [
  1169.     "df.groupby('case').distfromcentre.mean()"
  1170.    ]
  1171.   },
  1172.   {
  1173.    "cell_type": "code",
  1174.    "execution_count": 24,
  1175.    "metadata": {},
  1176.    "outputs": [
  1177.     {
  1178.      "data": {
  1179.       "text/plain": [
  1180.        "<matplotlib.axes._subplots.AxesSubplot at 0x7f977cbf3cc0>"
  1181.       ]
  1182.      },
  1183.      "execution_count": 24,
  1184.      "metadata": {},
  1185.      "output_type": "execute_result"
  1186.     },
  1187.     {
  1188.      "data": {
  1189.       "image/png": "\n",
  1190.       "text/plain": [
  1191.        "<matplotlib.figure.Figure at 0x7f977cc4d5c0>"
  1192.       ]
  1193.      },
  1194.      "metadata": {},
  1195.      "output_type": "display_data"
  1196.     }
  1197.    ],
  1198.    "source": [
  1199.     "df.plot(x='distfromcentre', y='exp_bin', kind='scatter') # being exposed is assoc with being closer to centre"
  1200.    ]
  1201.   },
  1202.   {
  1203.    "cell_type": "code",
  1204.    "execution_count": 46,
  1205.    "metadata": {},
  1206.    "outputs": [
  1207.     {
  1208.      "data": {
  1209.       "text/plain": [
  1210.        "<matplotlib.axes._subplots.AxesSubplot at 0x7f97790ea8d0>"
  1211.       ]
  1212.      },
  1213.      "execution_count": 46,
  1214.      "metadata": {},
  1215.      "output_type": "execute_result"
  1216.     },
  1217.     {
  1218.      "data": {
  1219.       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUUAAAFZCAYAAAAPYD29AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFz1JREFUeJzt3XmYplWZ3/FvT7eAIEgDDciOiHcEZ8SdRCBIohGiYoILi4BDI0YBw8BkRFHES3GaiTo4ICoCAso6gga3uOAEnKgsIeIMkhuBoGAjtNAzQjcNNvT8cZ7Cm+Ktrfutequqv5/r6qvrfdZznlP1q/Nsp+asWrUKSVLzR4MugCRNJ4aiJBWGoiQVhqIkFYaiJBWGoiQVhuIMFhHnR8THBl2OQRvtOETEOyLi76e6TP0wVvtGxMMR8dypLNPaYN6gCzAbRMRdwBbA48DvgR8B/yUz7x5gsZ4iIlYBO2fm7YMui/ojM5816DLMRvYU++cN3Tfpc4D7gDMGXJ5JExFzIsLvHc1K9hT7LDNXRMRXgNOHpkXEs2khuS+wHPgC8PHMfCIiPgssyMw3d8ueBrwM+PfAvwW+DJwFHA88DJyUmRf12ndEvBN4H7AJ8Pe03uriiLi2W+Tmrse4MDMvG7buXOCvgMOBh4BPdmV+RmaujIj/BfxvYG/gJcAfR8Ry4HPAHsCDwGmZ+YVue+cD92TmB7vPewNfzsxtus93AZ8HDqX9Ivka8O7MXNHNfz3wMWAH4OddXX7WzXsxcC6wM/AtYKzXsuZExBnAYcC9wNGZeXVEvAU4MTNfWo7DCcCemfmmHsf32cCngP2AJ4AvAh/OzMf72Y7DbBYR3wN2B24CDsvMX3b7eLL33x3vZd3x2qs7Zgdn5h3j2IcKf9v3WUSsD7wN+EmZfAbwbOC5tB+Qw4A/7eadAPxJd+1rT2AhcHhmDv2gbwlsBmxNC6yzIyJ67Hcf4C+Bt9JC5pfApQCZuVe32Isy81nDA7HzTlpo70YLvaeFAi3AjgI27LZ/CXAPsBXwZuDjEfHvRjw4T3cI8B+AnYDnA0MB+hLgPOBdwKa08LwqItaNiHVoAfolWvj/LXDAGPt5JXAn7Th+GLgyIjYBrgJ2jIgXlGXf3m27lwuAlcDzgBcDrwWO7Ob1pR17OAT4aLfuT4HRgvQg4CPAfOB24NRxbF/DGIr987WI+Cfgd8BrgP8OT/bA3ga8PzMfysy7aL2wQwEyczntB/FTtN7EsZl5z7BtfygzH83Ma4Bv0oJvuEOA8zLzpsx8FHg/8K8jYodxlv+twKcz857MXAos6rHM+Zl5S2aupP2Q7wG8LzNXZOZPgXOG6jVOZ2bm3Zn5IO0H+KBu+juBz2fmdZn5eGZeADxK6y3tDjwDOD0zf5+ZXwFuGGM/95flLwMS+I/dcbqMdvyJiF1pPa1vDN9ARGxB+6VxXGYuy8z7gb8GDoS+tuNw38zMa7uynkRr021HWPbKzLy+a5+LaL/gNEGGYv+8KTM3BtYFjgGuiYih3sE6tJ7VkF/SegwAZOb1tJ7MHODyYdtdmpnLhq27VY/9b1X3kZkPAw/U/YxhK6DeGOp1k6hO2wp4MDMfGla28e5v+PZqvbYHToiIfxr6B2zbzd8K+HXpgQ2tO5peyw/t6wLg4IiYQwv0y7sAGm57WhjfW8r0eWDzoQX61I7DPXmMujZ9cJT1flO+Xg54I2Y1GIp91vVsrqTdid4D+C3tjvT2ZbHtgF8PfYiIo2lhuhj4i2GbnB8RGwxbd3GPXS+u++jW2bTuZwz3AtuUz716IzVYFgObRMSGw8o2tL9lwPpl3pY9tlf3Uet1N3BqZm5c/q2fmZd05dy6C7G67mh6Lb8YIDN/AjwG7AkczMinznfTequblTJtlJm7Di3Qp3Yc7sljFBHPol0yGM96Wk3eaOmz7ofvjbTrOrd2F+EvB06NiMNo39THA5/oln8+7YbC3rTf7tdHxLe709EhH4mID9Cujb2edl1suIuBSyPiYuBW4OPAdd3pOrQ74s+lXWvq5XLgv0bEN2mB9r7R6pmZd0fEj4C/jIg/p10TXEh3Kkq7/nVC95zdOsBxPTZzdER8o6v3B2instBuRH01Ir4PXE8L172Ba4Ef067rvTciPkM71q8A/m6U4m7eLX8W7VrpC2g3aIZcCJwJrMzMns80Zua9EfFd4JMR8SHazZIdgW0y85o+tuNw+0XEHt1x+CitTafNo16zkT3F/vl6RDxMu6Z4Ku0i+y3dvGNpQXMn7a7wxcB5ETGPdv3ptMy8OTN/QQuHL0XEut26vwGW0noHF9Huwv6/4TvPzKuBDwFX0HpTO9Fd7+qcAlzQnfr1upb1BeC7wM+A/0sLjZW0Hu9IDqJdg1sMfJV2J/Z73bwvATcDd3Xb7XVz5+Ju3p3dv491dbmRdl3xzK7utwPv6OY9Bvzn7vNS2vXaK0cpI8B1tDvVv6W1zZsz84Ey/0vACxm5lzjkMFrA/7zb91eA5/SzHXu4mBaeDwIvpV071iSa4yCz09fwx1imeN/7Ap/LzO3HXHj1tn8XcGRmfn8ytj8REfFM2s2Yl3SB1u/t782A2lET5+mzgCeD4dW0ntsWtN7JVwdaqKnzbuCGyQhEzTyGoobMoT3jdhnwCO2RkZMHWqIp0PVY59D7ucyp2P8tPPUm3JB3jfPhbvWZp89rqaE3ToDvAOdk5ngeJB6+jS1oD0+/GDg7M0/oayGlAbCnuJbLzB8CYwZiRJwCPC8z314mH0W7ebHRsOcAZxQHy1Dl3Wetie2Bn48UiN1d2RlvttRD4+Pp81pihEEUbge+z1MHangf8F5gI9rjI++hvclxFe3a26PAHbTHdg7ptvMY7ZrcHrRHW1bQnh88nvaYy2n84ZW2y2mvBj46dFcW+Bvgz2mP/7y7297ptLeBPpGZH+/KNpf2/ORC2rOHt9HeJLo7Iv4V7R3zlwJLaK/UXd6tdz4jDJbQDZaxJ+3ZwlXdtu/rynUG8GfA9zLz0NEGqdDsYU9xLTDeQRS6AQqOAV6emRvSBmu4KzP/J+1h8Mu6ASVelJnvoD1v91fdtKFHa/anPb+3cTf/JNr7yrsBL6I9aP3BststgfVorweeTHte8u20cNsTOLkMpHo87dnI/WihfQSwvHtT5Hu0Z/o275Y5q3uXeUjPwRJGGSxjy+5YbQ8cNdogFSMcds1QnhasHeogCquAr0TE8T2We5z2mtouEbGkvA0zET/OzK91Xz8SEYfQBke4HyAiPkILlA91y/ye9krf4xFxKXA2bWCKh4Bburuzf0J7uPtI4C8yM7t1b+62+TZaeH+xm35TRFxBG7ln6AH6K7t3k4mIi2gDN4zmCdrD6I926zw5SEU3/4Lu7ZTdgWsmcoA0vRmKa4dxDaKQbVy+42hvv+waEd8Bjs/MibxrO/wVtKcMVMHTB0J4IDOH3pp5pPv/vjL/Ef4wsMG2tFP34bYHXtkN0jBkHk99Q2WigyUsyW5sx7KPwyPi2DJtHcY3qINmEENx7fDkIAolGLejR8Bk5sXAxRGxEa1Hdxpt9JjxXnwevtzQQBVDPbbxDoTQy9201xf/scf0azLzNau53V6G12NokArHKJzlDMW1w7gGUeiuKW5NG2F7Ba2XNnTd+T7gNRHxR5n5xAT2fQnwwYi4gRY0J9NuYqyOc4CPRsTPadcF/5g2Ks83gEURcSjdwLq0a5gPZ+at49juWINlwCiDVAwbPk0znDda1gITGERhXdrgsr+lnW5uThvYANrNGYAHIuKmCez+Y8CNtIEm/oE2pP7q/gXCT9HuXn+XNvDGucAzu1B6LW0AjMVd2U/r6jMepzD6YBmjDlKh2cVHciSpsKcoSYWhKEmFoShJhaEoSYWhKEnFpDynuGTJQxO6pT1//vosXbp8MooycNZtZrJuM9P8+eszb97cOWMvObJp0VOcN2/uoIswaazbzGTdZqZ+1G1ahKIkTReGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQVhqIkFYaiJBXzBl2A0Rx7+rUsW7Fy1GUW/uoqzt3ujau9jw3Wm8cZx+212utLml2mdSguW7GS807cZ9RlbjvywjGXGc0Ri36w2utKmn08fZakwlCUpMJQlKRi2oTiAQe8YdBFGDiPgTR4Y95oiYjzgNcD92fmCyerIHfc8YvJ2vSM4TGQBm88PcXzgddNcjkkaVoYMxQz81rgwSkoiyQN3LS5pihJ08GkPLw9f/76zJs3d8Lr9XqQesGCDUdd57ZxLLM6+x2UNa3LVJtp5Z0I67Z2mpRQXLp0+YSWH2qg4W+mHLHoByxZ8tCY649nmdGsyRsxY1mwYMNxl2+3S9e8LlNpInWbaazbzNSPsPf0WZKKMUMxIi4Bfty+jHsiYuHkF0uSBmPM0+fMPGgqCiJJ08G0OX3eaaedB12EgfMYSIM3bULxiiu+PugiDJzHQBq8aROKkjQdGIqSVBiKklQYipJUTOu/0QJjv4K3cJ2NWbQGr+ltsN60PwSSptC0ToTxvX63D6+a9JJIWlt4+ixJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJhaEoSYWhKEmFoShJxbxBF2A0x55+LctWrOw5b+GvruLc7d44xSUa2QbrzeOM4/YadDEkraFpHYrLVqzkvBP36TnvtiMvHHHeIByx6AeDLoKkPvD0WZIKQ1GSCkNRkoppE4oHHPCGQRdBq8F202wzrhstEfE64NPAXOCczFzU74Lccccv+r1JTQHbTbPNmD3FiJgLfAbYF9gFOCgidpnsgknSIIzn9PkVwO2ZeWdmPgZcCuw/ucWSpMEYTyhuDdxdPt/TTZOkWWc81xTn9Ji2arQV5s9fn3nz5k64ML0egF6wYMOey942yrxBWVsf4J5u7dAvs7VeMLvrtqbGE4r3ANuWz9sAi0dbYenS5RMqxFADDX9D5YhFP2DJkodGXG+0eYPQ6w2bBQs2nHbl7JcFCzZk60vfMyvrN9vbbTbXbU2NJxRvAHaOiB2BXwMHAgev8Z4laRoa85piZq4EjgG+A9wKXJ6Zt0x2wSRpEMb1nGJmfgv41iSXRZIGbtq80bLTTjsPughaDbabZptpE4pXXPH1QRdBq8F202wzbUJRkqYDQ1GSCkNRkgpDUZKKaf03WmDkV+cWrrMxi6bRa3UbrDftD6WkcZjWP8mj/2GqfXjVlJVE0trC02dJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKgxFSSoMRUkqDEVJKuasWrVq0GWQpGnDnqIkFYaiJBWGoiQVhqIkFYaiJBWGoiQV8wa584h4HfBpYC5wTmYuGmR51lRE3AU8BDwOrMzMl0XEJsBlwA7AXcBbM3PpgIo4IRFxHvB64P7MfGE3rWd9ImIOrS33A5YD78jMmwZR7vEYoW6nAO8ElnSLfSAzv9XNez+wkNa2783M70x5occpIrYFLgS2BJ4Azs7MT8+GthulbqfQp7YbWE8xIuYCnwH2BXYBDoqIXQZVnj56dWbulpkv6z6fCFydmTsDV3efZ4rzgdcNmzZSffYFdu7+HQV8dorKuLrO5+l1A/jrrv12Kz9UuwAHArt265zVff9OVyuBEzLzBcDuwNFdHWZD241UN+hT2w3y9PkVwO2ZeWdmPgZcCuw/wPJMlv2BC7qvLwDeNMCyTEhmXgs8OGzySPXZH7gwM1dl5k+AjSPiOVNT0okboW4j2R+4NDMfzcz/D9xO+/6dljLz3qGeXmY+BNwKbM0saLtR6jaSCbfdIENxa+Du8vkeRq/cTLAK+G5E/J+IOKqbtkVm3gutQYHNB1a6/hipPrOlPY+JiJ9FxHkRMb+bNmPrFhE7AC8GrmOWtd2wukGf2m6QoTinx7SZ/s7hqzLzJbTTkaMjYq9BF2gKzYb2/CywE7AbcC/wyW76jKxbRDwLuAI4LjN/N8qiM65+PerWt7YbZCjeA2xbPm8DLB5QWfoiMxd3/98PfJXWTb9v6FSk+//+wZWwL0aqz4xvz8y8LzMfz8wngC/wh9OsGVe3iHgGLTQuyswru8mzou161a2fbTfIULwB2DkidoyIdWgXQ68aYHnWSERsEBEbDn0NvBb4R1qdDu8WOxz4H4MpYd+MVJ+rgMMiYk5E7A7889Cp2kwx7Draf6K1H7S6HRgR60bEjrQbEtdPdfnGq7ubfC5wa2Z+qsya8W03Ut362XYDHSUnIvYDTqc9knNeZp46sMKsoYh4Lq13CO1Rp4sz89SI2BS4HNgO+BXwlswc7wX+gYqIS4C9gc2A+4APA1+jR326b9YzaXf4lgN/mpk3DqLc4zFC3famnX6toj2y8q6hcIiIk4AjaHc/j8vMb095occpIvYAfgj8A+2xFYAP0K69zei2G6VuB9GntnPoMEkqfKNFkgpDUZIKQ1GSCkNRkgpDUZIKQ1GSCkNR015E7BARvx1h3lYR8XdTXSbNXgMdT1FaU92rla8edDk0exiKWmMR8UpgEbBRN+lkYBntHdSXZeY/R8T5wG8y88Tu698DO9LeS70WOLobQm60/XwC2At4JvCezPxhN1LKjZm5WbfMKuAk2qtemwL/LTOv6GN1Nct5+qw1EhEbA58DDs7Ml9JGs/488FPaCMnnRsRhwPOBD5ZVX0kbz29XYHva4Kaj2RT4WWa+AjgGuCQi1h1h2d9l5suBQ4G/Wa2Kaa1lKGpN/Rtaj+/bEfFT4Nu090+fB5wKbEIbxunAzFxZ1rssMx/upl0A7DPGfh4DvgyQmdcAjwAxwrKXdv//BNgqItabcK201vL0WWtqDq0H97SxI7uBPrcDHqX19H41yjYm+hL+aOusAMjMxyMC/D7XBNhT1Jr6EW0IuCdvdkTEy7uRV74InEMbpuqSoaHVOm/phlubB7wdGOsO8jrAwd329wTWA7J/1ZAaQ1FrpPvLhG8EPhwRN0fErcApwJ/RboiclplXA38LnF1WvZY2DNkttOHi67xeHqCF73XAWcBBY92YkVaHQ4dpynV3n2/MzDMHXRZpOHuKklTYU9S0ERGfo/0t32pl+Rva0qQzFCWp8PRZkgpDUZIKQ1GSCkNRkgpDUZKKfwF+n8BTrDNK/QAAAABJRU5ErkJggg==\n",
  1220.       "text/plain": [
  1221.        "<matplotlib.figure.Figure at 0x7f97790e1978>"
  1222.       ]
  1223.      },
  1224.      "metadata": {},
  1225.      "output_type": "display_data"
  1226.     }
  1227.    ],
  1228.    "source": [
  1229.     "df.boxplot(column='distfromcentre', by='exp_bin', vert=False, figsize=(5,5))"
  1230.    ]
  1231.   },
  1232.   {
  1233.    "cell_type": "code",
  1234.    "execution_count": 100,
  1235.    "metadata": {},
  1236.    "outputs": [
  1237.     {
  1238.      "data": {
  1239.       "text/plain": [
  1240.        "exp_bin\n",
  1241.        "0    29.437325\n",
  1242.        "1    21.816693\n",
  1243.        "Name: distfromcentre, dtype: float64"
  1244.       ]
  1245.      },
  1246.      "execution_count": 100,
  1247.      "metadata": {},
  1248.      "output_type": "execute_result"
  1249.     }
  1250.    ],
  1251.    "source": [
  1252.     "df.groupby('exp_bin').distfromcentre.mean()"
  1253.    ]
  1254.   },
  1255.   {
  1256.    "cell_type": "code",
  1257.    "execution_count": 104,
  1258.    "metadata": {},
  1259.    "outputs": [
  1260.     {
  1261.      "name": "stdout",
  1262.      "output_type": "stream",
  1263.      "text": [
  1264.       "Optimization terminated successfully.\n",
  1265.       "         Current function value: 0.568095\n",
  1266.       "         Iterations 6\n",
  1267.       "                           Logit Regression Results                           \n",
  1268.       "==============================================================================\n",
  1269.       "Dep. Variable:                   case   No. Observations:                  375\n",
  1270.       "Model:                          Logit   Df Residuals:                      371\n",
  1271.       "Method:                           MLE   Df Model:                            3\n",
  1272.       "Date:                Thu, 12 Jul 2018   Pseudo R-squ.:                 0.06828\n",
  1273.       "Time:                        15:51:50   Log-Likelihood:                -213.04\n",
  1274.       "converged:                       True   LL-Null:                       -228.65\n",
  1275.       "                                        LLR p-value:                 7.617e-07\n",
  1276.       "==================================================================================\n",
  1277.       "                     coef    std err          z      P>|z|      [0.025      0.975]\n",
  1278.       "----------------------------------------------------------------------------------\n",
  1279.       "distfromcentre     0.0318      0.008      4.091      0.000       0.017       0.047\n",
  1280.       "age                0.0007      0.005      0.150      0.880      -0.009       0.010\n",
  1281.       "ever_smoked        0.1899      0.260      0.730      0.465      -0.320       0.699\n",
  1282.       "exp_bin            0.1372      0.342      0.401      0.688      -0.533       0.807\n",
  1283.       "==================================================================================\n"
  1284.      ]
  1285.     }
  1286.    ],
  1287.    "source": [
  1288.     "import statsmodels.api as sm\n",
  1289.     "import numpy as np\n",
  1290.     "from scipy import stats\n",
  1291.     "stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n",
  1292.     "y=df['case']\n",
  1293.     "X=df[['distfromcentre', 'age', 'ever_smoked', 'exp_bin']]\n",
  1294.     "logit_model=sm.Logit(y,X)\n",
  1295.     "result=logit_model.fit()\n",
  1296.     "print(result.summary())"
  1297.    ]
  1298.   },
  1299.   {
  1300.    "cell_type": "code",
  1301.    "execution_count": 105,
  1302.    "metadata": {},
  1303.    "outputs": [
  1304.     {
  1305.      "name": "stdout",
  1306.      "output_type": "stream",
  1307.      "text": [
  1308.       "                    2.5%     97.5%        OR\n",
  1309.       "distfromcentre  1.016707  1.048178  1.032323\n",
  1310.       "age             0.991344  1.010191  1.000723\n",
  1311.       "ever_smoked     0.726397  2.012467  1.209070\n",
  1312.       "exp_bin         0.586808  2.242103  1.147033\n"
  1313.      ]
  1314.     }
  1315.    ],
  1316.    "source": [
  1317.     "params = result.params\n",
  1318.     "conf = result.conf_int()\n",
  1319.     "conf['OR'] = params\n",
  1320.     "conf.columns = ['2.5%', '97.5%', 'OR']\n",
  1321.     "print(np.exp(conf))"
  1322.    ]
  1323.   },
  1324.   {
  1325.    "cell_type": "code",
  1326.    "execution_count": null,
  1327.    "metadata": {},
  1328.    "outputs": [],
  1329.    "source": []
  1330.   }
  1331.  ],
  1332.  "metadata": {
  1333.   "kernelspec": {
  1334.    "display_name": "Python 3",
  1335.    "language": "python",
  1336.    "name": "python3"
  1337.   },
  1338.   "language_info": {
  1339.    "codemirror_mode": {
  1340.     "name": "ipython",
  1341.     "version": 3
  1342.    },
  1343.    "file_extension": ".py",
  1344.    "mimetype": "text/x-python",
  1345.    "name": "python",
  1346.    "nbconvert_exporter": "python",
  1347.    "pygments_lexer": "ipython3",
  1348.    "version": "3.5.4"
  1349.   }
  1350.  },
  1351.  "nbformat": 4,
  1352.  "nbformat_minor": 2
  1353. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top