{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running on PyMC3 v3.5\n",
"\n",
" Patient_ID Family_ID Household_ID Age Sex Classroom Loc-x Loc-y \\\n",
"0 184 51 6 13.0 0 2 73 80 \n",
"1 173 9 35 6.0 1 0 85 43 \n",
"2 177 17 6 8.0 2 1 73 80 \n",
"3 178 17 6 4.0 1 0 73 80 \n",
"4 174 9 35 3.0 2 0 85 43 \n",
"5 45 48 51 7.0 1 1 30 22 \n",
"6 183 4 6 4.0 1 0 73 80 \n",
"7 175 9 35 2.0 1 0 85 43 \n",
"8 181 4 6 10.0 2 2 73 80 \n",
"9 42 33 45 4.0 2 0 68 7 \n",
"\n",
" Infector Prodromal_Onset Eruption_Onset Death_Day Died \\\n",
"0 0 1 8 -1 0 \n",
"1 0 3 5 -1 0 \n",
"2 184 9 13 -1 0 \n",
"3 184 9 13 -1 0 \n",
"4 0 10 10 -1 0 \n",
"5 184 13 15 -1 0 \n",
"6 184 13 17 -1 0 \n",
"7 173 14 17 -1 0 \n",
"8 184 15 19 -1 0 \n",
"9 178 17 21 -1 0 \n",
"\n",
" Infectious_Onset Infectious_End Infectious_Duration \n",
"0 0 11 11 \n",
"1 2 8 6 \n",
"2 8 16 8 \n",
"3 8 16 8 \n",
"4 9 13 4 \n",
"5 12 18 6 \n",
"6 12 20 8 \n",
"7 13 20 7 \n",
"8 14 22 8 \n",
"9 16 24 8 \n",
"hi\n"
]
}
],
"source": [
"import csv\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import pymc3 as pm\n",
"from jupyterthemes import jtplot\n",
"from collections import namedtuple\n",
"import os\n",
"import time\n",
"from datetime import datetime\n",
"import pickle\n",
"%matplotlib inline\n",
"##plt.style.use(\"ggplot\")\n",
"print(f\"Running on PyMC3 v{pm.__version__}\")\n",
"print(\"\")\n",
"jtplot.style()\n",
"\n",
"pd.options.mode.chained_assignment = None  # default=\"warn\"\n",
"\n",
"measles_data_all = pd.read_csv(\"hagelloch_cleaned.csv\",\n",
"                               sep=\" \")\n",
"\n",
"## remove useless columns\n",
"patients_data = measles_data_all[[\"PN\", \"FN\", \"HN\", \"AGE\", \"SEX\", \"CL\",\n",
"                                  \"HNX\", \"HNY\", \"IFTO\", \"PRO_DAYS\",\n",
"                                  \"ERU_DAYS\", \"DEAD_DAYS\"]].copy()\n",
"\n",
"patients_data.columns = [\"Patient_ID\",\n",
"                         \"Family_ID\",\n",
"                         \"Household_ID\",\n",
"                         \"Age\",\n",
"                         \"Sex\",\n",
"                         \"Classroom\",\n",
"                         \"Loc-x\",\n",
"                         \"Loc-y\",\n",
"                         \"Infector\",\n",
"                         \"Prodromal_Onset\",\n",
"                         \"Eruption_Onset\",\n",
"                         \"Death_Day\"]\n",
"\n",
"\n",
"death_mask = ~(patients_data[\"Death_Day\"] == -1)\n",
"\n",
"d = 3  # time still infectious after eruption\n",
"\n",
"patients_data[\"Died\"] = death_mask\n",
"patients_data[\"Died\"] = patients_data[\"Died\"].astype(np.int)\n",
"patients_data[\"Infectious_Onset\"] = patients_data[\"Prodromal_Onset\"] - 1\n",
"patients_data[\"Infectious_End\"] = np.where(death_mask,\n",
"                                           patients_data[\"Death_Day\"],\n",
"                                           (patients_data[\"Eruption_Onset\"] + d))\n",
"patients_data[\"Infectious_Duration\"] = (patients_data[\"Infectious_End\"] -\n",
"                                        patients_data[\"Infectious_Onset\"])\n",
"\n",
"print(patients_data.head(10))\n",
"print(\"hi\")"
]
},
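{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Added sketch (not part of the original run):* a quick sanity check of the prepared frame, plotting how many patients are infectious on each day using only the `Infectious_Onset`/`Infectious_End` columns built above. The names `days`, `onsets`, `ends` and `infectious_per_day` are illustrative; the day convention (`onset <= day < end`) mirrors `get_global_infectious` defined below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## sanity check (added): epidemic curve of infectious patients per day\n",
"days = np.arange(patients_data[\"Infectious_Onset\"].min(),\n",
"                 patients_data[\"Infectious_End\"].max() + 1)\n",
"onsets = patients_data[\"Infectious_Onset\"].values[:, None]\n",
"ends = patients_data[\"Infectious_End\"].values[:, None]\n",
"## (n_patients, n_days) boolean array: patient i is infectious on day j\n",
"infectious_per_day = ((onsets <= days[None, :]) & (days[None, :] < ends)).sum(axis=0)\n",
"\n",
"plt.plot(days, infectious_per_day)\n",
"plt.xlabel(\"Day\")\n",
"plt.ylabel(\"Number infectious\")\n",
"plt.title(\"Infectious patients per day\")\n",
"plt.show()"
]
},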
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def lognormal_parameters(desired_mean, desired_variance):\n",
"    mu = np.log(desired_mean / np.sqrt(1 + desired_variance / (desired_mean**2)))\n",
"    sigma = np.sqrt(np.log(1 + desired_variance/(desired_mean**2)))\n",
"    return (mu, sigma)\n",
"\n",
"class Observed_Outbreak:\n",
"    def __init__(self, data_frame, duration=None):\n",
"        self.data_frame = data_frame.copy()\n",
"        self.n_patients = self.data_frame.shape[0]\n",
"        self.slabs = {}\n",
"        if not (duration is None):\n",
"            self.duration = duration\n",
"            self.duration_bc = self.duration[None, :]\n",
"\n",
"    def impute_latencies(self, latency_params):\n",
"        latencies = np.random.lognormal(*latency_params,\n",
"                                        self.n_patients).astype(np.int)\n",
"        self.data_frame[\"Latency_Onset\"] = self.data_frame[\"Infectious_Onset\"] - latencies\n",
"        earliest_time = self.data_frame[\"Latency_Onset\"].min()\n",
"        latest_time = self.data_frame[\"Infectious_End\"].max()\n",
"        self.duration = np.arange(earliest_time, latest_time+1)\n",
"        self.duration_bc = self.duration[None, :]\n",
"\n",
"    def get_global_infectious(self, valid_days=None):\n",
"        infectious_onsets_bc = self.data_frame[\"Infectious_Onset\"][:, None]\n",
"        infectious_ends_bc = self.data_frame[\"Infectious_End\"][:, None]\n",
"        self.global_infectious_array = ((infectious_onsets_bc <= self.duration_bc) &\n",
"                                        (self.duration_bc < infectious_ends_bc)).T\n",
"        self.global_infectious = self.global_infectious_array.sum(axis=1)\n",
"        if valid_days is None:\n",
"            self.valid_days = self.global_infectious.copy() != 0\n",
"        else:\n",
"            self.valid_days = valid_days\n",
"        self.n_days = sum(self.valid_days)\n",
"        self.global_infectious_array = self.global_infectious_array[self.valid_days, :]\n",
"        self.global_infectious = self.global_infectious[self.valid_days]\n",
"        self.slabs[\"Community\"] = np.broadcast_to(self.global_infectious[:, None],\n",
"                                                  (self.n_days, self.n_patients))\n",
"\n",
"    def get_susceptible_states(self):\n",
"        latency_onsets_bc = self.data_frame[\"Latency_Onset\"][:, None]\n",
"        self.states_before = ((self.duration_bc < latency_onsets_bc).T).astype(np.int)\n",
"        self.states_after = np.roll(self.states_before, -1, axis=0)\n",
"        self.states_after[-1, :] = 0\n",
"        self.states_before = self.states_before[self.valid_days, :]\n",
"        self.states_after = self.states_after[self.valid_days, :]\n",
"\n",
"    def get_patient_distances(self):\n",
"        self.distance_array = np.zeros((self.n_patients, self.n_patients))\n",
"        for first in range(self.n_patients):\n",
"            first_patient = self.data_frame.iloc[first]\n",
"            first_x = first_patient[\"Loc-x\"]\n",
"            first_y = first_patient[\"Loc-y\"]\n",
"\n",
"            dx2 = (first_x - self.data_frame[\"Loc-x\"])**2\n",
"            dy2 = (first_y - self.data_frame[\"Loc-y\"])**2\n",
"            distance_1d = np.sqrt(dx2 + dy2)\n",
"            self.distance_array[first, :] = distance_1d\n",
"\n",
"    def get_distance_infectious(self):\n",
"        distance_slab = np.zeros((self.n_days, self.n_patients))\n",
"        for today in range(self.n_days):\n",
"            global_infectious_today = self.global_infectious_array[today, :]\n",
"            for patient in range(self.n_patients):\n",
"                distances_from_patient = self.distance_array[patient, :]\n",
"                distances_exped = np.exp(-distances_from_patient)\n",
"                product = global_infectious_today * distances_exped\n",
"                distance_slab[today, patient] = product.sum()\n",
"        self.slabs[\"Distance\"] = distance_slab\n",
"\n",
"    def get_group_infectious(self, group_name, saturated=False):\n",
"        output_slab = np.zeros((self.n_days, self.n_patients))\n",
"        for group_id, group in self.data_frame.groupby(group_name):\n",
"            ## create an Observed Outbreak class for this particular group,\n",
"            ## but using the duration from the entire outbreak\n",
"            group_suboutbreak = Observed_Outbreak(group, self.duration)\n",
"            group_suboutbreak.get_global_infectious(self.valid_days)\n",
"            ## get number of infectious in this particular group for each day\n",
"            group_infectious = group_suboutbreak.global_infectious\n",
"            group_infectious_bc = np.broadcast_to(group_infectious[:, None],\n",
"                                                  (self.n_days, self.n_patients))\n",
"            ## nonzero entries when group_id matches the column's (i.e. patient's) group\n",
"            group_id_mask = (self.data_frame[group_name] == group_id).values\n",
"            subslab = group_infectious_bc * group_id_mask\n",
"            output_slab += subslab.astype(np.int)\n",
"        if saturated:\n",
"            output_slab = output_slab.astype(np.bool).astype(np.int)\n",
"        self.slabs[group_name] = output_slab\n",
"\n",
"    def get_classroom_infectious(self, classroom, infectious_override=False):\n",
"        data_frame_new = self.data_frame.copy()\n",
"        if infectious_override:\n",
"            data_frame_new[\"Infectious_End\"] = (data_frame_new[\"Infectious_Onset\"] +\n",
"                                                infectious_override)\n",
"\n",
"        classroom_frame = data_frame_new[data_frame_new[\"Classroom\"] == classroom]\n",
"        classroom_suboutbreak = Observed_Outbreak(classroom_frame, self.duration)\n",
"        classroom_suboutbreak.get_global_infectious(self.valid_days)\n",
"        classroom_infectious = classroom_suboutbreak.global_infectious\n",
"        classroom_infectious_bc = np.broadcast_to(classroom_infectious[:, None],\n",
"                                                  (self.n_days, self.n_patients))\n",
"        classroom_id_mask = (self.data_frame[\"Classroom\"] == classroom).values\n",
"        classroom_slab = classroom_infectious_bc * classroom_id_mask\n",
"        self.slabs[f\"Classroom_{classroom}\"] = classroom_slab\n",
"\n"
]
},
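{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Added sketch (not executed in the original):* two quick checks of the helpers above. First, `lognormal_parameters` should round-trip the requested mean and variance when its output is fed to `np.random.lognormal`. Second, the distance-based exposure slab (`get_patient_distances` / `get_distance_infectious`) is defined but never used in the sampling loop below; this shows how it would be built. The names `mu_chk`, `sigma_chk`, `draws_chk` and `demo_outbreak` are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## check (added): samples should have mean ~12 and variance ~5.433\n",
"mu_chk, sigma_chk = lognormal_parameters(12, 5.433)\n",
"draws_chk = np.random.lognormal(mu_chk, sigma_chk, 100000)\n",
"print(draws_chk.mean(), draws_chk.var())\n",
"\n",
"## sketch (added): building the otherwise unused distance-weighted exposure slab\n",
"demo_outbreak = Observed_Outbreak(patients_data)\n",
"demo_outbreak.impute_latencies(lognormal_parameters(12, 5.433))\n",
"demo_outbreak.get_global_infectious()\n",
"demo_outbreak.get_patient_distances()\n",
"demo_outbreak.get_distance_infectious()\n",
"print(demo_outbreak.slabs[\"Distance\"].shape)  ## (n_days, n_patients)"
]
},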
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Auto-assigning NUTS sampler...\n",
"Initializing NUTS using jitter+adapt_diag...\n",
"Multiprocess sampling (4 chains in 4 jobs)\n",
"NUTS: [rv_q_sex, rv_q_family, rv_q_household, rv_q_class_2, rv_q_class_1, rv_q_community]\n",
"Sampling 4 chains: 100%|██████████| 8000/8000 [03:28<00:00, 15.39draws/s]\n",
"D:\\Anaconda3\\lib\\site-packages\\mkl_fft\\_numpy_fft.py:1044: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
" output = mkl_fft.rfftn_numpy(a, s, axes)\n",
"There was 1 divergence after tuning. Increase `target_accept` or reparameterize.\n",
"There were 2 divergences after tuning. Increase `target_accept` or reparameterize.\n",
"The acceptance probability does not match the target. It is 0.6616778933467038, but should be close to 0.8. Try to increase the number of tuning steps.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" mean sd mc_error hpd_2.5 hpd_97.5 n_eff \\\n",
"rv_q_community 0.994322 0.000972 0.000038 0.992559 0.996216 505.819056 \n",
"rv_q_class_1 0.882522 0.058837 0.001676 0.768720 0.984137 1196.684376 \n",
"rv_q_class_2 0.946839 0.018331 0.000500 0.914268 0.982833 1425.523988 \n",
"rv_q_household 0.892061 0.052584 0.001962 0.790368 0.984600 697.508660 \n",
"rv_q_family 0.772249 0.105571 0.004480 0.601709 0.998689 516.690902 \n",
"rv_q_sex 0.996833 0.002024 0.000079 0.993130 0.999992 544.947321 \n",
"\n",
" Rhat \n",
"rv_q_community 1.004532 \n",
"rv_q_class_1 1.000721 \n",
"rv_q_class_2 1.002293 \n",
"rv_q_household 1.002365 \n",
"rv_q_family 1.003017 \n",
"rv_q_sex 1.006037 \n",
"Model successfully dumped to file: models/model_sah_1_sat_0_2018-08-23T15-15-18.pkl.\n",
"1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Auto-assigning NUTS sampler...\n",
"Initializing NUTS using jitter+adapt_diag...\n",
"Multiprocess sampling (4 chains in 4 jobs)\n",
"NUTS: [rv_q_sex, rv_q_family, rv_q_household, rv_q_class_2, rv_q_class_1, rv_q_community]\n",
"Sampling 4 chains: 90%|█████████ | 7229/8000 [02:53<00:16, 46.84draws/s]"
]
}
],
"source": [
"latency_mean = 12\n",
"latency_var = 5.433\n",
"latency_parameters = lognormal_parameters(latency_mean, latency_var)\n",
"stay_at_home = 1\n",
"saturated = False\n",
"\n",
"n_samples = 500\n",
"n_tune = 1500\n",
"n_chains = 4\n",
"\n",
"model_name = \"model_sah_\"+str(int(stay_at_home))+\"_sat_\"+str(int(saturated))+\"_\"\n",
"marginalization_mc = 66\n",
"for i in range(marginalization_mc):\n",
"    print(i)\n",
"    outbreak = Observed_Outbreak(patients_data)\n",
"    outbreak.impute_latencies(latency_parameters)\n",
"    outbreak.get_global_infectious()\n",
"    outbreak.get_susceptible_states()\n",
"\n",
"    outbreak.get_group_infectious(\"Household_ID\")\n",
"    outbreak.get_group_infectious(\"Family_ID\")\n",
"    outbreak.get_group_infectious(\"Sex\")\n",
"    outbreak.get_classroom_infectious(1, infectious_override=stay_at_home)\n",
"    outbreak.get_classroom_infectious(2, infectious_override=stay_at_home)\n",
"\n",
"    community_slab = outbreak.slabs[\"Community\"]\n",
"    c1_slab = outbreak.slabs[\"Classroom_1\"]\n",
"    c2_slab = outbreak.slabs[\"Classroom_2\"]\n",
"    household_slab = outbreak.slabs[\"Household_ID\"]\n",
"    family_slab = outbreak.slabs[\"Family_ID\"]\n",
"    sex_slab = outbreak.slabs[\"Sex\"]\n",
"    s_before = outbreak.states_before\n",
"    s_after = outbreak.states_after\n",
"\n",
"    this_model = pm.Model()\n",
"    with this_model:\n",
"        rv_q_community = pm.Uniform(\"rv_q_community\")\n",
"        ##rv_q_class_0 = pm.Uniform(\"rv_q_class_0\")\n",
"        rv_q_class_1 = pm.Uniform(\"rv_q_class_1\")\n",
"        rv_q_class_2 = pm.Uniform(\"rv_q_class_2\")\n",
"        rv_q_household = pm.Uniform(\"rv_q_household\")\n",
"        rv_q_family = pm.Uniform(\"rv_q_family\")\n",
"        rv_q_sex = pm.Uniform(\"rv_q_sex\")\n",
"\n",
"        probabilities = ((rv_q_community**community_slab) *\n",
"                         (rv_q_class_1**c1_slab) *\n",
"                         (rv_q_class_2**c2_slab) *\n",
"                         (rv_q_household**household_slab) *\n",
"                         (rv_q_family**family_slab) *\n",
"                         (rv_q_sex**sex_slab))\n",
"\n",
"        ## probability is zero if already not susceptible\n",
"        probabilities = s_before * probabilities\n",
"        s_observed = pm.Bernoulli(\"s_observed\", p=probabilities, observed=s_after)\n",
"        this_trace = pm.sample(draws=n_samples, cores=n_chains,\n",
"                               chains=n_chains, tune=n_tune)\n",
"    pm.traceplot(this_trace)\n",
"    ##print(\"Likelihood sampled using MCMC NUTS:\")\n",
"    ##plt.show()\n",
"    print(pm.summary(this_trace))\n",
"\n",
"\n",
"    now = datetime.now().strftime(\"%Y-%m-%dT%H-%M-%S\")\n",
"    dump_filename = \"models/\"+model_name+now+\".pkl\"\n",
"    os.makedirs(os.path.dirname(dump_filename), exist_ok=True)\n",
"    model_parameters = {\"latency_parameters\": latency_parameters,\n",
"                        \"stay_at_home\": stay_at_home,\n",
"                        \"saturated\": saturated,\n",
"                        \"n_samples\": n_samples,\n",
"                        \"n_tunes\": n_tune,\n",
"                        \"n_chains\": n_chains}\n",
"\n",
"    with open(dump_filename, \"wb\") as the_file:\n",
"        pickle.dump({\"model_parameters\": model_parameters,\n",
"                     \"latencies\": outbreak.data_frame[\"Latency_Onset\"],\n",
"                     \"model\": this_model,\n",
"                     \"trace\": this_trace}, the_file)\n",
"\n",
"    print(f\"Model successfully dumped to file: {dump_filename}.\")\n",
"\n",
"print(\"Marginalization finished.\")"
]
},
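{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Added sketch (assumes the pickle files written by the loop above exist on disk):* reload the dumped runs and pool the posterior draws across the marginalization repetitions. The glob pattern mirrors the `dump_filename` format used above; `dump_files`, `traces` and `pooled_q_community` are illustrative names."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"\n",
"## load every dump written with the current stay_at_home/saturated settings\n",
"dump_files = sorted(glob.glob(\"models/\"+model_name+\"*.pkl\"))\n",
"traces = []\n",
"for fname in dump_files:\n",
"    with open(fname, \"rb\") as the_file:\n",
"        dump = pickle.load(the_file)\n",
"    traces.append(dump[\"trace\"])\n",
"\n",
"## pool posterior draws for one escape probability across runs\n",
"if traces:\n",
"    pooled_q_community = np.concatenate([t[\"rv_q_community\"] for t in traces])\n",
"    print(len(dump_files), pooled_q_community.mean())"
]
},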
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}