Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import pymc3 as pm
- from jupyterthemes import jtplot
- from collections import namedtuple
- import os
- import time
- from datetime import datetime
- import pickle
- import theano
- plt.style.use("ggplot")
- print(f"Running on PyMC3 v{pm.__version__}")
- print("")
- ##jtplot.style()
- pd.options.mode.chained_assignment = None # default="warn"
- measles_data_all = pd.read_csv("hagelloch_cleaned.csv",
- sep=" ")
- ## remove useless columns
- patients_data = measles_data_all[["PN", "FN", "HN", "AGE", "SEX", "CL",
- "HNX", "HNY", "IFTO", "PRO_DAYS",
- "ERU_DAYS", "DEAD_DAYS"]].copy()
- patients_data.columns = ["Patient_ID",
- "Family_ID",
- "Household_ID",
- "Age",
- "Sex",
- "Classroom",
- "Loc-x",
- "Loc-y",
- "Infector",
- "Prodromal_Onset",
- "Eruption_Onset",
- "Death_Day"]
- death_mask = ~(patients_data["Death_Day"] == -1)
- d = 3 # time still infectious after eruption
- patients_data["Died"] = death_mask
- patients_data["Died"] = patients_data["Died"].astype(np.int)
- patients_data["Infectious_Onset"] = patients_data["Prodromal_Onset"] -1
- patients_data["Infectious_End"] = np.where(death_mask,
- patients_data["Death_Day"],
- (patients_data["Eruption_Onset"] + d))
- patients_data["Infectious_Duration"] = (patients_data["Infectious_End"] -
- patients_data["Infectious_Onset"])
- print(patients_data.head(10))
- print("hi")
- def lognormal_parameters(desired_mean, desired_variance):
- mu = np.log(desired_mean / np.sqrt(1 + desired_variance / (desired_mean**2)))
- sigma = np.sqrt(np.log(1 + desired_variance/(desired_mean**2)))
- return (mu, sigma)
- class Observed_Outbreak:
- def __init__(self, data_frame, duration=None):
- self.data_frame = data_frame.copy()
- self.n_patients = self.data_frame.shape[0]
- self.slabs = {}
- if not (duration is None):
- self.duration = duration
- self.duration_bc = self.duration[None, :]
- def impute_latencies(self, latency_params):
- latencies = np.random.lognormal(*latency_params,
- self.n_patients).astype(np.int)
- self.data_frame["Latency_Onset"] = self.data_frame["Infectious_Onset"] - latencies
- earliest_time = self.data_frame["Latency_Onset"].min()
- latest_time = self.data_frame["Infectious_End"].max()
- self.duration = np.arange(earliest_time, latest_time+1)
- self.duration_bc = self.duration[None, :]
- def get_global_infectious(self, valid_days=None):
- infectious_onsets_bc = self.data_frame["Infectious_Onset"][:, None]
- infectious_ends_bc = self.data_frame["Infectious_End"][:, None]
- self.global_infectious_array = ((infectious_onsets_bc <= self.duration_bc) &
- (self.duration_bc < infectious_ends_bc)).T
- self.global_infectious = self.global_infectious_array.sum(axis=1)
- if valid_days is None:
- self.valid_days = self.global_infectious.copy() != 0
- else:
- self.valid_days = valid_days
- self.n_days = sum(self.valid_days)
- self.global_infectious_array = self.global_infectious_array[self.valid_days, :]
- self.global_infectious = self.global_infectious[self.valid_days]
- self.slabs["Community"] = np.broadcast_to(self.global_infectious[:, None],
- (self.n_days, self.n_patients))
- def get_susceptible_states(self):
- latency_onsets_bc = self.data_frame["Latency_Onset"][:, None]
- self.states_before = ((self.duration_bc < latency_onsets_bc).T).astype(np.int)
- self.states_after = np.roll(self.states_before, -1, axis=0)
- self.states_after[-1, :] = 0
- self.states_before = self.states_before[self.valid_days, :]
- self.states_after = self.states_after[self.valid_days, :]
- def get_patient_distances(self):
- self.distance_array = np.zeros((self.n_patients, self.n_patients))
- for first in range(self.n_patients):
- first_patient = self.data_frame.iloc[first]
- first_x = first_patient["Loc-x"]
- first_y = first_patient["Loc-y"]
- dx2 = (first_x - self.data_frame["Loc-x"])**2
- dy2 = (first_y - self.data_frame["Loc-y"])**2
- distance_1d = np.sqrt(dx2 + dy2)
- self.distance_array[first, :] = distance_1d
- def get_distance_infectious(self):
- distance_slab = np.zeros((self.n_days, self.n_patients))
- for today in range(self.n_days):
- global_infectious_today = self.global_infectious_array[today, :]
- for patient in range(self.n_patients):
- distances_from_patient = self.distance_array[patient, :]
- distances_exped = np.exp(-distances_from_patient)
- product = global_infectious_today * distances_exped
- distance_slab[today, patient] = product.sum()
- self.slabs["Distance"] = distance_slab
- def get_group_infectious(self, group_name, saturated=False):
- output_slab = np.zeros((self.n_days, self.n_patients))
- for group_id, group in self.data_frame.groupby(group_name):
- ## create an Observed Outbreak class for this particular group,
- ## but using the duration from the entire outbreak
- group_suboutbreak = Observed_Outbreak(group, self.duration)
- group_suboutbreak.get_global_infectious(self.valid_days)
- ## get number of infectious in this particular group for each day
- group_infectious = group_suboutbreak.global_infectious
- group_infectious_bc = np.broadcast_to(group_infectious[:, None],
- (self.n_days, self.n_patients))
- ## nonzero entries when group_id matches the column's (i.e. patient's) group
- group_id_mask = (self.data_frame[group_name] == group_id).values
- subslab = group_infectious_bc * group_id_mask
- output_slab += subslab.astype(np.int)
- if saturated:
- output_slab = output_slab.astype(np.bool).astype(np.int)
- self.slabs[group_name] = output_slab
- def get_classroom_infectious(self, classroom, infectious_override=False):
- data_frame_new = self.data_frame.copy()
- if infectious_override:
- data_frame_new["Infectious_End"] = (data_frame_new["Infectious_Onset"] +
- infectious_override)
- classroom_frame = data_frame_new[data_frame_new["Classroom"] == classroom]
- classroom_suboutbreak = Observed_Outbreak(classroom_frame, self.duration)
- classroom_suboutbreak.get_global_infectious(self.valid_days)
- classroom_infectious = classroom_suboutbreak.global_infectious
- classroom_infectious_bc = np.broadcast_to(classroom_infectious[:, None],
- (self.n_days, self.n_patients))
- classroom_id_mask = (self.data_frame["Classroom"] == classroom).values
- classroom_slab = classroom_infectious_bc * classroom_id_mask
- self.slabs[f"Classroom_{classroom}"] = classroom_slab
- latency_mean = 12
- latency_var = 5.433
- latency_parameters = lognormal_parameters(latency_mean, latency_var)
- stay_at_home = 1
- saturated = False
- n_samples = 500
- n_tune = 1500
- n_chains = 4
- model_name = "model_sah_"+str(int(stay_at_home))+"_sat_"+str(int(saturated))+"_"
- outbreak = Observed_Outbreak(patients_data)
- outbreak.impute_latencies(latency_parameters)
- outbreak.get_global_infectious()
- outbreak.get_susceptible_states()
- outbreak.get_group_infectious("Household_ID")
- outbreak.get_group_infectious("Family_ID")
- outbreak.get_group_infectious("Sex")
- outbreak.get_classroom_infectious(1, infectious_override=stay_at_home)
- outbreak.get_classroom_infectious(2, infectious_override=stay_at_home)
- community_slab = theano.shared(outbreak.slabs["Community"])
- c1_slab = theano.shared(outbreak.slabs["Classroom_1"])
- c2_slab = theano.shared(outbreak.slabs["Classroom_2"])
- household_slab = theano.shared(outbreak.slabs["Household_ID"])
- family_slab = theano.shared(outbreak.slabs["Family_ID"])
- sex_slab = theano.shared(outbreak.slabs["Sex"])
- s_before = theano.shared(outbreak.states_before)
- s_after = theano.shared(outbreak.states_after)
- with pm.Model() as this_model:
- rv_q_community = pm.Uniform("rv_q_community")
- ##rv_q_class_0 = pm.Uniform("rv_q_class_0")
- rv_q_class_1 = pm.Uniform("rv_q_class_1")
- rv_q_class_2 = pm.Uniform("rv_q_class_2")
- rv_q_household = pm.Uniform("rv_q_household")
- rv_q_family = pm.Uniform("rv_q_family")
- rv_q_sex = pm.Uniform("rv_q_sex")
- probabilities = ((rv_q_community**community_slab) *
- (rv_q_class_1**c1_slab) *
- (rv_q_class_2**c2_slab) *
- (rv_q_household**household_slab) *
- (rv_q_family**family_slab) *
- (rv_q_sex**sex_slab))
- ## probability is zero if already not susceptible
- probabilities = s_before * probabilities
- s_observed = pm.Bernoulli("s_observed", p=probabilities, observed=s_after)
- print("Model created.")
- marginalization_mc = 100
- for i in range(marginalization_mc):
- print(i)
- outbreak = Observed_Outbreak(patients_data)
- outbreak.impute_latencies(latency_parameters)
- outbreak.get_global_infectious()
- outbreak.get_susceptible_states()
- outbreak.get_group_infectious("Household_ID")
- outbreak.get_group_infectious("Family_ID")
- outbreak.get_group_infectious("Sex")
- outbreak.get_classroom_infectious(1, infectious_override=stay_at_home)
- outbreak.get_classroom_infectious(2, infectious_override=stay_at_home)
- community_slab.set_value(outbreak.slabs["Community"])
- c1_slab.set_value(outbreak.slabs["Classroom_1"])
- c2_slab.set_value(outbreak.slabs["Classroom_2"])
- household_slab.set_value(outbreak.slabs["Household_ID"])
- family_slab.set_value(outbreak.slabs["Family_ID"])
- sex_slab.set_value(outbreak.slabs["Sex"])
- s_before.set_value(outbreak.states_before)
- s_after.set_value(outbreak.states_after)
- with this_model:
- this_trace = pm.sample(draws=n_samples, cores=n_chains,
- chains=n_chains, tune=n_tune)
- print(pm.summary(this_trace))
- now = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
- dump_filename = "models/"+model_name+now+".pkl"
- os.makedirs(os.path.dirname(dump_filename), exist_ok=True)
- model_parameters = {"latency_parameters": latency_parameters,
- "stay_at_home": stay_at_home,
- "saturated": saturated,
- "n_samples": n_samples,
- "n_tunes": n_tune,
- "n_chains": n_chains}
- with open(dump_filename, "wb") as the_file:
- pickle.dump({"model_parameters": model_parameters,
- "latencies": outbreak.data_frame["Latency_Onset"],
- "model": this_model,
- "trace": this_trace}, the_file)
- print(f"Model successfully dumped to file: {dump_filename}.")
- print("Marginalization finished.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement