library(magrittr)
library(bayesplot)
library(flextable)
library(corrplot)
library(truncdist)
library(rstanarm)
library(brms)
library(triangulr)
library(tidyverse)
library(ggplot2)
library(viridis)
Analysis for ‘A Multi-Pathogen Exposure Model for Children in Developing Communities’
Preamble
Load in required packages:
Read in data:
= read_csv("https://myweb.uiowa.edu/dksewell/public/haiti-soil.csv")
soil = read_csv("https://myweb.uiowa.edu/dksewell/public/haiti-behavioral.csv") behavior
Look at summary statistics
%>%
soil select(-SiteID) %>%
pivot_longer(cols = everything(),
names_to = "Pathogen",
values_to = "log10conc") %>%
group_by(Pathogen) %>%
summarize(Detection = sum(!is.na(log10conc)),
`Mean concentration` = mean(10^na.omit(log10conc)),
`Median concentration` = median(10^na.omit(log10conc)),
`Min concentration` = min(10^na.omit(log10conc)),
`Max concentration` = max(10^na.omit(log10conc))) %>%
flextable() %>%
colformat_double(digits = 0)
Pathogen | Detection | Mean concentration | Median concentration | Min concentration | Max concentration |
---|---|---|---|---|---|
Aeromonas | 5 | 5,478 | 130 | 85 | 19,938 |
Cholera | 9 | 484,112 | 477,807 | 111,195 | 873,786 |
EAEC aaic | 13 | 478,779 | 934 | 455 | 6,200,779 |
EAEC aata | 2 | 206,474 | 206,474 | 119,515 | 293,432 |
EPEC bfpa | 14 | 10,996 | 2,368 | 210 | 75,628 |
EPEC eae | 38 | 306,567 | 3,928 | 689 | 6,947,445 |
ETEC LT | 22 | 18,851 | 644 | 79 | 385,419 |
%>%
behavior group_by(age) %>%
summarize(`Total time observed` = sum(time_min),
`Geophagy/hr` = sum(geophagy)/sum(time_min) * 60,
`Mouth-to-hand/hr` = sum(mouth_to_hand)/sum(time_min) * 60,
`Mouth-to-object/hr` = sum(mouth_to_object)/sum(time_min) * 60,
`Hand-to-object/hr` = sum(hand_to_object)/sum(time_min) * 60,
`Hand-to-soil/hr` = sum(hand_to_soil)/sum(time_min) * 60) %>%
flextable() %>%
colformat_double(digits = 1)
age | Total time observed | Geophagy/hr | Mouth-to-hand/hr | Mouth-to-object/hr | Hand-to-object/hr | Hand-to-soil/hr |
---|---|---|---|---|---|---|
B | 356.4 | 0.7 | 17.5 | 4.4 | 20.2 | 10.4 |
C | 1,859.9 | 0.1 | 8.5 | 1.9 | 26.6 | 12.0 |
T | 1,069.6 | 0.1 | 11.2 | 2.2 | 19.2 | 9.3 |
Soil pathogen concentration modeling
Letting \(z_{ij}\) denote the binary indicator of the presence of pathogen \(j\) for subject \(i\) and \(y_{ij}\) its \(log_{10}\) concentration for \(i=1,\ldots,N\) and \(j=1,\ldots,J\), we wish to model
\[\begin{align*} \pi\left( \{z_{ij},y_{ij}\}_{j=1}^J \right). \end{align*}\] We fix \(\pi(y_{ij}|z_{ij}=0) = \delta_0(y_{ij})\), i.e., the log_10 concentration distribution is a point mass on 0 if not present. Zero is selected for computational convenience and is suitable since all \(y_{ij}|z_{ij}=1>0\) in our dataset.
We accomplish this joint modeling through telescoping out the likelihood: \[\begin{align*} & \pi\left( \{z_{ij}\}_{j=1}^J, \{y_{ij}\}_{j:z_{ij}=1} \right) &\\ & = \pi(z_{i1})\pi(y_{i1}|z_{i1})\prod_{j=2}^J \pi(z_{ij}|y_{i1},\ldots,y_{i(j-1)})\pi(y_{ij}|z_{ij},y_{i1},\ldots,y_{i(j-1)}). \end{align*}\] Note that this decomposition assumes that all the information contained in \(z_{ik}\) is also contained in \(y_{ik}\), and so \(y_{ij}\), \(j>k\), is conditionally independent of \(z_{ik}\) given \(y_{ik}\). We model these distributions using generalized linear models (GLMs) using \(\{y_{ik}\}_{k<j}\) as covariates for predicting both \(z_{ij}\) and \(y_{ij}\).
Model fitting
# Create the "z's", i.e., the presence/absence variables, and
# set log10 conc of non-detects to zero.
%<>%
soil select(-SiteID) %>%
mutate_all(function(x) ifelse(is.na(x),0,x)) %>%
bind_cols(soil %>%
select(-SiteID) %>%
mutate_all(function(x) ifelse(is.na(x),0, 1)) %>%
rename_all(function(x) paste0(x,"_binary"))) %>%
::clean_names()
janitor
# Get pathogen names
=
path_names %>%
soil select(-contains("_binary")) %>%
colnames()
# Create objects to store model fits
=
z_fits =
y_fits list()
# Look at order of prevalence
=
n_pos %>%
soil select(all_of(path_names)) %>%
as.matrix() %>%
apply(2,function(x) sum(x > 0))
EAEC aata only had 2 positive detects, not enough to statistically identify associations with any other pathogens.
$eaec_aata =
z_fitsstan_glm(eaec_aata_binary ~ 1,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$eaec_aata)
Model Info:
function: stan_glm
family: binomial [logit]
formula: eaec_aata_binary ~ 1
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 1
Estimates:
mean sd 10% 50% 90%
(Intercept) -3.6 0.6 -4.4 -3.5 -2.8
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.0 0.0 0.0 0.0 0.1
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 6006
mean_PPD 0.0 1.0 9916
log-posterior 0.0 1.0 5438
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$eaec_aata)
$eaec_aata =
y_fitsstan_glm(eaec_aata ~ 1,
family = gaussian(),
data =
%>%
soil filter(eaec_aata_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$eaec_aata)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: eaec_aata ~ 1
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 2
predictors: 1
Estimates:
mean sd 10% 50% 90%
(Intercept) 5.3 0.2 5.0 5.3 5.5
sigma 0.3 0.2 0.2 0.3 0.6
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 5.3 0.4 4.9 5.3 5.7
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 5237
sigma 0.0 1.0 5816
mean_PPD 0.0 1.0 9832
log-posterior 0.0 1.0 4387
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$eaec_aata)
$aeromonas =
z_fitsstan_glm(aeromonas_binary ~ 1,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$aeromonas)
Model Info:
function: stan_glm
family: binomial [logit]
formula: aeromonas_binary ~ 1
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 1
Estimates:
mean sd 10% 50% 90%
(Intercept) -2.7 0.4 -3.3 -2.7 -2.1
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.1 0.0 0.0 0.1 0.1
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 5303
mean_PPD 0.0 1.0 9169
log-posterior 0.0 1.0 5069
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$aeromonas)
$aeromonas =
y_fitsstan_glm(aeromonas ~ 1,
family = gaussian(),
data =
%>%
soil filter(aeromonas_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$aeromonas)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: aeromonas ~ 1
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 5
predictors: 1
Estimates:
mean sd 10% 50% 90%
(Intercept) 2.8 0.6 2.1 2.8 3.6
sigma 1.3 0.5 0.8 1.2 1.9
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 2.8 0.9 1.8 2.8 3.9
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 7232
sigma 0.0 1.0 6639
mean_PPD 0.0 1.0 11282
log-posterior 0.0 1.0 5094
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$aeromonas)
$cholera =
z_fitsstan_glm(cholera_binary ~ aeromonas,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$cholera)
Model Info:
function: stan_glm
family: binomial [logit]
formula: cholera_binary ~ aeromonas
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 2
Estimates:
mean sd 10% 50% 90%
(Intercept) -2.3 0.4 -2.9 -2.3 -1.8
aeromonas 0.8 0.4 0.4 0.8 1.3
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.1 0.0 0.1 0.1 0.2
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 11516
aeromonas 0.0 1.0 10880
mean_PPD 0.0 1.0 15621
log-posterior 0.0 1.0 7219
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$cholera)
$cholera =
y_fitsstan_glm(cholera ~ aeromonas,
family = gaussian(),
data =
%>%
soil filter(cholera_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$cholera)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: cholera ~ aeromonas
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 9
predictors: 2
Estimates:
mean sd 10% 50% 90%
(Intercept) 5.5 0.2 5.3 5.5 5.7
aeromonas 0.0 0.1 -0.1 0.0 0.2
sigma 0.4 0.1 0.3 0.4 0.6
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 5.6 0.2 5.3 5.6 5.8
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 12120
aeromonas 0.0 1.0 13085
sigma 0.0 1.0 10149
mean_PPD 0.0 1.0 14891
log-posterior 0.0 1.0 6520
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$cholera)
$eaec_aaic =
z_fitsstan_glm(eaec_aaic_binary ~ aeromonas + cholera,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$eaec_aaic)
Model Info:
function: stan_glm
family: binomial [logit]
formula: eaec_aaic_binary ~ aeromonas + cholera
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 3
Estimates:
mean sd 10% 50% 90%
(Intercept) -1.8 0.3 -2.2 -1.8 -1.4
aeromonas -0.5 0.6 -1.3 -0.4 0.2
cholera 0.2 0.2 0.0 0.2 0.4
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.2 0.1 0.1 0.2 0.2
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 15782
aeromonas 0.0 1.0 10597
cholera 0.0 1.0 14452
mean_PPD 0.0 1.0 18823
log-posterior 0.0 1.0 6963
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$eaec_aaic)
$eaec_aaic =
y_fitsstan_glm(eaec_aaic ~ aeromonas + cholera,
family = gaussian(),
data =
%>%
soil filter(eaec_aaic_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$eaec_aaic)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: eaec_aaic ~ aeromonas + cholera
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 13
predictors: 3
Estimates:
mean sd 10% 50% 90%
(Intercept) 3.1 0.1 2.9 3.1 3.3
aeromonas 1.7 0.3 1.4 1.7 2.1
cholera 0.0 0.1 -0.1 0.0 0.1
sigma 0.4 0.1 0.3 0.4 0.6
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 3.4 0.2 3.2 3.4 3.6
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 16470
aeromonas 0.0 1.0 11946
cholera 0.0 1.0 11931
sigma 0.0 1.0 9782
mean_PPD 0.0 1.0 17551
log-posterior 0.0 1.0 6087
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$eaec_aaic)
$epec_bfpa =
z_fitsstan_glm(epec_bfpa_binary ~ aeromonas + cholera + eaec_aaic,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$epec_bfpa)
Model Info:
function: stan_glm
family: binomial [logit]
formula: epec_bfpa_binary ~ aeromonas + cholera + eaec_aaic
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 4
Estimates:
mean sd 10% 50% 90%
(Intercept) -2.3 0.4 -2.9 -2.3 -1.8
aeromonas -0.8 0.9 -2.0 -0.6 0.3
cholera -0.2 0.3 -0.5 -0.2 0.1
eaec_aaic 1.0 0.2 0.7 1.0 1.3
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.2 0.1 0.1 0.2 0.3
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 12700
aeromonas 0.0 1.0 9705
cholera 0.0 1.0 11473
eaec_aaic 0.0 1.0 10589
mean_PPD 0.0 1.0 19332
log-posterior 0.0 1.0 7017
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$epec_bfpa)
$epec_bfpa =
y_fitsstan_glm(epec_bfpa ~ aeromonas + cholera + eaec_aaic,
family = gaussian(),
data =
%>%
soil filter(epec_bfpa_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$epec_bfpa)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: epec_bfpa ~ aeromonas + cholera + eaec_aaic
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 14
predictors: 4
Estimates:
mean sd 10% 50% 90%
(Intercept) 2.9 0.3 2.6 2.9 3.3
aeromonas 0.8 0.5 0.1 0.8 1.5
cholera -0.2 0.2 -0.4 -0.2 0.0
eaec_aaic 0.2 0.1 0.0 0.2 0.4
sigma 0.7 0.2 0.5 0.7 0.9
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 3.3 0.3 3.0 3.3 3.7
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 15608
aeromonas 0.0 1.0 13547
cholera 0.0 1.0 13215
eaec_aaic 0.0 1.0 12866
sigma 0.0 1.0 9698
mean_PPD 0.0 1.0 17347
log-posterior 0.0 1.0 5626
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$epec_bfpa)
$etec_lt =
z_fitsstan_glm(etec_lt_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$etec_lt)
Model Info:
function: stan_glm
family: binomial [logit]
formula: etec_lt_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 5
Estimates:
mean sd 10% 50% 90%
(Intercept) -1.2 0.3 -1.6 -1.2 -0.8
aeromonas 0.2 0.4 -0.3 0.2 0.6
cholera 0.2 0.2 0.0 0.2 0.4
eaec_aaic -0.5 0.3 -0.9 -0.5 -0.1
epec_bfpa 0.5 0.3 0.2 0.5 0.8
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.3 0.1 0.2 0.3 0.4
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 17486
aeromonas 0.0 1.0 15152
cholera 0.0 1.0 15344
eaec_aaic 0.0 1.0 11247
epec_bfpa 0.0 1.0 11941
mean_PPD 0.0 1.0 18282
log-posterior 0.0 1.0 8321
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$etec_lt)
$etec_lt =
y_fitsstan_glm(etec_lt ~ aeromonas + cholera + eaec_aaic + epec_bfpa,
family = gaussian(),
data =
%>%
soil filter(etec_lt_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$etec_lt)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: etec_lt ~ aeromonas + cholera + eaec_aaic + epec_bfpa
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 22
predictors: 5
Estimates:
mean sd 10% 50% 90%
(Intercept) 2.8 0.2 2.5 2.8 3.1
aeromonas 0.2 0.2 -0.1 0.2 0.4
cholera 0.0 0.1 -0.1 0.0 0.1
eaec_aaic 0.0 0.2 -0.2 0.0 0.2
epec_bfpa 0.2 0.1 0.0 0.2 0.4
sigma 0.8 0.1 0.6 0.8 1.0
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 3.0 0.2 2.7 3.0 3.3
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 20560
aeromonas 0.0 1.0 21620
cholera 0.0 1.0 18880
eaec_aaic 0.0 1.0 17555
epec_bfpa 0.0 1.0 17485
sigma 0.0 1.0 15093
mean_PPD 0.0 1.0 20887
log-posterior 0.0 1.0 7426
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$etec_lt)
$epec_eae =
z_fitsstan_glm(epec_eae_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt,
family = binomial(),
data = soil,
iter = 10000,
seed = 2024)
summary(z_fits$epec_eae)
Model Info:
function: stan_glm
family: binomial [logit]
formula: epec_eae_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa +
etec_lt
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 79
predictors: 6
Estimates:
mean sd 10% 50% 90%
(Intercept) -0.7 0.3 -1.1 -0.7 -0.3
aeromonas -0.1 0.4 -0.7 -0.1 0.4
cholera 0.3 0.2 0.1 0.3 0.6
eaec_aaic 0.1 0.3 -0.3 0.1 0.5
epec_bfpa 1.2 0.5 0.7 1.2 1.8
etec_lt 0.0 0.2 -0.3 0.0 0.2
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.5 0.1 0.4 0.5 0.6
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 30714
aeromonas 0.0 1.0 15601
cholera 0.0 1.0 15342
eaec_aaic 0.0 1.0 17128
epec_bfpa 0.0 1.0 10870
etec_lt 0.0 1.0 18482
mean_PPD 0.0 1.0 23957
log-posterior 0.0 1.0 8172
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$epec_eae)
$epec_eae =
y_fitsstan_glm(epec_eae ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt,
family = gaussian(),
data =
%>%
soil filter(epec_eae_binary == 1),
iter = 10000,
seed = 2024)
summary(y_fits$epec_eae)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: epec_eae ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt
algorithm: sampling
sample: 20000 (posterior sample size)
priors: see help('prior_summary')
observations: 38
predictors: 6
Estimates:
mean sd 10% 50% 90%
(Intercept) 3.7 0.2 3.5 3.7 3.9
aeromonas 0.1 0.2 -0.2 0.1 0.3
cholera 0.0 0.1 -0.1 0.0 0.1
eaec_aaic 0.1 0.1 -0.1 0.1 0.2
epec_bfpa 0.4 0.1 0.3 0.4 0.5
etec_lt -0.2 0.1 -0.3 -0.2 -0.1
sigma 0.8 0.1 0.6 0.8 0.9
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 4.0 0.2 3.8 4.0 4.2
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 19882
aeromonas 0.0 1.0 12866
cholera 0.0 1.0 12101
eaec_aaic 0.0 1.0 12432
epec_bfpa 0.0 1.0 12231
etec_lt 0.0 1.0 12554
sigma 0.0 1.0 12093
mean_PPD 0.0 1.0 18294
log-posterior 0.0 1.0 6736
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$epec_eae)
Collate results
# Set number of digits for output
= 3
n_digits
# Create tables for reporting results, one for presence/absence,
# the other for log_10 concentration
=
results list(z = tibble(Covariate = c("(Intercept)",(n_pos %>% sort() %>% names())[-c(1,length(path_names))])),
y = tibble(Covariate = c("(Intercept)",(n_pos %>% sort() %>% names())[-c(1,length(path_names))]))
)for(j in (n_pos %>% sort() %>% names())){
$z[[j]] = results$y[[j]] = ""
results
=
pmedian %>%
z_fits[[j]] as.matrix() %>%
apply(2,median) %>%
round(n_digits)
=
ci posterior_interval(z_fits[[j]],prob = 0.95) %>%
round(n_digits)
for(k in 1:length(pmedian)){
$z[[j]][ match(gsub("_binary","",names(pmedian))[k],
results$z$Covariate) ] =
resultspaste(pmedian[k],
" (",
1],
ci[k,", ",
2],
ci[k,")",
collapse = "",
sep = "")
}
=
pmedian %>%
y_fits[[j]] as.matrix() %>%
apply(2,median) %>%
round(n_digits)
= pmedian[-length(pmedian)]
pmedian =
ci posterior_interval(z_fits[[j]],prob = 0.95) %>%
round(n_digits)
for(k in 1:length(pmedian)){
$y[[j]][ match(names(pmedian)[k],
results$y$Covariate) ] =
resultspaste(pmedian[k],
" (",
1],
ci[k,", ",
2],
ci[k,")",
collapse = "",
sep = "")
}
}
$z %>%
resultsflextable()
Covariate | eaec_aata | aeromonas | cholera | eaec_aaic | epec_bfpa | etec_lt | epec_eae |
---|---|---|---|---|---|---|---|
(Intercept) | -3.51 (-4.967, -2.477) | -2.654 (-3.633, -1.886) | -2.31 (-3.186, -1.605) | -1.765 (-2.482, -1.152) | -2.305 (-3.279, -1.526) | -1.191 (-1.838, -0.601) | -0.656 (-1.298, -0.055) |
aeromonas | 0.827 (0.115, 1.648) | -0.437 (-1.97, 0.454) | -0.641 (-2.851, 0.63) | 0.158 (-0.629, 0.913) | -0.099 (-1.029, 0.715) | ||
cholera | 0.227 (-0.106, 0.538) | -0.194 (-0.769, 0.233) | 0.196 (-0.129, 0.507) | 0.336 (0.011, 0.739) | |||
eaec_aaic | 1.011 (0.559, 1.536) | -0.516 (-1.164, 0.032) | 0.071 (-0.574, 0.708) | ||||
epec_bfpa | 0.478 (-0.027, 1.029) | 1.172 (0.468, 2.281) | |||||
etec_lt | -0.038 (-0.478, 0.379) |
$y %>%
resultsflextable()
Covariate | eaec_aata | aeromonas | cholera | eaec_aaic | epec_bfpa | etec_lt | epec_eae |
---|---|---|---|---|---|---|---|
(Intercept) | 5.272 (-4.967, -2.477) | 2.849 (-3.633, -1.886) | 5.526 (-3.186, -1.605) | 3.077 (-2.482, -1.152) | 2.945 (-3.279, -1.526) | 2.797 (-1.838, -0.601) | 3.703 (-1.298, -0.055) |
aeromonas | 0.048 (0.115, 1.648) | 1.745 (-1.97, 0.454) | 0.784 (-2.851, 0.63) | 0.165 (-0.629, 0.913) | 0.051 (-1.029, 0.715) | ||
cholera | 0.018 (-0.106, 0.538) | -0.186 (-0.769, 0.233) | -0.031 (-0.129, 0.507) | -0.01 (0.011, 0.739) | |||
eaec_aaic | 0.188 (0.559, 1.536) | 0.023 (-1.164, 0.032) | 0.058 (-0.574, 0.708) | ||||
epec_bfpa | 0.186 (-0.027, 1.029) | 0.392 (0.468, 2.281) | |||||
etec_lt | -0.191 (-0.478, 0.379) |
Assess Model Fit
Model fit is assessed using the Bayesian p-value based on the following goodness of fit statistic: \[ \sum_{i=1}^N\sum_{j=1}^J\frac{(y_{ij} - \mathbb{E}(y_{ij})^2}{\text{Var}(y_{ij})} \] For those unfamiliar with Bayesian p-values, this is a well-established technique to assess model fit in the Bayesian framework. Values close to 0.5 indicate the model fits the data well, while values close to 0 or to 1 indicate a poor fit.
Note that with the sequential GLM modeling approach described above, we have \[\begin{align*} z_{ij} & \sim \text{Bernoulli}(p_{ij}) &\\ y_{ij} & \sim \begin{cases} \delta_0(y) & \text{ if }z_{ij}=0\\ N(\mu_{ij},\sigma^2_j)& \text{ if }z_{ij} = 1 \end{cases} & \\ \text{where logit} (p_{ij}) & = \alpha_{j0} + \sum_{k<j}y_{ik}\alpha_{jk}, &\\ \mu_{ij} & = \beta_{j0} + \sum_{k<j}y_{ik}\beta_{jk}. &\\ \text{Hence } \mathbb{E}(y_{ij}) & = \mathbb{E}(\mathbb{E}(y_{ij}|z_{ij})) &\\ & = \mathbb{E}(\mu_{ij}z_{ij}) &\\ & = \mu_{ij}p_{ij} & \\ \text{Var}(y_{ij}) & = \mathbb{E}(\text{Var}(y_{ij}|z_{ij})) + \text{Var}(\mathbb{E}(y_{ij}|z_{ij})) &\\ & = \mathbb{E}(\sigma^2_jz_{ij}) + \text{Var}(\mu_{ij}z_{ij}) &\\ & = \sigma^2_jp_{ij} + \mu_{ij}^2p_{ij}(1-p_{ij}). \end{align*}\]
First, get posterior samples of the model parameters:
=
z_samples %>%
z_fits lapply(as.matrix)
=
y_samples %>%
y_fits lapply(as.matrix)
Now we need to get samples from the posterior predictive density. This must be done sequentially according to how we fit the models:
set.seed(2024)
= y_predictions = list()
z_predictions ## eaec_aata
$eaec_aata =
z_predictions$eaec_aata %>%
z_fitsposterior_predict()
$eaec_aata =
y_predictions$eaec_aata %>%
y_fitsposterior_predict(newdata = data.frame(whatever = rep(1,nrow(soil))))
$eaec_aata =
y_predictions$eaec_aata * z_predictions$eaec_aata
y_predictions## aeromonas
$aeromonas =
z_predictions$aeromonas %>%
z_fitsposterior_predict()
$aeromonas =
y_predictions$aeromonas %>%
y_fitsposterior_predict(newdata = data.frame(whatever = rep(1,nrow(soil))))
$aeromonas =
y_predictions$aeromonas * z_predictions$aeromonas
y_predictions## cholera
$cholera =
z_predictions$cholera[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
z_samples$cholera[,"aeromonas"] * y_predictions$aeromonas #first term will be repeated correctly
z_samples$cholera =
z_predictions1.0 * (matrix(runif(prod(dim(z_predictions$cholera))),
nrow(z_predictions$cholera),
ncol(z_predictions$cholera)) <
1.0 / (1.0 + exp(-z_predictions$cholera) ) ) )
($cholera =
y_predictions$cholera[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
y_samples$cholera[,"aeromonas"] * y_predictions$aeromonas +
y_samplesmatrix(rnorm(nrow(y_samples$cholera) * nrow(soil),
sd = y_samples$cholera[,"sigma"]),
nrow(y_samples$eaec_aata),
nrow(soil))
$cholera =
y_predictions$cholera * z_predictions$cholera
y_predictions## eaec_aaic
$eaec_aaic =
z_predictions$eaec_aaic[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
z_samples$eaec_aaic[,"aeromonas"] * y_predictions$aeromonas +
z_samples$eaec_aaic[,"cholera"] * y_predictions$cholera
z_samples$eaec_aaic =
z_predictions1.0 * (matrix(runif(prod(dim(z_predictions$eaec_aaic))),
nrow(z_predictions$eaec_aaic),
ncol(z_predictions$eaec_aaic)) <
1.0 / (1.0 + exp(-z_predictions$eaec_aaic)))
$eaec_aaic =
y_predictions$eaec_aaic[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
y_samples$eaec_aaic[,"aeromonas"] * y_predictions$aeromonas +
y_samples$eaec_aaic[,"cholera"] * y_predictions$cholera +
y_samplesmatrix(rnorm(nrow(y_samples$eaec_aaic) * nrow(soil),
sd = y_samples$eaec_aaic[,"sigma"]),
nrow(y_samples$eaec_aata),
nrow(soil))
$eaec_aaic =
y_predictions$eaec_aaic * z_predictions$eaec_aaic
y_predictions## epec_bfpa
$epec_bfpa =
z_predictions$epec_bfpa[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
z_samples$epec_bfpa[,"aeromonas"] * y_predictions$aeromonas +
z_samples$epec_bfpa[,"cholera"] * y_predictions$cholera +
z_samples$epec_bfpa[,"eaec_aaic"] * y_predictions$eaec_aaic
z_samples$epec_bfpa =
z_predictions1.0 * (matrix(runif(prod(dim(z_predictions$epec_bfpa))),
nrow(z_predictions$epec_bfpa),
ncol(z_predictions$epec_bfpa)) <
1.0 / (1.0 + exp(-z_predictions$epec_bfpa)))
$epec_bfpa =
y_predictions$epec_bfpa[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
y_samples$epec_bfpa[,"aeromonas"] * y_predictions$aeromonas +
y_samples$epec_bfpa[,"cholera"] * y_predictions$cholera +
y_samples$epec_bfpa[,"eaec_aaic"] * y_predictions$eaec_aaic +
y_samplesmatrix(rnorm(nrow(y_samples$epec_bfpa) * nrow(soil),
sd = y_samples$epec_bfpa[,"sigma"]),
nrow(y_samples$eaec_aata),
nrow(soil))
$epec_bfpa =
y_predictions$epec_bfpa * z_predictions$epec_bfpa
y_predictions## etec_lt
$etec_lt =
z_predictions$etec_lt[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
z_samples$etec_lt[,"aeromonas"] * y_predictions$aeromonas +
z_samples$etec_lt[,"cholera"] * y_predictions$cholera +
z_samples$etec_lt[,"eaec_aaic"] * y_predictions$eaec_aaic +
z_samples$etec_lt[,"epec_bfpa"] * y_predictions$epec_bfpa
z_samples$etec_lt =
z_predictions1.0 * (matrix(runif(prod(dim(z_predictions$etec_lt))),
nrow(z_predictions$etec_lt),
ncol(z_predictions$etec_lt)) <
1.0 / (1.0 + exp(-z_predictions$etec_lt)))
$etec_lt =
y_predictions$etec_lt[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
y_samples$etec_lt[,"aeromonas"] * y_predictions$aeromonas +
y_samples$etec_lt[,"cholera"] * y_predictions$cholera +
y_samples$etec_lt[,"eaec_aaic"] * y_predictions$eaec_aaic +
y_samples$etec_lt[,"epec_bfpa"] * y_predictions$epec_bfpa +
y_samplesmatrix(rnorm(nrow(y_samples$etec_lt) * nrow(soil),
sd = y_samples$etec_lt[,"sigma"]),
nrow(y_samples$eaec_aata),
nrow(soil))
$etec_lt =
y_predictions$etec_lt * z_predictions$etec_lt
y_predictions## epec_eae
$epec_eae =
z_predictions$epec_eae[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
z_samples$epec_eae[,"aeromonas"] * y_predictions$aeromonas +
z_samples$epec_eae[,"cholera"] * y_predictions$cholera +
z_samples$epec_eae[,"eaec_aaic"] * y_predictions$eaec_aaic +
z_samples$epec_eae[,"epec_bfpa"] * y_predictions$epec_bfpa +
z_samples$epec_eae[,"etec_lt"] * y_predictions$etec_lt
z_samples$epec_eae =
z_predictions1.0 * (matrix(runif(prod(dim(z_predictions$epec_eae))),
nrow(z_predictions$epec_eae),
ncol(z_predictions$epec_eae)) <
1.0 / (1.0 + exp(-z_predictions$epec_eae)))
$epec_eae =
y_predictions$epec_eae[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) +
y_samples$epec_eae[,"aeromonas"] * y_predictions$aeromonas +
y_samples$epec_eae[,"cholera"] * y_predictions$cholera +
y_samples$epec_eae[,"eaec_aaic"] * y_predictions$eaec_aaic +
y_samples$epec_eae[,"epec_bfpa"] * y_predictions$epec_bfpa +
y_samples$epec_eae[,"etec_lt"] * y_predictions$etec_lt +
y_samplesmatrix(rnorm(nrow(y_samples$epec_eae) * nrow(soil),
sd = y_samples$epec_eae[,"sigma"]),
nrow(y_samples$eaec_aata),
nrow(soil))
$epec_eae =
y_predictions$epec_eae * z_predictions$epec_eae y_predictions
Below is the function to obtain the GOF statistics (one for the observed data and one for the predictive sample) for a given posterior draw.
= function(iter){
gof_stat =
dt.soil_sim %>%
soil ::select(all_of(names(z_fits)))
dplyr
for(j in names(z_fits)) dt.soil_sim[[j]] = y_predictions[[j]][iter,]
%<>%
dt.soil_sim bind_cols(dt.soil_sim %>%
mutate_all(function(x) ifelse(is.na(x),0, 1)) %>%
rename_all(function(x) paste0(x,"_binary")))
=
PrGr0 =
mu =
Var =
Exp matrix(0.0,nrow(dt.soil_sim),NROW(n_pos))
## Compute GOF stat for simulated data
for(j in 1:NROW(n_pos)){
= model.matrix(formula(z_fits[[j]]), # Formula is the same for z_fits and y_fits
mm data = dt.soil_sim)
=
PrGr0[,j] 1.0 / drop(1.0 + exp(-mm %*% z_samples[[j]][iter,]))
= mm %*% y_samples[[j]][iter,-ncol(y_samples[[j]])]
mu[,j]
= PrGr0[,j] * mu[,j]
Exp[,j]
=
Var[,j] * y_samples[[j]][iter,ncol(y_samples[[j]])]^2 +
PrGr0[,j] * (1.0 - PrGr0[,j]) * mu[,j]^2
PrGr0[,j]
}
=
GOF_sim sum( (as.matrix(dt.soil_sim %>%
::select(all_of(names(z_fits)))) - Exp)^2 / Var )
dplyr
## Compute GOF stat for observed data
for(j in 1:NROW(n_pos)){
= model.matrix(formula(z_fits[[j]]),
mm data = soil)
=
PrGr0[,j] 1.0 / drop(1.0 + exp(-mm %*% z_samples[[j]][iter,]))
= mm %*% y_samples[[j]][iter,-ncol(y_samples[[j]])]
mu[,j]
= PrGr0[,j] * mu[,j]
Exp[,j]
=
Var[,j] * y_samples[[j]][iter,ncol(y_samples[[j]])]^2 +
PrGr0[,j] * (1.0 - PrGr0[,j]) * mu[,j]^2
PrGr0[,j]
}
=
GOF_obs sum( (as.matrix(soil %>%
::select(all_of(names(z_fits)))) - Exp)^2 / Var )
dplyr
return(c(GOF_sim = GOF_sim,
GOF_obs = GOF_obs))
}
Now to compute and plot.
=
GOF_stats sapply(1:nrow(z_samples[[1]]),gof_stat) %>%
t() %>%
as_tibble()
plot(GOF_stats,
pch = 16,
col = c("indianred","steelblue")[1L + (GOF_stats$GOF_sim < GOF_stats$GOF_obs)],
bty = 'l',
xlim = quantile(GOF_stats$GOF_sim,c(0.01,0.99)),
ylim = quantile(GOF_stats$GOF_obs,c(0.01,0.99)))
legend('topright',
bty = 'n',
cex = 2,
legend = paste0("Bayesian p-value = ",round(mean(GOF_stats$GOF_sim < GOF_stats$GOF_obs),2)))
Behavioral modeling
Model fitting
We consider four different models: Poisson, negative binomial, zero-inflated Poisson (ZIP), and zero-inflated negative binomial (ZINB). We then use Bayes factors to determine the best fitting model.
Set up:
= names(behavior)[-c(1:3)]
beh_names
= list()
b_fits for(j in beh_names) b_fits[[j]] = list()
For each model, we need to compile the STAN code. After that, we can simply change the data being fed into that STAN model and run the HMC algorithm.
$hand_to_object$poisson =
b_fitsbrm(y ~ offset(log(time_min)) + (1|age),
family = "poisson",
data =
%>%
behavior rename(y = hand_to_object),
save_pars = save_pars(all = TRUE),
iter = 10000,
seed = 2024,
cores = 4)
for(j in beh_names[-1]){
$poisson =
b_fits[[j]]$hand_to_object$poisson %>%
b_fitsupdate(newdata =
%>%
behavior rename(y = !!j),
seed = 2024,
cores = 4)
}
Check for convergence:
lapply(b_fits,
function(X) rhat(X$poisson)) %>%
unlist() %>%
max()
[1] 1.012148
$hand_to_object$negbinomial =
b_fitsbrm(y ~ offset(log(time_min)) + (1|age),
family = "negbinomial",
data =
%>%
behavior rename(y = hand_to_object),
save_pars = save_pars(all = TRUE),
iter = 10000,
seed = 2024,
cores = 4)
for(j in beh_names[-1]){
$negbinomial =
b_fits[[j]]$hand_to_object$negbinomial %>%
b_fitsupdate(newdata =
%>%
behavior rename(y = !!j),
seed = 2024,
cores = 4)
}
Check for convergence:
lapply(b_fits,
function(X) rhat(X$negbinomial)) %>%
unlist() %>%
max()
[1] 1.009607
$hand_to_object$zip =
b_fitsbrm(bf(y ~ offset(log(time_min)) + (1|age),
zi = ~ log(time_min) + (1|age)),
family = "zero_inflated_poisson",
data =
%>%
behavior rename(y = hand_to_object),
save_pars = save_pars(all = TRUE),
iter = 10000,
seed = 2024,
cores = 4)
for(j in beh_names[-1]){
$zip =
b_fits[[j]]$hand_to_object$zip %>%
b_fitsupdate(newdata =
%>%
behavior rename(y = !!j),
seed = 2024,
cores = 4)
}
Check for convergence:
lapply(b_fits,
function(X) rhat(X$zip)) %>%
unlist() %>%
max()
[1] 1.058392
$hand_to_object$zinb =
b_fitsbrm(bf(y ~ offset(log(time_min)) + (1|age),
zi = ~ log(time_min) + (1|age)),
family = "zero_inflated_negbinomial",
data =
%>%
behavior rename(y = hand_to_object),
save_pars = save_pars(all = TRUE),
iter = 10000,
seed = 2024,
cores = 4)
for(j in beh_names[-1]){
$zinb =
b_fits[[j]]$hand_to_object$zinb %>%
b_fitsupdate(newdata =
%>%
behavior rename(y = !!j),
seed = 2024,
cores = 4)
}
Check for convergence:
lapply(b_fits,
function(X) rhat(X$zinb)) %>%
unlist() %>%
max()
[1] 1.009326
Model selection
We select the best model for each behavior via Bayes factors.
= list()
bf_by_behavior for(j in names(b_fits)){
cat(paste0("\n--- ",j))
=
bf_by_behavior[[j]] matrix(0.0,4,4,
dimnames = list(c("poisson","negbinomial","zip","zinb"),
c("poisson","negbinomial","zip","zinb")))
lower.tri(bf_by_behavior[[j]])] = NA
bf_by_behavior[[j]][diag(bf_by_behavior[[j]]) = NA
1,2] =
bf_by_behavior[[j]][bayes_factor(b_fits[[j]]$poisson,
$negbinomial)$bf
b_fits[[j]]1,3] =
bf_by_behavior[[j]][bayes_factor(b_fits[[j]]$poisson,
$zip)$bf
b_fits[[j]]1,4] =
bf_by_behavior[[j]][bayes_factor(b_fits[[j]]$poisson,
$zinb)$bf
b_fits[[j]]2,3] =
bf_by_behavior[[j]][bayes_factor(b_fits[[j]]$negbinomial,
$zip)$bf
b_fits[[j]]2,4] =
bf_by_behavior[[j]][bayes_factor(b_fits[[j]]$negbinomial,
$zinb)$bf
b_fits[[j]]3,4] =
bf_by_behavior[[j]][bayes_factor(b_fits[[j]]$zip,
$zinb)$bf
b_fits[[j]] }
= list()
best_fits
for(j in names(bf_by_behavior)){
lower.tri(bf_by_behavior[[j]])] =
bf_by_behavior[[j]][1.0 / t(bf_by_behavior[[j]])[lower.tri(bf_by_behavior[[j]])]
diag(bf_by_behavior[[j]]) = 1.0
=
best_model_name rownames(bf_by_behavior[[j]])[which(apply(bf_by_behavior[[j]],
1,
function(x) all(x >= 1.0)))]
=
best_fits[[j]]
b_fits[[j]][[best_model_name]] }
Below are the estimated parameters of the behavioral distributions. Note that for the zero probabilities for the ZIP and ZINB, the results are on the scale of the linear predictor, i.e., the untransformed regression coefficients. This is because the log(minutes observed) was used as a covariate rather than an offset for this component.
=
best_fit_summary tibble(Behavior = rep(gsub("_","-",names(best_fits)),
each = 3),
Age = rep(c("B","C","T"), length(best_fits)),
`Poi mean` = NA,
`NB mean` = NA,
`NB dispersion` = NA,
`ZIP Poi mean` = NA,
`ZIP zero prob Intercept` = NA,
`ZIP zero prob logtime (min)` = NA,
`ZINB NB mean` = NA,
`ZINB NB dispersion` = NA,
`ZINB zero prob Intercept` = NA,
`ZINB zero prob logtime (min)` = NA)
for(i in 1:length(best_fits)){
if(best_fits[[i]]$family$family == "poisson"){
$`Poi mean`[3 * (i - 1) + 1:3] =
best_fit_summaryexp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
}if(best_fits[[i]]$family$family == "negbinomial"){
$`NB mean`[3 * (i - 1) + 1:3] =
best_fit_summaryexp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
$`NB dispersion`[3 * (i - 1) + 1:3] =
best_fit_summarysummary(best_fits[[i]])$spec_pars$Estimate
}if(best_fits[[i]]$family$family == "zero_inflated_poisson"){
$`ZIP Poi mean`[3 * (i - 1) + 1:3] =
best_fit_summaryexp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
$`ZIP zero prob Intercept`[3 * (i - 1) + 1:3] =
best_fit_summarycoef(best_fits[[i]])$age[,"Estimate","zi_Intercept"]
$`ZIP zero prob logtime (min)`[3 * (i - 1) + 1:3] =
best_fit_summarycoef(best_fits[[i]])$age[,"Estimate","zi_logtime_min"]
}if(best_fits[[i]]$family$family == "zero_inflated_negbinomial"){
$`ZINB NB mean`[3 * (i - 1) + 1:3] =
best_fit_summaryexp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
$`ZINB NB dispersion`[3 * (i - 1) + 1:3] =
best_fit_summarysummary(best_fits[[i]])$spec_pars$Estimate
$`ZINB zero prob Intercept`[3 * (i - 1) + 1:3] =
best_fit_summarycoef(best_fits[[i]])$age[,"Estimate","zi_Intercept"]
$`ZINB zero prob logtime (min)`[3 * (i - 1) + 1:3] =
best_fit_summarycoef(best_fits[[i]])$age[,"Estimate","zi_logtime_min"]
}
}
%>%
best_fit_summary flextable() %>%
merge_v(1) %>%
hline(1:3*3) %>%
colformat_double(digits = 3)
Behavior | Age | Poi mean | NB mean | NB dispersion | ZIP Poi mean | ZIP zero prob Intercept | ZIP zero prob logtime (min) | ZINB NB mean | ZINB NB dispersion | ZINB zero prob Intercept | ZINB zero prob logtime (min) |
---|---|---|---|---|---|---|---|---|---|---|---|
hand-to-object | B | 0.367 | 2.338 | ||||||||
C | 0.443 | 2.338 | |||||||||
T | 0.361 | 2.338 | |||||||||
hand-to-soil | B | 0.187 | 0.854 | ||||||||
C | 0.207 | 0.854 | |||||||||
T | 0.186 | 0.854 | |||||||||
mouth-to-object | B | 0.062 | 0.319 | ||||||||
C | 0.027 | 0.319 | |||||||||
T | 0.042 | 0.319 | |||||||||
geophagy | B | 0.013 | 0.014 | -1.523 | |||||||
C | 0.002 | 1.164 | -1.523 | ||||||||
T | 0.004 | 0.129 | -1.523 | ||||||||
mouth-to-hand | B | 0.323 | 1.214 | ||||||||
C | 0.161 | 1.214 | |||||||||
T | 0.209 | 1.214 |
Monte Carlo simulation
First, we will draw new behavior counts in a 1hr period through the posterior predictive distributions.
set.seed(2024)
= list()
behavior_draws
for(j in names(best_fits)){
=
behavior_draws[[j]] %>%
best_fits[[j]] posterior_predict(newdata = tibble(age = c("B","T","C"),
time_min = 60))
colnames(behavior_draws[[j]]) = c("B","T","C")
}
= behavior_draws
behavior_draws_compiled for(j in names(behavior_draws_compiled)){
%<>%
behavior_draws_compiled[[j]] as_tibble() %>%
mutate(Behavior = j) %>%
relocate(Behavior)
}=
behavior_draws_compiled do.call(bind_rows,
%>%
behavior_draws_compiled) pivot_longer(cols = B:C,
names_to = "Age",
values_to = "Count") %>%
mutate(Behavior =
case_match(Behavior,
"hand_to_object" ~ "Hand-to-object",
"hand_to_soil" ~ "Hand-to-soil",
"mouth_to_object" ~ "Mouth-to-object",
"geophagy" ~ "Geophagy",
"mouth_to_hand" ~ "Hand-to-mouth"),
Age =
case_match(Age,
"B" ~ "Baby",
"T" ~ "Toddler",
"C" ~ "Child") %>%
factor(levels = c("Baby","Toddler","Child")))
=
behavior_summary %>%
behavior group_by(age) %>%
summarize(geophagy = sum(geophagy)/sum(time_min) * 60,
hand_to_mouth = sum(mouth_to_hand)/sum(time_min) * 60,
hand_to_object = sum(hand_to_object)/sum(time_min) * 60,
hand_to_soil = sum(hand_to_soil)/sum(time_min) * 60,
mouth_to_object = sum(mouth_to_object)/sum(time_min) * 60)
%>%
behavior_draws_compiled ggplot(aes(y = Count,
x = Behavior,
fill = Age)) +
geom_boxplot(outliers = FALSE,
col = "gray40") +
geom_point(data = data.frame(x = c(1:5 - 0.25,
1:5,
1:5 + 0.25),
y = c(unlist(behavior_summary[1,-1]),
unlist(behavior_summary[3,-1]),
unlist(behavior_summary[2,-1])),
Age = rep(c("Baby","Toddler","Child"),
each = 5)),
aes(y=y,x=x),
col = "gray40",
size = 3,
pch = 8) +
ylab("Count / hr") +
scale_fill_viridis_d() +
theme(
panel.background = element_rect(fill = "white"),
panel.grid.major = element_line(color = "white"),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black"),
axis.text.y = element_text(size = 16),
axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
axis.title = element_text(size = 18, face = "bold"),
axis.ticks = element_line(color = "black"),
legend.background = element_rect(fill = "white"),
legend.key = element_blank(),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18, face = "bold")
)
Next we will draw values/fix constants from the extant literature.
= nrow(behavior_draws$mouth_to_hand)
n_sim set.seed(2024)
## Helper function to get mean and sd from empirical quantiles
= function(probs,
find_mean_and_sd
obs_quantiles,
init_for_optim){
= function(x){
objective_function sum( (obs_quantiles - qnorm(probs,
mean = x[1],
sd = exp(x[2])))^2 )
}
optim(init_for_optim,
objective_function)
}
## Transfer Efficiencies
=
trans_soil_hand rlnorm(n_sim, meanlog = 0.11, sdlog = 2.0) #Ozkaynak et al
=
trans_object_hand rtrunc(n_sim, "norm", a = 0, b =1, mean = 0.23, sd=0.22) # Julian, et al
=
trans_hand_mouth rbeta(n_sim,shape1 = 5.2, shape2 = 2.6) # Kwong et al
=
trans_object_mouth rbeta(n_sim, shape1 = 2, shape2 = 8) # Ozkaynak et al
# Surface areas
= list()
child_hand_surf_area_cm2 $B =
child_hand_surf_area_cm2rtri(n_sim, min = 74.59, max = 119.18, mode=98.31) #Kwong et al 2019
$T =
child_hand_surf_area_cm2rtri(n_sim, min = 80.01, max = 177.38, mode=121.18) #Kwong et al 2019
$C =
child_hand_surf_area_cm2rtri(n_sim, min = 139.63, max = 153.28, mode=146.47) #Kwong et al 2019
=
object_surf_area_cm2 rexp(n_sim,rate=0.11) #Kwong et al 2019
# Other miscellaneous constants
=
soil_handful_g 0.001 * rbeta(n_sim, shape1 = 5.3, shape2=158.6) #Kwong, et al 2019
=
hand_loading_g_cm2 0.001 * rtri(n_sim, min = 0, max = 0.95, mode=0.1) #Kwong et al 2019
=
obj_loading_g_cm2 0.001 * rlnorm(n_sim, meanlog = 0.11, sdlog = 2.0) #Ozkaynak et al # Treating objects like hands
=
temp find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
c(0.10,0.16,0.21,0.25,0.31,0.60),
c(0.21,0.05))
$convergence == 0 temp
[1] TRUE
print(c(mean = temp$par[1],sd = exp(temp$par[2])))
mean sd
0.2299022 0.1080376
=
child_frac_hand_touch_soil rtrunc(n_sim,
"norm",
a = 0, b =1,
mean = temp$par[1],
sd = exp(temp$par[2])) #Auyueng et al, 2008
=
temp find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
c(0.12,0.14,0.16,0.19,0.30,0.42),
c(0.16,log(0.05)))
$convergence == 0 temp
[1] TRUE
print(c(mean = temp$par[1],sd = exp(temp$par[2])))
mean sd
0.1933396 0.0730457
=
child_frac_hand_touch_mouth rtrunc(n_sim,
"norm",
a = 0, b =1,
mean = temp$par[1],
sd = exp(temp$par[2])) #Auyueng et al, 2008
=
temp find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
c(0.13,0.14,0.15,0.17,0.20,0.23),
c(0.15,log(0.05)))
$convergence == 0 temp
[1] TRUE
print(c(mean = temp$par[1],sd = exp(temp$par[2])))
mean sd
0.16015715 0.02538482
=
frac_hand_touch_object rtrunc(n_sim,
"norm",
a = 0, b =1,
mean = temp$par[1],
sd = exp(temp$par[2])) #Auyueng et al, 2008
=
max_obj_loading_g_cm2 0.001 * runif(n_sim, 6, 8) #Ozkaynak et al # Treating objects like hands
Set up objects for exposure calculations and compute for each MC draw. Note that since soil samples are exchangeable, we can simply use the first column of each y_prediction matrix.
= exposure_dose_per_hour = list()
exposure_dose_per_hour_culture for(age in c("B","T","C")){
=
exposure_dose_per_hour[[age]] =
exposure_dose_per_hour_culture[[age]] matrix(0.0, n_sim, length(y_fits),
dimnames = list(NULL, names(y_fits)))
}
for(age in names(exposure_dose_per_hour)){
for(j in names(y_fits)){
=
exposure_dose_per_hour[[age]][,j] ifelse(y_predictions[[j]][,1] == 0, 0, 10^y_predictions[[j]][,1]) *
*
( (trans_soil_hand *
child_hand_surf_area_cm2[[age]] *
child_frac_hand_touch_soil *
child_frac_hand_touch_mouth *
trans_hand_mouth *
hand_loading_g_cm2 sapply(1:n_sim,function(i){
min(behavior_draws$hand_to_soil[i,age],
$mouth_to_hand[i,age])
behavior_draws
})+
) *
( trans_object_hand *
child_hand_surf_area_cm2[[age]] *
frac_hand_touch_object *
child_frac_hand_touch_mouth *
trans_hand_mouth *
hand_loading_g_cm2 sapply(1:n_sim,function(i){
min(behavior_draws$hand_to_object[i,age],
$mouth_to_hand[i,age])
behavior_draws
})+
) *
( trans_object_mouth *
object_surf_area_cm2 sapply(1:n_sim,function(i){
min(obj_loading_g_cm2,
max_obj_loading_g_cm2)*
}) $mouth_to_object[,age]
behavior_draws+
) *
(soil_handful_g $geophagy[,age]))
behavior_draws
} }
Results
Tabular form
= list()
exposure_results_table for(j in names(exposure_dose_per_hour)){
=
exposure_results_table[[j]] tibble(Organism =
names(y_fits),
`Pr(+)` =
apply(exposure_dose_per_hour[[j]],2,function(x) mean(x > 0)),
`Median | +` =
apply(exposure_dose_per_hour[[j]],2,function(x) median(x[which(x > 0)])),
lower =
apply(exposure_dose_per_hour[[j]],2,function(x) quantile(x[which(x > 0)],probs = 0.025)),
upper =
apply(exposure_dose_per_hour[[j]],2,function(x) quantile(x[which(x > 0)],probs = 0.975))) %>%
mutate(`Median | +` = round(log10(`Median | +`),2),
`95% PI` = paste("(",
round(log10(lower),2),
", ",
round(log10(upper),2),
")",
sep = ""),.keep = "unused") %>%
rename_with( ~ paste(switch(j, B = "Baby", T = "Toddler", C = "Child"),
sep = "_"))
.x,
}
%>%
exposure_results_table do.call(what = bind_cols) %>%
select(-Toddler_Organism,-Child_Organism) %>%
rename(Organism = Baby_Organism) %>%
flextable() %>%
separate_header()
Organism | Baby | Toddler | Child | ||||||
---|---|---|---|---|---|---|---|---|---|
Pr(+) | Median | + | 95% PI | Pr(+) | Median | + | 95% PI | Pr(+) | Median | + | 95% PI | |
eaec_aata | 0.03005 | 2.96 | (0.6, 5.02) | 0.02945 | 2.95 | (-0.13, 5.05) | 0.02885 | 3.01 | (-0.27, 5.01) |
aeromonas | 0.06715 | 0.46 | (-2.99, 4.08) | 0.06620 | 0.49 | (-3.27, 4.11) | 0.06565 | 0.58 | (-3.49, 4.2) |
cholera | 0.11780 | 3.28 | (1.08, 5.36) | 0.11605 | 3.27 | (0.72, 5.4) | 0.11435 | 3.34 | (0.62, 5.34) |
eaec_aaic | 0.17330 | 0.84 | (-1.48, 5.84) | 0.17070 | 0.88 | (-2.16, 5.84) | 0.16880 | 0.91 | (-1.93, 5.58) |
epec_bfpa | 0.18140 | 0.97 | (-1.96, 4.26) | 0.17780 | 0.97 | (-2.22, 4.37) | 0.17660 | 1.04 | (-2.33, 4.35) |
etec_lt | 0.27610 | 0.69 | (-2.03, 3.44) | 0.27210 | 0.71 | (-2.24, 3.45) | 0.26915 | 0.75 | (-2.34, 3.5) |
epec_eae | 0.48060 | 1.68 | (-1.22, 4.75) | 0.47380 | 1.69 | (-1.38, 4.87) | 0.46845 | 1.73 | (-1.54, 4.88) |
Graphical form
=
exposure_results $B %>%
exposure_dose_per_houras_tibble() %>%
mutate(Age = "Baby") %>%
bind_rows(exposure_dose_per_hour$T %>%
as_tibble() %>%
mutate(Age = "Toddler")) %>%
bind_rows(exposure_dose_per_hour$C %>%
as_tibble() %>%
mutate(Age = "Child")) %>%
pivot_longer(!Age,
names_to = "Organism",
values_to = "Concentration") %>%
mutate(Organism =
case_match(Organism,
"eaec_aata" ~ "EAEC aata",
"aeromonas" ~ "Aeromonas",
"cholera" ~ "Cholera",
"eaec_aaic" ~ "EAEC aaic",
"epec_bfpa" ~ "EPEC bfpa",
"etec_lt" ~ "ETEC LT",
"epec_eae" ~ "EPEC eae"))
%>%
exposure_results group_by(Age,Organism) %>%
summarize_if(is.numeric,
list(`Pr(+)` =
function(x) mean(x > 0))) %>%
ggplot(aes(y = `Pr(+)`,
x = Organism,
fill = Age)) +
geom_bar(position = 'dodge', stat = 'identity') +
scale_fill_viridis_d() +
theme(
panel.background = element_rect(fill = "white"),
panel.grid.major = element_line(color = "white"),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black"),
axis.text.y = element_text(size = 16),
axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
axis.title = element_text(size = 18, face = "bold"),
axis.ticks = element_line(color = "black"),
legend.background = element_rect(fill = "white"),
legend.key = element_blank(),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18, face = "bold")
)
=
IQR_bounds %>%
exposure_results filter(!near(Concentration, 0)) %>%
mutate(Concentration = log10(Concentration)) %>%
group_by(Age,Organism) %>%
summarize(lower = quantile(Concentration,0.25) - 1.5 * IQR(Concentration),
upper = quantile(Concentration,0.75) + 1.5 * IQR(Concentration))
%>%
exposure_results filter(!near(Concentration, 0)) %>%
mutate(Concentration = log10(Concentration)) %>%
ggplot(aes(y = Concentration,
x = Organism,
fill = Age)) +
geom_violin(bounds = range(c(IQR_bounds$lower,
$upper))) +
IQR_boundsscale_fill_viridis_d() +
scale_y_continuous(breaks = c(-4:6),
labels = 10^c(-4:6)) +
geom_hline(yintercept = c(-4:6),
color = "gray30",
linetype = "dashed",
alpha = 0.5) +
theme(
panel.background = element_rect(fill = "white"),
panel.grid.major = element_line(color = "white"),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black"),
axis.text.y = element_text(size = 16),
axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
axis.title = element_text(size = 18, face = "bold"),
axis.ticks = element_line(color = "black"),
legend.background = element_rect(fill = "white"),
legend.key = element_blank(),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18, face = "bold")
)
Multipathogen exposure
Graphically display the probability of multipathogen exposure:
= list()
multipath_exposure for(j in names(exposure_dose_per_hour)){
=
multipath_exposure[[j]] tibble(Age = switch(j, B = "Baby", T = "Toddler", C = "Child"),
`N pathogens` =
apply(exposure_dose_per_hour[[j]],1,function(x) sum(x>0))
)
}%<>%
multipath_exposure bind_rows() %>%
mutate(`N pathogens` = ifelse(`N pathogens` >= 4, "4+",as.character(`N pathogens`))) %$%
table(Age, `N pathogens`) %>%
prop.table(1) %>%
as_tibble() %>%
rename(`Posterior prob` = n) %>%
mutate(`N pathogens` = factor(`N pathogens`,
levels = rev(c("0","1","2","3","4+"))))
%>%
multipath_exposure mutate(Age = factor(Age, levels = c("Baby","Toddler","Child"))) %>%
ggplot(aes(x = `Posterior prob`,
y = Age,
fill = `N pathogens`)) +
geom_bar(stat = "identity",position = "stack") +
theme(
panel.background = element_rect(fill = "white"),
panel.grid.major = element_line(color = "white"),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black"),
axis.text = element_text(size = 16),
axis.title = element_text(size = 18, face = "bold"),
axis.ticks = element_line(color = "black"),
legend.background = element_rect(fill = "white"),
legend.key = element_blank(),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18, face = "bold")
)
Correlation of dose
=
exposure_corr $B %>%
exposure_dose_per_houras_tibble() %>%
bind_rows(exposure_dose_per_hour$T %>%
as_tibble()) %>%
bind_rows(exposure_dose_per_hour$C %>%
as_tibble()) %>%
as.matrix() %>%
cor(method = "spearman")
rownames(exposure_corr) =
colnames(exposure_corr) %<>%
case_match("eaec_aata" ~ "EAEC aata",
"aeromonas" ~ "Aeromonas",
"cholera" ~ "Cholera",
"eaec_aaic" ~ "EAEC aaic",
"epec_bfpa" ~ "EPEC bfpa",
"etec_lt" ~ "ETEC LT",
"epec_eae" ~ "EPEC eae")
corrplot(exposure_corr,
type = "upper",
order = "hclust",
col = viridis_pal()(15),
method = "ellipse",
tl.col = "black",
addCoef.col = 'white')