Analysis for ‘A Multi-Pathogen Exposure Model for Children in Developing Communities’

Author

Stephanie A. Houser, Daniel K. Sewell, Danielle N. Medgyesi, John M. Brogan, Jean Philippe Creve-Coeur, Kelly K. Baker

Preamble

Load in required packages:

library(magrittr)
library(bayesplot)
library(flextable)
library(corrplot)
library(truncdist)
library(rstanarm)
library(brms)
library(triangulr)
library(tidyverse)
library(ggplot2)
library(viridis)

Read in data:

soil = read_csv("https://myweb.uiowa.edu/dksewell/public/haiti-soil.csv")
behavior = read_csv("https://myweb.uiowa.edu/dksewell/public/haiti-behavioral.csv")

Look at summary statistics

soil %>% 
  select(-SiteID) %>% 
  pivot_longer(cols = everything(),
               names_to = "Pathogen",
               values_to = "log10conc") %>% 
  group_by(Pathogen) %>% 
  summarize(Detection = sum(!is.na(log10conc)),
            `Mean concentration` = mean(10^na.omit(log10conc)),
            `Median concentration` = median(10^na.omit(log10conc)),
            `Min concentration` = min(10^na.omit(log10conc)),
            `Max concentration` = max(10^na.omit(log10conc))) %>% 
  flextable() %>% 
  colformat_double(digits = 0)

Pathogen	Detection	Mean concentration	Median concentration	Min concentration	Max concentration
Aeromonas	5	5,478	130	85	19,938
Cholera	9	484,112	477,807	111,195	873,786
EAEC aaic	13	478,779	934	455	6,200,779
EAEC aata	2	206,474	206,474	119,515	293,432
EPEC bfpa	14	10,996	2,368	210	75,628
EPEC eae	38	306,567	3,928	689	6,947,445
ETEC LT	22	18,851	644	79	385,419

behavior %>% 
  group_by(age) %>% 
  summarize(`Total time observed` = sum(time_min),
            `Geophagy/hr` = sum(geophagy)/sum(time_min) * 60,
            `Mouth-to-hand/hr` = sum(mouth_to_hand)/sum(time_min) * 60,
            `Mouth-to-object/hr` = sum(mouth_to_object)/sum(time_min) * 60,
            `Hand-to-object/hr` = sum(hand_to_object)/sum(time_min) * 60,
            `Hand-to-soil/hr` = sum(hand_to_soil)/sum(time_min) * 60) %>% 
  flextable() %>% 
  colformat_double(digits = 1)

age	Total time observed	Geophagy/hr	Mouth-to-hand/hr	Mouth-to-object/hr	Hand-to-object/hr	Hand-to-soil/hr
B	356.4	0.7	17.5	4.4	20.2	10.4
C	1,859.9	0.1	8.5	1.9	26.6	12.0
T	1,069.6	0.1	11.2	2.2	19.2	9.3

Soil pathogen concentration modeling

Letting \(z_{ij}\) denote the binary indicator of the presence of pathogen \(j\) for subject \(i\) and \(y_{ij}\) its \(log_{10}\) concentration for \(i=1,\ldots,N\) and \(j=1,\ldots,J\), we wish to model

\[\begin{align*} \pi\left( \{z_{ij},y_{ij}\}_{j=1}^J \right). \end{align*}\] We fix \(\pi(y_{ij}|z_{ij}=0) = \delta_0(y_{ij})\), i.e., the log_10 concentration distribution is a point mass on 0 if not present. Zero is selected for computational convenience and is suitable since all \(y_{ij}|z_{ij}=1>0\) in our dataset.

We accomplish this joint modeling through telescoping out the likelihood: \[\begin{align*} & \pi\left( \{z_{ij}\}_{j=1}^J, \{y_{ij}\}_{j:z_{ij}=1} \right) &\\ & = \pi(z_{i1})\pi(y_{i1}|z_{i1})\prod_{j=2}^J \pi(z_{ij}|y_{i1},\ldots,y_{i(j-1)})\pi(y_{ij}|z_{ij},y_{i1},\ldots,y_{i(j-1)}). \end{align*}\] Note that this decomposition assumes that all the information contained in \(z_{ik}\) is also contained in \(y_{ik}\), and so \(y_{ij}\), \(j>k\), is conditionally independent of \(z_{ik}\) given \(y_{ik}\). We model these distributions using generalized linear models (GLMs) using \(\{y_{ik}\}_{k<j}\) as covariates for predicting both \(z_{ij}\) and \(y_{ij}\).

Model fitting

# Create the "z's", i.e., the presence/absence variables, and
# set log10 conc of non-detects to zero.
soil %<>% 
  select(-SiteID) %>% 
  mutate_all(function(x) ifelse(is.na(x),0,x)) %>% 
  bind_cols(soil %>% 
              select(-SiteID) %>% 
              mutate_all(function(x) ifelse(is.na(x),0, 1)) %>% 
              rename_all(function(x) paste0(x,"_binary"))) %>% 
  janitor::clean_names()

# Get pathogen names
path_names = 
  soil %>% 
  select(-contains("_binary")) %>% 
  colnames()

# Create objects to store model fits
z_fits = 
  y_fits = 
  list()

# Look at order of prevalence
n_pos = 
  soil %>% 
  select(all_of(path_names)) %>% 
  as.matrix() %>% 
  apply(2,function(x) sum(x > 0))

EAEC aata only had 2 positive detects, not enough to statistically identify associations with any other pathogens.

z_fits$eaec_aata = 
  stan_glm(eaec_aata_binary ~ 1,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$eaec_aata)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      eaec_aata_binary ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -3.6    0.6 -4.4  -3.5  -2.8 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.0    0.0  0.0   0.0   0.1  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  6006 
mean_PPD      0.0  1.0  9916 
log-posterior 0.0  1.0  5438 

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$eaec_aata)

y_fits$eaec_aata = 
  stan_glm(eaec_aata ~ 1,
           family = gaussian(),
           data = 
             soil %>% 
             filter(eaec_aata_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$eaec_aata)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      eaec_aata ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 2
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) 5.3    0.2  5.0   5.3   5.5  
sigma       0.3    0.2  0.2   0.3   0.6  

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 5.3    0.4  4.9   5.3   5.7  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  5237 
sigma         0.0  1.0  5816 
mean_PPD      0.0  1.0  9832 
log-posterior 0.0  1.0  4387 

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$eaec_aata)

z_fits$aeromonas = 
  stan_glm(aeromonas_binary ~ 1,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$aeromonas)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      aeromonas_binary ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -2.7    0.4 -3.3  -2.7  -2.1 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.1    0.0  0.0   0.1   0.1  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  5303 
mean_PPD      0.0  1.0  9169 
log-posterior 0.0  1.0  5069 

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$aeromonas)

y_fits$aeromonas = 
  stan_glm(aeromonas ~ 1,
           family = gaussian(),
           data = 
             soil %>% 
             filter(aeromonas_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$aeromonas)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      aeromonas ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 5
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) 2.8    0.6  2.1   2.8   3.6  
sigma       1.3    0.5  0.8   1.2   1.9  

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 2.8    0.9  1.8   2.8   3.9  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0   7232
sigma         0.0  1.0   6639
mean_PPD      0.0  1.0  11282
log-posterior 0.0  1.0   5094

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$aeromonas)

z_fits$cholera = 
  stan_glm(cholera_binary ~ aeromonas,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$cholera)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      cholera_binary ~ aeromonas
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   2

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -2.3    0.4 -2.9  -2.3  -1.8 
aeromonas    0.8    0.4  0.4   0.8   1.3 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.1    0.0  0.1   0.1   0.2  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  11516
aeromonas     0.0  1.0  10880
mean_PPD      0.0  1.0  15621
log-posterior 0.0  1.0   7219

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$cholera)

y_fits$cholera = 
  stan_glm(cholera ~ aeromonas,
           family = gaussian(),
           data = 
             soil %>% 
             filter(cholera_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$cholera)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      cholera ~ aeromonas
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 9
 predictors:   2

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  5.5    0.2  5.3   5.5   5.7 
aeromonas    0.0    0.1 -0.1   0.0   0.2 
sigma        0.4    0.1  0.3   0.4   0.6 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 5.6    0.2  5.3   5.6   5.8  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  12120
aeromonas     0.0  1.0  13085
sigma         0.0  1.0  10149
mean_PPD      0.0  1.0  14891
log-posterior 0.0  1.0   6520

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$cholera)

z_fits$eaec_aaic = 
  stan_glm(eaec_aaic_binary ~ aeromonas + cholera,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$eaec_aaic)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      eaec_aaic_binary ~ aeromonas + cholera
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   3

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -1.8    0.3 -2.2  -1.8  -1.4 
aeromonas   -0.5    0.6 -1.3  -0.4   0.2 
cholera      0.2    0.2  0.0   0.2   0.4 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.2    0.1  0.1   0.2   0.2  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  15782
aeromonas     0.0  1.0  10597
cholera       0.0  1.0  14452
mean_PPD      0.0  1.0  18823
log-posterior 0.0  1.0   6963

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$eaec_aaic)

y_fits$eaec_aaic = 
  stan_glm(eaec_aaic ~ aeromonas + cholera,
           family = gaussian(),
           data = 
             soil %>% 
             filter(eaec_aaic_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$eaec_aaic)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      eaec_aaic ~ aeromonas + cholera
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 13
 predictors:   3

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  3.1    0.1  2.9   3.1   3.3 
aeromonas    1.7    0.3  1.4   1.7   2.1 
cholera      0.0    0.1 -0.1   0.0   0.1 
sigma        0.4    0.1  0.3   0.4   0.6 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 3.4    0.2  3.2   3.4   3.6  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  16470
aeromonas     0.0  1.0  11946
cholera       0.0  1.0  11931
sigma         0.0  1.0   9782
mean_PPD      0.0  1.0  17551
log-posterior 0.0  1.0   6087

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$eaec_aaic)

z_fits$epec_bfpa = 
  stan_glm(epec_bfpa_binary ~ aeromonas + cholera + eaec_aaic,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$epec_bfpa)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      epec_bfpa_binary ~ aeromonas + cholera + eaec_aaic
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   4

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -2.3    0.4 -2.9  -2.3  -1.8 
aeromonas   -0.8    0.9 -2.0  -0.6   0.3 
cholera     -0.2    0.3 -0.5  -0.2   0.1 
eaec_aaic    1.0    0.2  0.7   1.0   1.3 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.2    0.1  0.1   0.2   0.3  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  12700
aeromonas     0.0  1.0   9705
cholera       0.0  1.0  11473
eaec_aaic     0.0  1.0  10589
mean_PPD      0.0  1.0  19332
log-posterior 0.0  1.0   7017

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$epec_bfpa)

y_fits$epec_bfpa = 
  stan_glm(epec_bfpa ~ aeromonas + cholera + eaec_aaic,
           family = gaussian(),
           data = 
             soil %>% 
             filter(epec_bfpa_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$epec_bfpa)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      epec_bfpa ~ aeromonas + cholera + eaec_aaic
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 14
 predictors:   4

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  2.9    0.3  2.6   2.9   3.3 
aeromonas    0.8    0.5  0.1   0.8   1.5 
cholera     -0.2    0.2 -0.4  -0.2   0.0 
eaec_aaic    0.2    0.1  0.0   0.2   0.4 
sigma        0.7    0.2  0.5   0.7   0.9 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 3.3    0.3  3.0   3.3   3.7  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  15608
aeromonas     0.0  1.0  13547
cholera       0.0  1.0  13215
eaec_aaic     0.0  1.0  12866
sigma         0.0  1.0   9698
mean_PPD      0.0  1.0  17347
log-posterior 0.0  1.0   5626

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$epec_bfpa)

z_fits$etec_lt = 
  stan_glm(etec_lt_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$etec_lt)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      etec_lt_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   5

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -1.2    0.3 -1.6  -1.2  -0.8 
aeromonas    0.2    0.4 -0.3   0.2   0.6 
cholera      0.2    0.2  0.0   0.2   0.4 
eaec_aaic   -0.5    0.3 -0.9  -0.5  -0.1 
epec_bfpa    0.5    0.3  0.2   0.5   0.8 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.3    0.1  0.2   0.3   0.4  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  17486
aeromonas     0.0  1.0  15152
cholera       0.0  1.0  15344
eaec_aaic     0.0  1.0  11247
epec_bfpa     0.0  1.0  11941
mean_PPD      0.0  1.0  18282
log-posterior 0.0  1.0   8321

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$etec_lt)

y_fits$etec_lt = 
  stan_glm(etec_lt ~ aeromonas + cholera + eaec_aaic + epec_bfpa,
           family = gaussian(),
           data = 
             soil %>% 
             filter(etec_lt_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$etec_lt)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      etec_lt ~ aeromonas + cholera + eaec_aaic + epec_bfpa
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 22
 predictors:   5

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  2.8    0.2  2.5   2.8   3.1 
aeromonas    0.2    0.2 -0.1   0.2   0.4 
cholera      0.0    0.1 -0.1   0.0   0.1 
eaec_aaic    0.0    0.2 -0.2   0.0   0.2 
epec_bfpa    0.2    0.1  0.0   0.2   0.4 
sigma        0.8    0.1  0.6   0.8   1.0 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 3.0    0.2  2.7   3.0   3.3  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  20560
aeromonas     0.0  1.0  21620
cholera       0.0  1.0  18880
eaec_aaic     0.0  1.0  17555
epec_bfpa     0.0  1.0  17485
sigma         0.0  1.0  15093
mean_PPD      0.0  1.0  20887
log-posterior 0.0  1.0   7426

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$etec_lt)

z_fits$epec_eae = 
  stan_glm(epec_eae_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)

summary(z_fits$epec_eae)


Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      epec_eae_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa + 
       etec_lt
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   6

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -0.7    0.3 -1.1  -0.7  -0.3 
aeromonas   -0.1    0.4 -0.7  -0.1   0.4 
cholera      0.3    0.2  0.1   0.3   0.6 
eaec_aaic    0.1    0.3 -0.3   0.1   0.5 
epec_bfpa    1.2    0.5  0.7   1.2   1.8 
etec_lt      0.0    0.2 -0.3   0.0   0.2 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.5    0.1  0.4   0.5   0.6  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  30714
aeromonas     0.0  1.0  15601
cholera       0.0  1.0  15342
eaec_aaic     0.0  1.0  17128
epec_bfpa     0.0  1.0  10870
etec_lt       0.0  1.0  18482
mean_PPD      0.0  1.0  23957
log-posterior 0.0  1.0   8172

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(z_fits$epec_eae)

y_fits$epec_eae = 
  stan_glm(epec_eae ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt,
           family = gaussian(),
           data = 
             soil %>% 
             filter(epec_eae_binary == 1),
           iter = 10000,
           seed = 2024)

summary(y_fits$epec_eae)


Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      epec_eae ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 38
 predictors:   6

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  3.7    0.2  3.5   3.7   3.9 
aeromonas    0.1    0.2 -0.2   0.1   0.3 
cholera      0.0    0.1 -0.1   0.0   0.1 
eaec_aaic    0.1    0.1 -0.1   0.1   0.2 
epec_bfpa    0.4    0.1  0.3   0.4   0.5 
etec_lt     -0.2    0.1 -0.3  -0.2  -0.1 
sigma        0.8    0.1  0.6   0.8   0.9 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 4.0    0.2  3.8   4.0   4.2  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  19882
aeromonas     0.0  1.0  12866
cholera       0.0  1.0  12101
eaec_aaic     0.0  1.0  12432
epec_bfpa     0.0  1.0  12231
etec_lt       0.0  1.0  12554
sigma         0.0  1.0  12093
mean_PPD      0.0  1.0  18294
log-posterior 0.0  1.0   6736

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

mcmc_trace(y_fits$epec_eae)

Collate results

# Set number of digits for output
n_digits = 3

# Create tables for reporting results, one for presence/absence, 
# the other for log_10 concentration
results = 
  list(z = tibble(Covariate = c("(Intercept)",(n_pos %>% sort() %>% names())[-c(1,length(path_names))])),
       y = tibble(Covariate = c("(Intercept)",(n_pos %>% sort() %>% names())[-c(1,length(path_names))]))
  )
for(j in (n_pos %>% sort() %>% names())){
  results$z[[j]] = results$y[[j]] = ""
  
  pmedian = 
    z_fits[[j]] %>% 
    as.matrix() %>% 
    apply(2,median) %>% 
    round(n_digits)
  ci = 
    posterior_interval(z_fits[[j]],prob = 0.95) %>% 
    round(n_digits)
  for(k in 1:length(pmedian)){
    results$z[[j]][ match(gsub("_binary","",names(pmedian))[k],
                          results$z$Covariate) ] = 
      paste(pmedian[k],
            " (",
            ci[k,1],
            ", ",
            ci[k,2],
            ")",
            collapse = "",
            sep = "")
  }
  
  pmedian = 
    y_fits[[j]] %>% 
    as.matrix() %>% 
    apply(2,median) %>% 
    round(n_digits)
  pmedian = pmedian[-length(pmedian)]
  ci = 
    posterior_interval(z_fits[[j]],prob = 0.95) %>% 
    round(n_digits)
  for(k in 1:length(pmedian)){
    results$y[[j]][ match(names(pmedian)[k],
                          results$y$Covariate) ] = 
      paste(pmedian[k],
            " (",
            ci[k,1],
            ", ",
            ci[k,2],
            ")",
            collapse = "",
            sep = "")
  }
  
}

results$z %>% 
  flextable()

Covariate	eaec_aata	aeromonas	cholera	eaec_aaic	epec_bfpa	etec_lt	epec_eae
(Intercept)	-3.51 (-4.967, -2.477)	-2.654 (-3.633, -1.886)	-2.31 (-3.186, -1.605)	-1.765 (-2.482, -1.152)	-2.305 (-3.279, -1.526)	-1.191 (-1.838, -0.601)	-0.656 (-1.298, -0.055)
aeromonas			0.827 (0.115, 1.648)	-0.437 (-1.97, 0.454)	-0.641 (-2.851, 0.63)	0.158 (-0.629, 0.913)	-0.099 (-1.029, 0.715)
cholera				0.227 (-0.106, 0.538)	-0.194 (-0.769, 0.233)	0.196 (-0.129, 0.507)	0.336 (0.011, 0.739)
eaec_aaic					1.011 (0.559, 1.536)	-0.516 (-1.164, 0.032)	0.071 (-0.574, 0.708)
epec_bfpa						0.478 (-0.027, 1.029)	1.172 (0.468, 2.281)
etec_lt							-0.038 (-0.478, 0.379)

results$y %>% 
  flextable()

Covariate	eaec_aata	aeromonas	cholera	eaec_aaic	epec_bfpa	etec_lt	epec_eae
(Intercept)	5.272 (-4.967, -2.477)	2.849 (-3.633, -1.886)	5.526 (-3.186, -1.605)	3.077 (-2.482, -1.152)	2.945 (-3.279, -1.526)	2.797 (-1.838, -0.601)	3.703 (-1.298, -0.055)
aeromonas			0.048 (0.115, 1.648)	1.745 (-1.97, 0.454)	0.784 (-2.851, 0.63)	0.165 (-0.629, 0.913)	0.051 (-1.029, 0.715)
cholera				0.018 (-0.106, 0.538)	-0.186 (-0.769, 0.233)	-0.031 (-0.129, 0.507)	-0.01 (0.011, 0.739)
eaec_aaic					0.188 (0.559, 1.536)	0.023 (-1.164, 0.032)	0.058 (-0.574, 0.708)
epec_bfpa						0.186 (-0.027, 1.029)	0.392 (0.468, 2.281)
etec_lt							-0.191 (-0.478, 0.379)

Assess Model Fit

Model fit is assessed using the Bayesian p-value based on the following goodness of fit statistic: \[ \sum_{i=1}^N\sum_{j=1}^J\frac{(y_{ij} - \mathbb{E}(y_{ij})^2}{\text{Var}(y_{ij})} \] For those unfamiliar with Bayesian p-values, this is a well-established technique to assess model fit in the Bayesian framework. Values close to 0.5 indicate the model fits the data well, while values close to 0 or to 1 indicate a poor fit.

Note that with the sequential GLM modeling approach described above, we have \[\begin{align*} z_{ij} & \sim \text{Bernoulli}(p_{ij}) &\\ y_{ij} & \sim \begin{cases} \delta_0(y) & \text{ if }z_{ij}=0\\ N(\mu_{ij},\sigma^2_j)& \text{ if }z_{ij} = 1 \end{cases} & \\ \text{where logit} (p_{ij}) & = \alpha_{j0} + \sum_{k<j}y_{ik}\alpha_{jk}, &\\ \mu_{ij} & = \beta_{j0} + \sum_{k<j}y_{ik}\beta_{jk}. &\\ \text{Hence } \mathbb{E}(y_{ij}) & = \mathbb{E}(\mathbb{E}(y_{ij}|z_{ij})) &\\ & = \mathbb{E}(\mu_{ij}z_{ij}) &\\ & = \mu_{ij}p_{ij} & \\ \text{Var}(y_{ij}) & = \mathbb{E}(\text{Var}(y_{ij}|z_{ij})) + \text{Var}(\mathbb{E}(y_{ij}|z_{ij})) &\\ & = \mathbb{E}(\sigma^2_jz_{ij}) + \text{Var}(\mu_{ij}z_{ij}) &\\ & = \sigma^2_jp_{ij} + \mu_{ij}^2p_{ij}(1-p_{ij}). \end{align*}\]

First, get posterior samples of the model parameters:

z_samples = 
  z_fits %>% 
  lapply(as.matrix)
y_samples = 
  y_fits %>% 
  lapply(as.matrix)

Now we need to get samples from the posterior predictive density. This must be done sequentially according to how we fit the models:

set.seed(2024)
z_predictions = y_predictions = list()
## eaec_aata
z_predictions$eaec_aata =
  z_fits$eaec_aata %>% 
  posterior_predict()
y_predictions$eaec_aata = 
  y_fits$eaec_aata %>% 
  posterior_predict(newdata = data.frame(whatever = rep(1,nrow(soil))))
y_predictions$eaec_aata = 
  y_predictions$eaec_aata * z_predictions$eaec_aata
## aeromonas
z_predictions$aeromonas =
  z_fits$aeromonas %>% 
  posterior_predict()
y_predictions$aeromonas = 
  y_fits$aeromonas %>% 
  posterior_predict(newdata = data.frame(whatever = rep(1,nrow(soil))))
y_predictions$aeromonas = 
  y_predictions$aeromonas * z_predictions$aeromonas
## cholera
z_predictions$cholera = 
  z_samples$cholera[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$cholera[,"aeromonas"] * y_predictions$aeromonas #first term will be repeated correctly
z_predictions$cholera = 
  1.0 * (matrix(runif(prod(dim(z_predictions$cholera))),
                nrow(z_predictions$cholera),
                ncol(z_predictions$cholera)) < 
           (1.0 / (1.0 + exp(-z_predictions$cholera) ) ) )
y_predictions$cholera =
  y_samples$cholera[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$cholera[,"aeromonas"] * y_predictions$aeromonas + 
  matrix(rnorm(nrow(y_samples$cholera) * nrow(soil),
               sd = y_samples$cholera[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$cholera =
  y_predictions$cholera * z_predictions$cholera
## eaec_aaic
z_predictions$eaec_aaic = 
  z_samples$eaec_aaic[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$eaec_aaic[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$eaec_aaic[,"cholera"] * y_predictions$cholera
z_predictions$eaec_aaic = 
  1.0 * (matrix(runif(prod(dim(z_predictions$eaec_aaic))),
                nrow(z_predictions$eaec_aaic),
                ncol(z_predictions$eaec_aaic)) < 
           1.0 / (1.0 + exp(-z_predictions$eaec_aaic)))
y_predictions$eaec_aaic =
  y_samples$eaec_aaic[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$eaec_aaic[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$eaec_aaic[,"cholera"] * y_predictions$cholera +
  matrix(rnorm(nrow(y_samples$eaec_aaic) * nrow(soil),
               sd = y_samples$eaec_aaic[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$eaec_aaic =
  y_predictions$eaec_aaic * z_predictions$eaec_aaic
## epec_bfpa
z_predictions$epec_bfpa = 
  z_samples$epec_bfpa[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$epec_bfpa[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$epec_bfpa[,"cholera"] * y_predictions$cholera +
  z_samples$epec_bfpa[,"eaec_aaic"] * y_predictions$eaec_aaic
z_predictions$epec_bfpa = 
  1.0 * (matrix(runif(prod(dim(z_predictions$epec_bfpa))),
                nrow(z_predictions$epec_bfpa),
                ncol(z_predictions$epec_bfpa)) < 
           1.0 / (1.0 + exp(-z_predictions$epec_bfpa)))
y_predictions$epec_bfpa =
  y_samples$epec_bfpa[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$epec_bfpa[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$epec_bfpa[,"cholera"] * y_predictions$cholera +
  y_samples$epec_bfpa[,"eaec_aaic"] * y_predictions$eaec_aaic +
  matrix(rnorm(nrow(y_samples$epec_bfpa) * nrow(soil),
               sd = y_samples$epec_bfpa[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$epec_bfpa =
  y_predictions$epec_bfpa * z_predictions$epec_bfpa
## etec_lt
z_predictions$etec_lt = 
  z_samples$etec_lt[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$etec_lt[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$etec_lt[,"cholera"] * y_predictions$cholera +
  z_samples$etec_lt[,"eaec_aaic"] * y_predictions$eaec_aaic +
  z_samples$etec_lt[,"epec_bfpa"] * y_predictions$epec_bfpa
z_predictions$etec_lt = 
  1.0 * (matrix(runif(prod(dim(z_predictions$etec_lt))),
                nrow(z_predictions$etec_lt),
                ncol(z_predictions$etec_lt)) < 
           1.0 / (1.0 + exp(-z_predictions$etec_lt)))
y_predictions$etec_lt =
  y_samples$etec_lt[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$etec_lt[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$etec_lt[,"cholera"] * y_predictions$cholera +
  y_samples$etec_lt[,"eaec_aaic"] * y_predictions$eaec_aaic +
  y_samples$etec_lt[,"epec_bfpa"] * y_predictions$epec_bfpa +
  matrix(rnorm(nrow(y_samples$etec_lt) * nrow(soil),
               sd = y_samples$etec_lt[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$etec_lt =
  y_predictions$etec_lt * z_predictions$etec_lt
## epec_eae
z_predictions$epec_eae = 
  z_samples$epec_eae[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$epec_eae[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$epec_eae[,"cholera"] * y_predictions$cholera +
  z_samples$epec_eae[,"eaec_aaic"] * y_predictions$eaec_aaic +
  z_samples$epec_eae[,"epec_bfpa"] * y_predictions$epec_bfpa +
  z_samples$epec_eae[,"etec_lt"] * y_predictions$etec_lt
z_predictions$epec_eae = 
  1.0 * (matrix(runif(prod(dim(z_predictions$epec_eae))),
                nrow(z_predictions$epec_eae),
                ncol(z_predictions$epec_eae)) < 
           1.0 / (1.0 + exp(-z_predictions$epec_eae)))
y_predictions$epec_eae =
  y_samples$epec_eae[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$epec_eae[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$epec_eae[,"cholera"] * y_predictions$cholera +
  y_samples$epec_eae[,"eaec_aaic"] * y_predictions$eaec_aaic +
  y_samples$epec_eae[,"epec_bfpa"] * y_predictions$epec_bfpa +
  y_samples$epec_eae[,"etec_lt"] * y_predictions$etec_lt +
  matrix(rnorm(nrow(y_samples$epec_eae) * nrow(soil),
               sd = y_samples$epec_eae[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$epec_eae =
  y_predictions$epec_eae * z_predictions$epec_eae

Below is the function to obtain the GOF statistics (one for the observed data and one for the predictive sample) for a given posterior draw.

gof_stat = function(iter){
  dt.soil_sim = 
    soil %>% 
    dplyr::select(all_of(names(z_fits)))
  
  for(j in names(z_fits)) dt.soil_sim[[j]] = y_predictions[[j]][iter,]
  dt.soil_sim %<>%
    bind_cols(dt.soil_sim %>% 
                mutate_all(function(x) ifelse(is.na(x),0, 1)) %>% 
                rename_all(function(x) paste0(x,"_binary")))
  
  PrGr0 = 
    mu = 
    Var = 
    Exp = 
    matrix(0.0,nrow(dt.soil_sim),NROW(n_pos))
  
  ## Compute GOF stat for simulated data
  for(j in 1:NROW(n_pos)){
    mm = model.matrix(formula(z_fits[[j]]), # Formula is the same for z_fits and y_fits
                      data = dt.soil_sim)
    PrGr0[,j] = 
      1.0 / drop(1.0 + exp(-mm %*% z_samples[[j]][iter,]))
    
    mu[,j] = mm %*% y_samples[[j]][iter,-ncol(y_samples[[j]])]
    
    Exp[,j] = PrGr0[,j] * mu[,j]
    
    Var[,j] = 
      PrGr0[,j] * y_samples[[j]][iter,ncol(y_samples[[j]])]^2 +
      PrGr0[,j] * (1.0 - PrGr0[,j]) * mu[,j]^2
  }
  
  GOF_sim = 
    sum( (as.matrix(dt.soil_sim %>% 
                      dplyr::select(all_of(names(z_fits)))) - Exp)^2 / Var )
  
  ## Compute GOF stat for observed data
  for(j in 1:NROW(n_pos)){
    mm = model.matrix(formula(z_fits[[j]]),
                      data = soil)
    PrGr0[,j] = 
      1.0 / drop(1.0 + exp(-mm %*% z_samples[[j]][iter,]))
    
    mu[,j] = mm %*% y_samples[[j]][iter,-ncol(y_samples[[j]])]
    
    Exp[,j] = PrGr0[,j] * mu[,j]
    
    Var[,j] = 
      PrGr0[,j] * y_samples[[j]][iter,ncol(y_samples[[j]])]^2 +
      PrGr0[,j] * (1.0 - PrGr0[,j]) * mu[,j]^2
  }
  
  GOF_obs = 
    sum( (as.matrix(soil %>% 
                      dplyr::select(all_of(names(z_fits)))) - Exp)^2 / Var )
  
  return(c(GOF_sim = GOF_sim,
           GOF_obs = GOF_obs))
}

Now to compute and plot.

GOF_stats = 
  sapply(1:nrow(z_samples[[1]]),gof_stat) %>% 
  t() %>% 
  as_tibble()

plot(GOF_stats,
     pch = 16,
     col = c("indianred","steelblue")[1L + (GOF_stats$GOF_sim < GOF_stats$GOF_obs)],
     bty = 'l',
     xlim = quantile(GOF_stats$GOF_sim,c(0.01,0.99)),
     ylim = quantile(GOF_stats$GOF_obs,c(0.01,0.99)))
legend('topright',
       bty = 'n',
       cex = 2,
       legend = paste0("Bayesian p-value = ",round(mean(GOF_stats$GOF_sim < GOF_stats$GOF_obs),2)))

Behavioral modeling

Model fitting

We consider four different models: Poisson, negative binomial, zero-inflated Poisson (ZIP), and zero-inflated negative binomial (ZINB). We then use Bayes factors to determine the best fitting model.

Set up:

beh_names = names(behavior)[-c(1:3)]

b_fits = list()
for(j in beh_names) b_fits[[j]] = list()

For each model, we need to compile the STAN code. After that, we can simply change the data being fed into that STAN model and run the HMC algorithm.

b_fits$hand_to_object$poisson = 
  brm(y ~ offset(log(time_min)) + (1|age),
      family = "poisson",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$poisson = 
    b_fits$hand_to_object$poisson %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
           cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$poisson)) %>% 
  unlist() %>% 
  max()

[1] 1.012148

b_fits$hand_to_object$negbinomial = 
  brm(y ~ offset(log(time_min)) + (1|age),
      family = "negbinomial",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$negbinomial = 
    b_fits$hand_to_object$negbinomial %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
      cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$negbinomial)) %>% 
  unlist() %>% 
  max()

[1] 1.009607

b_fits$hand_to_object$zip = 
  brm(bf(y ~ offset(log(time_min)) + (1|age),
         zi = ~ log(time_min) + (1|age)),
      family = "zero_inflated_poisson",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$zip = 
    b_fits$hand_to_object$zip %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
           cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$zip)) %>% 
  unlist() %>% 
  max()

[1] 1.058392

b_fits$hand_to_object$zinb = 
  brm(bf(y ~ offset(log(time_min)) + (1|age),
         zi = ~ log(time_min) + (1|age)),
      family = "zero_inflated_negbinomial",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$zinb = 
    b_fits$hand_to_object$zinb %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
           cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$zinb)) %>% 
  unlist() %>% 
  max()

[1] 1.009326

Model selection

We select the best model for each behavior via Bayes factors.

bf_by_behavior = list()
for(j in names(b_fits)){
  cat(paste0("\n--- ",j))
  bf_by_behavior[[j]] = 
    matrix(0.0,4,4,
           dimnames = list(c("poisson","negbinomial","zip","zinb"),
                           c("poisson","negbinomial","zip","zinb")))
  bf_by_behavior[[j]][lower.tri(bf_by_behavior[[j]])] = NA
  diag(bf_by_behavior[[j]]) = NA
  bf_by_behavior[[j]][1,2] = 
    bayes_factor(b_fits[[j]]$poisson,
                 b_fits[[j]]$negbinomial)$bf
  bf_by_behavior[[j]][1,3] = 
    bayes_factor(b_fits[[j]]$poisson,
                 b_fits[[j]]$zip)$bf
  bf_by_behavior[[j]][1,4] = 
    bayes_factor(b_fits[[j]]$poisson,
                 b_fits[[j]]$zinb)$bf
  bf_by_behavior[[j]][2,3] = 
    bayes_factor(b_fits[[j]]$negbinomial,
                 b_fits[[j]]$zip)$bf
  bf_by_behavior[[j]][2,4] = 
    bayes_factor(b_fits[[j]]$negbinomial,
                 b_fits[[j]]$zinb)$bf
  bf_by_behavior[[j]][3,4] = 
    bayes_factor(b_fits[[j]]$zip,
                 b_fits[[j]]$zinb)$bf
}

best_fits = list()

for(j in names(bf_by_behavior)){
  bf_by_behavior[[j]][lower.tri(bf_by_behavior[[j]])] = 
    1.0 / t(bf_by_behavior[[j]])[lower.tri(bf_by_behavior[[j]])]
  diag(bf_by_behavior[[j]]) = 1.0
  
  best_model_name = 
    rownames(bf_by_behavior[[j]])[which(apply(bf_by_behavior[[j]],
                                              1,
                                              function(x) all(x >= 1.0)))]
  best_fits[[j]] = 
    b_fits[[j]][[best_model_name]]
}

Below are the estimated parameters of the behavioral distributions. Note that for the zero probabilities for the ZIP and ZINB, the results are on the scale of the linear predictor, i.e., the untransformed regression coefficients. This is because the log(minutes observed) was used as a covariate rather than an offset for this component.

best_fit_summary = 
  tibble(Behavior = rep(gsub("_","-",names(best_fits)),
                        each = 3),
         Age = rep(c("B","C","T"), length(best_fits)),
         `Poi mean` = NA,
         `NB mean` = NA,
         `NB dispersion` = NA,
         `ZIP Poi mean` = NA,
         `ZIP zero prob Intercept` = NA,
         `ZIP zero prob logtime (min)` = NA,
         `ZINB NB mean` = NA,
         `ZINB NB dispersion` = NA,
         `ZINB zero prob Intercept` = NA,
         `ZINB zero prob logtime (min)` = NA)
for(i in 1:length(best_fits)){
  if(best_fits[[i]]$family$family == "poisson"){
    best_fit_summary$`Poi mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
  }
  if(best_fits[[i]]$family$family == "negbinomial"){
    best_fit_summary$`NB mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
    best_fit_summary$`NB dispersion`[3 * (i - 1) + 1:3] = 
      summary(best_fits[[i]])$spec_pars$Estimate
  }
  if(best_fits[[i]]$family$family == "zero_inflated_poisson"){
    best_fit_summary$`ZIP Poi mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
    best_fit_summary$`ZIP zero prob Intercept`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_Intercept"]
    best_fit_summary$`ZIP zero prob logtime (min)`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_logtime_min"]
  }
  if(best_fits[[i]]$family$family == "zero_inflated_negbinomial"){
    best_fit_summary$`ZINB NB mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
    best_fit_summary$`ZINB NB dispersion`[3 * (i - 1) + 1:3] = 
      summary(best_fits[[i]])$spec_pars$Estimate
    best_fit_summary$`ZINB zero prob Intercept`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_Intercept"]
    best_fit_summary$`ZINB zero prob logtime (min)`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_logtime_min"]
  }
}

best_fit_summary %>% 
  flextable() %>% 
  merge_v(1) %>% 
  hline(1:3*3) %>% 
  colformat_double(digits = 3)

Behavior	Age	NB mean	NB dispersion	ZIP Poi mean	ZIP zero prob Intercept	ZIP zero prob logtime (min)
hand-to-object	B	0.367	2.338
	C	0.443	2.338
	T	0.361	2.338
hand-to-soil	B	0.187	0.854
	C	0.207	0.854
	T	0.186	0.854
mouth-to-object	B	0.062	0.319
	C	0.027	0.319
	T	0.042	0.319
geophagy	B			0.013	0.014	-1.523
	C			0.002	1.164	-1.523
	T			0.004	0.129	-1.523
mouth-to-hand	B	0.323	1.214
	C	0.161	1.214
	T	0.209	1.214

Monte Carlo simulation

First, we will draw new behavior counts in a 1hr period through the posterior predictive distributions.

set.seed(2024)
behavior_draws = list()

for(j in names(best_fits)){
  behavior_draws[[j]] = 
    best_fits[[j]] %>% 
    posterior_predict(newdata = tibble(age = c("B","T","C"),
                                       time_min = 60))
  colnames(behavior_draws[[j]]) = c("B","T","C")
}

behavior_draws_compiled = behavior_draws
for(j in names(behavior_draws_compiled)){
  behavior_draws_compiled[[j]] %<>% 
    as_tibble() %>% 
    mutate(Behavior = j) %>% 
    relocate(Behavior)
}
behavior_draws_compiled = 
  do.call(bind_rows,
          behavior_draws_compiled) %>% 
  pivot_longer(cols = B:C,
               names_to = "Age",
               values_to = "Count") %>% 
  mutate(Behavior = 
           case_match(Behavior,
                      "hand_to_object" ~ "Hand-to-object",
                      "hand_to_soil" ~ "Hand-to-soil",
                      "mouth_to_object" ~ "Mouth-to-object",
                      "geophagy" ~ "Geophagy",
                      "mouth_to_hand" ~ "Hand-to-mouth"),
         Age = 
           case_match(Age,
                      "B" ~ "Baby",
                      "T" ~ "Toddler",
                      "C" ~ "Child") %>% 
           factor(levels = c("Baby","Toddler","Child")))

behavior_summary = 
  behavior %>% 
  group_by(age) %>% 
  summarize(geophagy = sum(geophagy)/sum(time_min) * 60,
            hand_to_mouth = sum(mouth_to_hand)/sum(time_min) * 60,
            hand_to_object = sum(hand_to_object)/sum(time_min) * 60,
            hand_to_soil = sum(hand_to_soil)/sum(time_min) * 60,
            mouth_to_object = sum(mouth_to_object)/sum(time_min) * 60)

behavior_draws_compiled %>% 
  ggplot(aes(y = Count,
             x = Behavior,
             fill = Age)) +
  geom_boxplot(outliers = FALSE,
               col = "gray40") + 
  geom_point(data = data.frame(x = c(1:5 - 0.25,
                                     1:5,
                                     1:5 + 0.25),
                               y = c(unlist(behavior_summary[1,-1]),
                                     unlist(behavior_summary[3,-1]),
                                     unlist(behavior_summary[2,-1])),
                               Age = rep(c("Baby","Toddler","Child"),
                                         each = 5)),
             aes(y=y,x=x),
             col = "gray40",
             size = 3,
             pch = 8) +
  ylab("Count / hr") + 
  scale_fill_viridis_d() +
    theme(
      panel.background = element_rect(fill = "white"),
      panel.grid.major = element_line(color = "white"),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.text.y = element_text(size = 16),
      axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
      axis.title = element_text(size = 18, face = "bold"),
      axis.ticks = element_line(color = "black"),
      legend.background = element_rect(fill = "white"),
      legend.key = element_blank(),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18, face = "bold")
      )

Next we will draw values/fix constants from the extant literature.

n_sim = nrow(behavior_draws$mouth_to_hand)
set.seed(2024)

## Helper function to get mean and sd from empirical quantiles
find_mean_and_sd = function(probs,
                            obs_quantiles,
                            init_for_optim){
  
  objective_function = function(x){
    sum( (obs_quantiles - qnorm(probs,
                                mean = x[1],
                                sd = exp(x[2])))^2 )
  }
  
  optim(init_for_optim,
        objective_function)
}


## Transfer Efficiencies
trans_soil_hand =
  rlnorm(n_sim, meanlog = 0.11, sdlog = 2.0) #Ozkaynak et al
trans_object_hand =
  rtrunc(n_sim, "norm", a = 0, b =1, mean = 0.23, sd=0.22) # Julian, et al
trans_hand_mouth =
  rbeta(n_sim,shape1 = 5.2, shape2 = 2.6) # Kwong et al
trans_object_mouth =
  rbeta(n_sim, shape1 = 2, shape2 = 8)  # Ozkaynak et al

# Surface areas
child_hand_surf_area_cm2 = list()
child_hand_surf_area_cm2$B =
  rtri(n_sim, min = 74.59, max = 119.18, mode=98.31) #Kwong et al 2019
child_hand_surf_area_cm2$T =
  rtri(n_sim, min = 80.01, max = 177.38, mode=121.18) #Kwong et al 2019
child_hand_surf_area_cm2$C =
  rtri(n_sim, min = 139.63, max = 153.28, mode=146.47) #Kwong et al 2019
object_surf_area_cm2 =
  rexp(n_sim,rate=0.11) #Kwong et al 2019

# Other miscellaneous constants
soil_handful_g =
  0.001 * rbeta(n_sim, shape1 = 5.3, shape2=158.6) #Kwong, et al 2019

hand_loading_g_cm2 =
  0.001 * rtri(n_sim, min = 0, max = 0.95, mode=0.1) #Kwong et al 2019 

obj_loading_g_cm2 =
  0.001 * rlnorm(n_sim, meanlog = 0.11, sdlog = 2.0) #Ozkaynak et al # Treating objects like hands

temp = 
  find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
                   c(0.10,0.16,0.21,0.25,0.31,0.60),
                 c(0.21,0.05))
temp$convergence == 0

[1] TRUE

print(c(mean = temp$par[1],sd = exp(temp$par[2])))

     mean        sd 
0.2299022 0.1080376

child_frac_hand_touch_soil =
  rtrunc(n_sim, 
         "norm", 
         a = 0, b =1, 
         mean = temp$par[1], 
         sd = exp(temp$par[2])) #Auyueng et al, 2008

temp = 
  find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
                   c(0.12,0.14,0.16,0.19,0.30,0.42),
                   c(0.16,log(0.05)))
temp$convergence == 0

[1] TRUE

print(c(mean = temp$par[1],sd = exp(temp$par[2])))

     mean        sd 
0.1933396 0.0730457

child_frac_hand_touch_mouth =
  rtrunc(n_sim, 
         "norm", 
         a = 0, b =1, 
         mean = temp$par[1], 
         sd = exp(temp$par[2])) #Auyueng et al, 2008

temp = 
  find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
                   c(0.13,0.14,0.15,0.17,0.20,0.23),
                   c(0.15,log(0.05)))
temp$convergence == 0

[1] TRUE

print(c(mean = temp$par[1],sd = exp(temp$par[2])))

      mean         sd 
0.16015715 0.02538482

frac_hand_touch_object =
  rtrunc(n_sim, 
         "norm", 
         a = 0, b =1, 
         mean = temp$par[1], 
         sd = exp(temp$par[2])) #Auyueng et al, 2008

max_obj_loading_g_cm2 =
  0.001 * runif(n_sim, 6, 8) #Ozkaynak et al # Treating objects like hands

Set up objects for exposure calculations and compute for each MC draw. Note that since soil samples are exchangeable, we can simply use the first column of each y_prediction matrix.

exposure_dose_per_hour_culture = exposure_dose_per_hour = list()
for(age in c("B","T","C")){
  exposure_dose_per_hour[[age]] = 
  exposure_dose_per_hour_culture[[age]] = 
    matrix(0.0, n_sim, length(y_fits),
            dimnames = list(NULL, names(y_fits)))
} 

for(age in names(exposure_dose_per_hour)){
  for(j in names(y_fits)){
    exposure_dose_per_hour[[age]][,j] = 
      ifelse(y_predictions[[j]][,1] == 0, 0, 10^y_predictions[[j]][,1]) * 
      ( (trans_soil_hand * 
           child_hand_surf_area_cm2[[age]] * 
           child_frac_hand_touch_soil * 
           child_frac_hand_touch_mouth * 
           trans_hand_mouth * 
           hand_loading_g_cm2 * 
           sapply(1:n_sim,function(i){
             min(behavior_draws$hand_to_soil[i,age],
                 behavior_draws$mouth_to_hand[i,age])
           })
         ) + 
          ( trans_object_hand * 
              child_hand_surf_area_cm2[[age]] * 
              frac_hand_touch_object * 
              child_frac_hand_touch_mouth * 
              trans_hand_mouth * 
              hand_loading_g_cm2 * 
              sapply(1:n_sim,function(i){
                min(behavior_draws$hand_to_object[i,age],
                    behavior_draws$mouth_to_hand[i,age])
              })
            ) + 
          ( trans_object_mouth * 
              object_surf_area_cm2 *
              sapply(1:n_sim,function(i){
                min(obj_loading_g_cm2,
                    max_obj_loading_g_cm2)
                }) *
              behavior_draws$mouth_to_object[,age]
            ) + 
          (soil_handful_g * 
             behavior_draws$geophagy[,age]))
  }
}

Results

Tabular form

exposure_results_table = list() 
for(j in names(exposure_dose_per_hour)){
  exposure_results_table[[j]] = 
    tibble(Organism = 
             names(y_fits),
           `Pr(+)` = 
             apply(exposure_dose_per_hour[[j]],2,function(x) mean(x > 0)),
           `Median | +` = 
             apply(exposure_dose_per_hour[[j]],2,function(x) median(x[which(x > 0)])),
           lower = 
             apply(exposure_dose_per_hour[[j]],2,function(x) quantile(x[which(x > 0)],probs = 0.025)),
           upper = 
             apply(exposure_dose_per_hour[[j]],2,function(x) quantile(x[which(x > 0)],probs = 0.975))) %>% 
    mutate(`Median | +` = round(log10(`Median | +`),2),
           `95% PI` = paste("(",
                            round(log10(lower),2),
                            ", ",
                            round(log10(upper),2),
                            ")",
                            sep = ""),.keep = "unused") %>% 
    rename_with( ~ paste(switch(j, B = "Baby", T = "Toddler", C = "Child"),
                         .x, sep = "_"))
}

exposure_results_table %>%
  do.call(what = bind_cols) %>% 
  select(-Toddler_Organism,-Child_Organism) %>% 
  rename(Organism = Baby_Organism) %>% 
  flextable() %>% 
  separate_header()

Organism	Baby			Toddler			Child
Organism	Pr(+)	Median \| +	95% PI	Pr(+)	Median \| +	95% PI	Pr(+)	Median \| +	95% PI
eaec_aata	0.03005	2.96	(0.6, 5.02)	0.02945	2.95	(-0.13, 5.05)	0.02885	3.01	(-0.27, 5.01)
aeromonas	0.06715	0.46	(-2.99, 4.08)	0.06620	0.49	(-3.27, 4.11)	0.06565	0.58	(-3.49, 4.2)
cholera	0.11780	3.28	(1.08, 5.36)	0.11605	3.27	(0.72, 5.4)	0.11435	3.34	(0.62, 5.34)
eaec_aaic	0.17330	0.84	(-1.48, 5.84)	0.17070	0.88	(-2.16, 5.84)	0.16880	0.91	(-1.93, 5.58)
epec_bfpa	0.18140	0.97	(-1.96, 4.26)	0.17780	0.97	(-2.22, 4.37)	0.17660	1.04	(-2.33, 4.35)
etec_lt	0.27610	0.69	(-2.03, 3.44)	0.27210	0.71	(-2.24, 3.45)	0.26915	0.75	(-2.34, 3.5)
epec_eae	0.48060	1.68	(-1.22, 4.75)	0.47380	1.69	(-1.38, 4.87)	0.46845	1.73	(-1.54, 4.88)

Graphical form

exposure_results =  
  exposure_dose_per_hour$B %>% 
  as_tibble() %>% 
  mutate(Age = "Baby") %>% 
  bind_rows(exposure_dose_per_hour$T %>% 
              as_tibble() %>% 
              mutate(Age = "Toddler")) %>% 
  bind_rows(exposure_dose_per_hour$C %>% 
              as_tibble() %>% 
              mutate(Age = "Child")) %>% 
  pivot_longer(!Age,
               names_to = "Organism",
               values_to = "Concentration") %>% 
  mutate(Organism = 
           case_match(Organism,
                      "eaec_aata" ~ "EAEC aata",
                      "aeromonas" ~ "Aeromonas",
                      "cholera" ~ "Cholera",
                      "eaec_aaic" ~ "EAEC aaic",
                      "epec_bfpa" ~ "EPEC bfpa",
                      "etec_lt" ~ "ETEC LT",
                      "epec_eae" ~ "EPEC eae"))

exposure_results %>%  
  group_by(Age,Organism) %>% 
  summarize_if(is.numeric,
               list(`Pr(+)` = 
                      function(x) mean(x > 0))) %>% 
  ggplot(aes(y = `Pr(+)`,
             x = Organism,
             fill = Age)) + 
  geom_bar(position = 'dodge', stat = 'identity') + 
  scale_fill_viridis_d() +
    theme(
      panel.background = element_rect(fill = "white"),
      panel.grid.major = element_line(color = "white"),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.text.y = element_text(size = 16),
      axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
      axis.title = element_text(size = 18, face = "bold"),
      axis.ticks = element_line(color = "black"),
      legend.background = element_rect(fill = "white"),
      legend.key = element_blank(),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18, face = "bold")
      )

IQR_bounds =  
  exposure_results %>% 
  filter(!near(Concentration, 0)) %>% 
  mutate(Concentration = log10(Concentration)) %>% 
  group_by(Age,Organism) %>% 
  summarize(lower = quantile(Concentration,0.25) - 1.5 * IQR(Concentration),
            upper = quantile(Concentration,0.75) + 1.5 * IQR(Concentration))

exposure_results %>% 
  filter(!near(Concentration, 0)) %>% 
  mutate(Concentration = log10(Concentration)) %>% 
  ggplot(aes(y = Concentration,
             x = Organism,
             fill = Age)) +
  geom_violin(bounds = range(c(IQR_bounds$lower,
                               IQR_bounds$upper))) +
  scale_fill_viridis_d() +
  scale_y_continuous(breaks = c(-4:6),
                     labels = 10^c(-4:6)) +
  geom_hline(yintercept = c(-4:6), 
             color = "gray30",
             linetype = "dashed",
             alpha = 0.5) +
    theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(color = "white"),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "black"),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
    axis.title = element_text(size = 18, face = "bold"),
    axis.ticks = element_line(color = "black"),
    legend.background = element_rect(fill = "white"),
    legend.key = element_blank(),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 18, face = "bold")
    )

Multipathogen exposure

Graphically display the probability of multipathogen exposure:

multipath_exposure = list() 
for(j in names(exposure_dose_per_hour)){
  multipath_exposure[[j]] = 
    tibble(Age = switch(j, B = "Baby", T = "Toddler", C = "Child"),
           `N pathogens` = 
             apply(exposure_dose_per_hour[[j]],1,function(x) sum(x>0))
    )
}
multipath_exposure %<>%
  bind_rows() %>% 
  mutate(`N pathogens` = ifelse(`N pathogens` >= 4, "4+",as.character(`N pathogens`))) %$% 
  table(Age, `N pathogens`) %>% 
  prop.table(1) %>% 
  as_tibble() %>% 
  rename(`Posterior prob` = n) %>% 
  mutate(`N pathogens` = factor(`N pathogens`, 
                                levels = rev(c("0","1","2","3","4+"))))

multipath_exposure %>%  
  mutate(Age = factor(Age, levels = c("Baby","Toddler","Child"))) %>% 
  ggplot(aes(x = `Posterior prob`,
             y = Age,
             fill = `N pathogens`)) + 
  geom_bar(stat = "identity",position = "stack") +
      theme(
      panel.background = element_rect(fill = "white"),
      panel.grid.major = element_line(color = "white"),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.text = element_text(size = 16),
      axis.title = element_text(size = 18, face = "bold"),
      axis.ticks = element_line(color = "black"),
      legend.background = element_rect(fill = "white"),
      legend.key = element_blank(),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18, face = "bold")
      )

Correlation of dose

exposure_corr = 
  exposure_dose_per_hour$B %>% 
  as_tibble() %>% 
  bind_rows(exposure_dose_per_hour$T %>% 
              as_tibble()) %>% 
  bind_rows(exposure_dose_per_hour$C %>% 
              as_tibble()) %>% 
  as.matrix() %>% 
  cor(method = "spearman")
rownames(exposure_corr) = 
  colnames(exposure_corr) %<>%
  case_match("eaec_aata" ~ "EAEC aata",
             "aeromonas" ~ "Aeromonas",
             "cholera" ~ "Cholera",
             "eaec_aaic" ~ "EAEC aaic",
             "epec_bfpa" ~ "EPEC bfpa",
             "etec_lt" ~ "ETEC LT",
             "epec_eae" ~ "EPEC eae")

corrplot(exposure_corr, 
         type = "upper", 
         order = "hclust",
         col = viridis_pal()(15), 
         method = "ellipse",
         tl.col = "black",
         addCoef.col = 'white')