Analysis for ‘A Multi-Pathogen Exposure Model for Children in Developing Communities’

Author

Stephanie A. Houser, Daniel K. Sewell, Danielle N. Medgyesi, John M. Brogan, Jean Philippe Creve-Coeur, Kelly K. Baker

Preamble

Load in required packages:

library(magrittr)
library(bayesplot)
library(flextable)
library(corrplot)
library(truncdist)
library(rstanarm)
library(brms)
library(triangulr)
library(tidyverse)
library(ggplot2)
library(viridis)

Read in data:

soil = read_csv("https://myweb.uiowa.edu/dksewell/public/haiti-soil.csv")
behavior = read_csv("https://myweb.uiowa.edu/dksewell/public/haiti-behavioral.csv")

Look at summary statistics

soil %>% 
  select(-SiteID) %>% 
  pivot_longer(cols = everything(),
               names_to = "Pathogen",
               values_to = "log10conc") %>% 
  group_by(Pathogen) %>% 
  summarize(Detection = sum(!is.na(log10conc)),
            `Mean concentration` = mean(10^na.omit(log10conc)),
            `Median concentration` = median(10^na.omit(log10conc)),
            `Min concentration` = min(10^na.omit(log10conc)),
            `Max concentration` = max(10^na.omit(log10conc))) %>% 
  flextable() %>% 
  colformat_double(digits = 0)

Pathogen

Detection

Mean concentration

Median concentration

Min concentration

Max concentration

Aeromonas

5

5,478

130

85

19,938

Cholera

9

484,112

477,807

111,195

873,786

EAEC aaic

13

478,779

934

455

6,200,779

EAEC aata

2

206,474

206,474

119,515

293,432

EPEC bfpa

14

10,996

2,368

210

75,628

EPEC eae

38

306,567

3,928

689

6,947,445

ETEC LT

22

18,851

644

79

385,419

behavior %>% 
  group_by(age) %>% 
  summarize(`Total time observed` = sum(time_min),
            `Geophagy/hr` = sum(geophagy)/sum(time_min) * 60,
            `Mouth-to-hand/hr` = sum(mouth_to_hand)/sum(time_min) * 60,
            `Mouth-to-object/hr` = sum(mouth_to_object)/sum(time_min) * 60,
            `Hand-to-object/hr` = sum(hand_to_object)/sum(time_min) * 60,
            `Hand-to-soil/hr` = sum(hand_to_soil)/sum(time_min) * 60) %>% 
  flextable() %>% 
  colformat_double(digits = 1)

age

Total time observed

Geophagy/hr

Mouth-to-hand/hr

Mouth-to-object/hr

Hand-to-object/hr

Hand-to-soil/hr

B

356.4

0.7

17.5

4.4

20.2

10.4

C

1,859.9

0.1

8.5

1.9

26.6

12.0

T

1,069.6

0.1

11.2

2.2

19.2

9.3

Soil pathogen concentration modeling

Letting \(z_{ij}\) denote the binary indicator of the presence of pathogen \(j\) for subject \(i\) and \(y_{ij}\) its \(log_{10}\) concentration for \(i=1,\ldots,N\) and \(j=1,\ldots,J\), we wish to model

\[\begin{align*} \pi\left( \{z_{ij},y_{ij}\}_{j=1}^J \right). \end{align*}\] We fix \(\pi(y_{ij}|z_{ij}=0) = \delta_0(y_{ij})\), i.e., the log_10 concentration distribution is a point mass on 0 if not present. Zero is selected for computational convenience and is suitable since all \(y_{ij}|z_{ij}=1>0\) in our dataset.

We accomplish this joint modeling through telescoping out the likelihood: \[\begin{align*} & \pi\left( \{z_{ij}\}_{j=1}^J, \{y_{ij}\}_{j:z_{ij}=1} \right) &\\ & = \pi(z_{i1})\pi(y_{i1}|z_{i1})\prod_{j=2}^J \pi(z_{ij}|y_{i1},\ldots,y_{i(j-1)})\pi(y_{ij}|z_{ij},y_{i1},\ldots,y_{i(j-1)}). \end{align*}\] Note that this decomposition assumes that all the information contained in \(z_{ik}\) is also contained in \(y_{ik}\), and so \(y_{ij}\), \(j>k\), is conditionally independent of \(z_{ik}\) given \(y_{ik}\). We model these distributions using generalized linear models (GLMs) using \(\{y_{ik}\}_{k<j}\) as covariates for predicting both \(z_{ij}\) and \(y_{ij}\).

Model fitting

# Create the "z's", i.e., the presence/absence variables, and
# set log10 conc of non-detects to zero.
soil %<>% 
  select(-SiteID) %>% 
  mutate_all(function(x) ifelse(is.na(x),0,x)) %>% 
  bind_cols(soil %>% 
              select(-SiteID) %>% 
              mutate_all(function(x) ifelse(is.na(x),0, 1)) %>% 
              rename_all(function(x) paste0(x,"_binary"))) %>% 
  janitor::clean_names()

# Get pathogen names
path_names = 
  soil %>% 
  select(-contains("_binary")) %>% 
  colnames()

# Create objects to store model fits
z_fits = 
  y_fits = 
  list()

# Look at order of prevalence
n_pos = 
  soil %>% 
  select(all_of(path_names)) %>% 
  as.matrix() %>% 
  apply(2,function(x) sum(x > 0))

EAEC aata only had 2 positive detects, not enough to statistically identify associations with any other pathogens.

z_fits$eaec_aata = 
  stan_glm(eaec_aata_binary ~ 1,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$eaec_aata)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      eaec_aata_binary ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -3.6    0.6 -4.4  -3.5  -2.8 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.0    0.0  0.0   0.0   0.1  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  6006 
mean_PPD      0.0  1.0  9916 
log-posterior 0.0  1.0  5438 

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$eaec_aata)

y_fits$eaec_aata = 
  stan_glm(eaec_aata ~ 1,
           family = gaussian(),
           data = 
             soil %>% 
             filter(eaec_aata_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$eaec_aata)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      eaec_aata ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 2
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) 5.3    0.2  5.0   5.3   5.5  
sigma       0.3    0.2  0.2   0.3   0.6  

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 5.3    0.4  4.9   5.3   5.7  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  5237 
sigma         0.0  1.0  5816 
mean_PPD      0.0  1.0  9832 
log-posterior 0.0  1.0  4387 

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$eaec_aata)

z_fits$aeromonas = 
  stan_glm(aeromonas_binary ~ 1,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$aeromonas)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      aeromonas_binary ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -2.7    0.4 -3.3  -2.7  -2.1 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.1    0.0  0.0   0.1   0.1  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  5303 
mean_PPD      0.0  1.0  9169 
log-posterior 0.0  1.0  5069 

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$aeromonas)

y_fits$aeromonas = 
  stan_glm(aeromonas ~ 1,
           family = gaussian(),
           data = 
             soil %>% 
             filter(aeromonas_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$aeromonas)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      aeromonas ~ 1
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 5
 predictors:   1

Estimates:
              mean   sd   10%   50%   90%
(Intercept) 2.8    0.6  2.1   2.8   3.6  
sigma       1.3    0.5  0.8   1.2   1.9  

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 2.8    0.9  1.8   2.8   3.9  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0   7232
sigma         0.0  1.0   6639
mean_PPD      0.0  1.0  11282
log-posterior 0.0  1.0   5094

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$aeromonas)

z_fits$cholera = 
  stan_glm(cholera_binary ~ aeromonas,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$cholera)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      cholera_binary ~ aeromonas
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   2

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -2.3    0.4 -2.9  -2.3  -1.8 
aeromonas    0.8    0.4  0.4   0.8   1.3 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.1    0.0  0.1   0.1   0.2  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  11516
aeromonas     0.0  1.0  10880
mean_PPD      0.0  1.0  15621
log-posterior 0.0  1.0   7219

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$cholera)

y_fits$cholera = 
  stan_glm(cholera ~ aeromonas,
           family = gaussian(),
           data = 
             soil %>% 
             filter(cholera_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$cholera)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      cholera ~ aeromonas
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 9
 predictors:   2

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  5.5    0.2  5.3   5.5   5.7 
aeromonas    0.0    0.1 -0.1   0.0   0.2 
sigma        0.4    0.1  0.3   0.4   0.6 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 5.6    0.2  5.3   5.6   5.8  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  12120
aeromonas     0.0  1.0  13085
sigma         0.0  1.0  10149
mean_PPD      0.0  1.0  14891
log-posterior 0.0  1.0   6520

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$cholera)

z_fits$eaec_aaic = 
  stan_glm(eaec_aaic_binary ~ aeromonas + cholera,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$eaec_aaic)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      eaec_aaic_binary ~ aeromonas + cholera
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   3

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -1.8    0.3 -2.2  -1.8  -1.4 
aeromonas   -0.5    0.6 -1.3  -0.4   0.2 
cholera      0.2    0.2  0.0   0.2   0.4 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.2    0.1  0.1   0.2   0.2  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  15782
aeromonas     0.0  1.0  10597
cholera       0.0  1.0  14452
mean_PPD      0.0  1.0  18823
log-posterior 0.0  1.0   6963

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$eaec_aaic)

y_fits$eaec_aaic = 
  stan_glm(eaec_aaic ~ aeromonas + cholera,
           family = gaussian(),
           data = 
             soil %>% 
             filter(eaec_aaic_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$eaec_aaic)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      eaec_aaic ~ aeromonas + cholera
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 13
 predictors:   3

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  3.1    0.1  2.9   3.1   3.3 
aeromonas    1.7    0.3  1.4   1.7   2.1 
cholera      0.0    0.1 -0.1   0.0   0.1 
sigma        0.4    0.1  0.3   0.4   0.6 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 3.4    0.2  3.2   3.4   3.6  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  16470
aeromonas     0.0  1.0  11946
cholera       0.0  1.0  11931
sigma         0.0  1.0   9782
mean_PPD      0.0  1.0  17551
log-posterior 0.0  1.0   6087

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$eaec_aaic)

z_fits$epec_bfpa = 
  stan_glm(epec_bfpa_binary ~ aeromonas + cholera + eaec_aaic,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$epec_bfpa)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      epec_bfpa_binary ~ aeromonas + cholera + eaec_aaic
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   4

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -2.3    0.4 -2.9  -2.3  -1.8 
aeromonas   -0.8    0.9 -2.0  -0.6   0.3 
cholera     -0.2    0.3 -0.5  -0.2   0.1 
eaec_aaic    1.0    0.2  0.7   1.0   1.3 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.2    0.1  0.1   0.2   0.3  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  12700
aeromonas     0.0  1.0   9705
cholera       0.0  1.0  11473
eaec_aaic     0.0  1.0  10589
mean_PPD      0.0  1.0  19332
log-posterior 0.0  1.0   7017

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$epec_bfpa)

y_fits$epec_bfpa = 
  stan_glm(epec_bfpa ~ aeromonas + cholera + eaec_aaic,
           family = gaussian(),
           data = 
             soil %>% 
             filter(epec_bfpa_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$epec_bfpa)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      epec_bfpa ~ aeromonas + cholera + eaec_aaic
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 14
 predictors:   4

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  2.9    0.3  2.6   2.9   3.3 
aeromonas    0.8    0.5  0.1   0.8   1.5 
cholera     -0.2    0.2 -0.4  -0.2   0.0 
eaec_aaic    0.2    0.1  0.0   0.2   0.4 
sigma        0.7    0.2  0.5   0.7   0.9 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 3.3    0.3  3.0   3.3   3.7  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  15608
aeromonas     0.0  1.0  13547
cholera       0.0  1.0  13215
eaec_aaic     0.0  1.0  12866
sigma         0.0  1.0   9698
mean_PPD      0.0  1.0  17347
log-posterior 0.0  1.0   5626

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$epec_bfpa)

z_fits$etec_lt = 
  stan_glm(etec_lt_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$etec_lt)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      etec_lt_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   5

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -1.2    0.3 -1.6  -1.2  -0.8 
aeromonas    0.2    0.4 -0.3   0.2   0.6 
cholera      0.2    0.2  0.0   0.2   0.4 
eaec_aaic   -0.5    0.3 -0.9  -0.5  -0.1 
epec_bfpa    0.5    0.3  0.2   0.5   0.8 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.3    0.1  0.2   0.3   0.4  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  17486
aeromonas     0.0  1.0  15152
cholera       0.0  1.0  15344
eaec_aaic     0.0  1.0  11247
epec_bfpa     0.0  1.0  11941
mean_PPD      0.0  1.0  18282
log-posterior 0.0  1.0   8321

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$etec_lt)

y_fits$etec_lt = 
  stan_glm(etec_lt ~ aeromonas + cholera + eaec_aaic + epec_bfpa,
           family = gaussian(),
           data = 
             soil %>% 
             filter(etec_lt_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$etec_lt)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      etec_lt ~ aeromonas + cholera + eaec_aaic + epec_bfpa
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 22
 predictors:   5

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  2.8    0.2  2.5   2.8   3.1 
aeromonas    0.2    0.2 -0.1   0.2   0.4 
cholera      0.0    0.1 -0.1   0.0   0.1 
eaec_aaic    0.0    0.2 -0.2   0.0   0.2 
epec_bfpa    0.2    0.1  0.0   0.2   0.4 
sigma        0.8    0.1  0.6   0.8   1.0 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 3.0    0.2  2.7   3.0   3.3  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  20560
aeromonas     0.0  1.0  21620
cholera       0.0  1.0  18880
eaec_aaic     0.0  1.0  17555
epec_bfpa     0.0  1.0  17485
sigma         0.0  1.0  15093
mean_PPD      0.0  1.0  20887
log-posterior 0.0  1.0   7426

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$etec_lt)

z_fits$epec_eae = 
  stan_glm(epec_eae_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt,
           family = binomial(),
           data = soil,
           iter = 10000,
           seed = 2024)
summary(z_fits$epec_eae)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      epec_eae_binary ~ aeromonas + cholera + eaec_aaic + epec_bfpa + 
       etec_lt
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 79
 predictors:   6

Estimates:
              mean   sd   10%   50%   90%
(Intercept) -0.7    0.3 -1.1  -0.7  -0.3 
aeromonas   -0.1    0.4 -0.7  -0.1   0.4 
cholera      0.3    0.2  0.1   0.3   0.6 
eaec_aaic    0.1    0.3 -0.3   0.1   0.5 
epec_bfpa    1.2    0.5  0.7   1.2   1.8 
etec_lt      0.0    0.2 -0.3   0.0   0.2 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 0.5    0.1  0.4   0.5   0.6  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  30714
aeromonas     0.0  1.0  15601
cholera       0.0  1.0  15342
eaec_aaic     0.0  1.0  17128
epec_bfpa     0.0  1.0  10870
etec_lt       0.0  1.0  18482
mean_PPD      0.0  1.0  23957
log-posterior 0.0  1.0   8172

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(z_fits$epec_eae)

y_fits$epec_eae = 
  stan_glm(epec_eae ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt,
           family = gaussian(),
           data = 
             soil %>% 
             filter(epec_eae_binary == 1),
           iter = 10000,
           seed = 2024)
summary(y_fits$epec_eae)

Model Info:
 function:     stan_glm
 family:       gaussian [identity]
 formula:      epec_eae ~ aeromonas + cholera + eaec_aaic + epec_bfpa + etec_lt
 algorithm:    sampling
 sample:       20000 (posterior sample size)
 priors:       see help('prior_summary')
 observations: 38
 predictors:   6

Estimates:
              mean   sd   10%   50%   90%
(Intercept)  3.7    0.2  3.5   3.7   3.9 
aeromonas    0.1    0.2 -0.2   0.1   0.3 
cholera      0.0    0.1 -0.1   0.0   0.1 
eaec_aaic    0.1    0.1 -0.1   0.1   0.2 
epec_bfpa    0.4    0.1  0.3   0.4   0.5 
etec_lt     -0.2    0.1 -0.3  -0.2  -0.1 
sigma        0.8    0.1  0.6   0.8   0.9 

Fit Diagnostics:
           mean   sd   10%   50%   90%
mean_PPD 4.0    0.2  3.8   4.0   4.2  

The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).

MCMC diagnostics
              mcse Rhat n_eff
(Intercept)   0.0  1.0  19882
aeromonas     0.0  1.0  12866
cholera       0.0  1.0  12101
eaec_aaic     0.0  1.0  12432
epec_bfpa     0.0  1.0  12231
etec_lt       0.0  1.0  12554
sigma         0.0  1.0  12093
mean_PPD      0.0  1.0  18294
log-posterior 0.0  1.0   6736

For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
mcmc_trace(y_fits$epec_eae)

Collate results

# Set number of digits for output
n_digits = 3

# Create tables for reporting results, one for presence/absence, 
# the other for log_10 concentration
results = 
  list(z = tibble(Covariate = c("(Intercept)",(n_pos %>% sort() %>% names())[-c(1,length(path_names))])),
       y = tibble(Covariate = c("(Intercept)",(n_pos %>% sort() %>% names())[-c(1,length(path_names))]))
  )
for(j in (n_pos %>% sort() %>% names())){
  results$z[[j]] = results$y[[j]] = ""
  
  pmedian = 
    z_fits[[j]] %>% 
    as.matrix() %>% 
    apply(2,median) %>% 
    round(n_digits)
  ci = 
    posterior_interval(z_fits[[j]],prob = 0.95) %>% 
    round(n_digits)
  for(k in 1:length(pmedian)){
    results$z[[j]][ match(gsub("_binary","",names(pmedian))[k],
                          results$z$Covariate) ] = 
      paste(pmedian[k],
            " (",
            ci[k,1],
            ", ",
            ci[k,2],
            ")",
            collapse = "",
            sep = "")
  }
  
  pmedian = 
    y_fits[[j]] %>% 
    as.matrix() %>% 
    apply(2,median) %>% 
    round(n_digits)
  pmedian = pmedian[-length(pmedian)]
  ci = 
    posterior_interval(z_fits[[j]],prob = 0.95) %>% 
    round(n_digits)
  for(k in 1:length(pmedian)){
    results$y[[j]][ match(names(pmedian)[k],
                          results$y$Covariate) ] = 
      paste(pmedian[k],
            " (",
            ci[k,1],
            ", ",
            ci[k,2],
            ")",
            collapse = "",
            sep = "")
  }
  
}
results$z %>% 
  flextable()

Covariate

eaec_aata

aeromonas

cholera

eaec_aaic

epec_bfpa

etec_lt

epec_eae

(Intercept)

-3.51 (-4.967, -2.477)

-2.654 (-3.633, -1.886)

-2.31 (-3.186, -1.605)

-1.765 (-2.482, -1.152)

-2.305 (-3.279, -1.526)

-1.191 (-1.838, -0.601)

-0.656 (-1.298, -0.055)

aeromonas

0.827 (0.115, 1.648)

-0.437 (-1.97, 0.454)

-0.641 (-2.851, 0.63)

0.158 (-0.629, 0.913)

-0.099 (-1.029, 0.715)

cholera

0.227 (-0.106, 0.538)

-0.194 (-0.769, 0.233)

0.196 (-0.129, 0.507)

0.336 (0.011, 0.739)

eaec_aaic

1.011 (0.559, 1.536)

-0.516 (-1.164, 0.032)

0.071 (-0.574, 0.708)

epec_bfpa

0.478 (-0.027, 1.029)

1.172 (0.468, 2.281)

etec_lt

-0.038 (-0.478, 0.379)

results$y %>% 
  flextable()

Covariate

eaec_aata

aeromonas

cholera

eaec_aaic

epec_bfpa

etec_lt

epec_eae

(Intercept)

5.272 (-4.967, -2.477)

2.849 (-3.633, -1.886)

5.526 (-3.186, -1.605)

3.077 (-2.482, -1.152)

2.945 (-3.279, -1.526)

2.797 (-1.838, -0.601)

3.703 (-1.298, -0.055)

aeromonas

0.048 (0.115, 1.648)

1.745 (-1.97, 0.454)

0.784 (-2.851, 0.63)

0.165 (-0.629, 0.913)

0.051 (-1.029, 0.715)

cholera

0.018 (-0.106, 0.538)

-0.186 (-0.769, 0.233)

-0.031 (-0.129, 0.507)

-0.01 (0.011, 0.739)

eaec_aaic

0.188 (0.559, 1.536)

0.023 (-1.164, 0.032)

0.058 (-0.574, 0.708)

epec_bfpa

0.186 (-0.027, 1.029)

0.392 (0.468, 2.281)

etec_lt

-0.191 (-0.478, 0.379)

Assess Model Fit

Model fit is assessed using the Bayesian p-value based on the following goodness of fit statistic: \[ \sum_{i=1}^N\sum_{j=1}^J\frac{(y_{ij} - \mathbb{E}(y_{ij})^2}{\text{Var}(y_{ij})} \] For those unfamiliar with Bayesian p-values, this is a well-established technique to assess model fit in the Bayesian framework. Values close to 0.5 indicate the model fits the data well, while values close to 0 or to 1 indicate a poor fit.

Note that with the sequential GLM modeling approach described above, we have \[\begin{align*} z_{ij} & \sim \text{Bernoulli}(p_{ij}) &\\ y_{ij} & \sim \begin{cases} \delta_0(y) & \text{ if }z_{ij}=0\\ N(\mu_{ij},\sigma^2_j)& \text{ if }z_{ij} = 1 \end{cases} & \\ \text{where logit} (p_{ij}) & = \alpha_{j0} + \sum_{k<j}y_{ik}\alpha_{jk}, &\\ \mu_{ij} & = \beta_{j0} + \sum_{k<j}y_{ik}\beta_{jk}. &\\ \text{Hence } \mathbb{E}(y_{ij}) & = \mathbb{E}(\mathbb{E}(y_{ij}|z_{ij})) &\\ & = \mathbb{E}(\mu_{ij}z_{ij}) &\\ & = \mu_{ij}p_{ij} & \\ \text{Var}(y_{ij}) & = \mathbb{E}(\text{Var}(y_{ij}|z_{ij})) + \text{Var}(\mathbb{E}(y_{ij}|z_{ij})) &\\ & = \mathbb{E}(\sigma^2_jz_{ij}) + \text{Var}(\mu_{ij}z_{ij}) &\\ & = \sigma^2_jp_{ij} + \mu_{ij}^2p_{ij}(1-p_{ij}). \end{align*}\]

First, get posterior samples of the model parameters:

z_samples = 
  z_fits %>% 
  lapply(as.matrix)
y_samples = 
  y_fits %>% 
  lapply(as.matrix)

Now we need to get samples from the posterior predictive density. This must be done sequentially according to how we fit the models:

set.seed(2024)
z_predictions = y_predictions = list()
## eaec_aata
z_predictions$eaec_aata =
  z_fits$eaec_aata %>% 
  posterior_predict()
y_predictions$eaec_aata = 
  y_fits$eaec_aata %>% 
  posterior_predict(newdata = data.frame(whatever = rep(1,nrow(soil))))
y_predictions$eaec_aata = 
  y_predictions$eaec_aata * z_predictions$eaec_aata
## aeromonas
z_predictions$aeromonas =
  z_fits$aeromonas %>% 
  posterior_predict()
y_predictions$aeromonas = 
  y_fits$aeromonas %>% 
  posterior_predict(newdata = data.frame(whatever = rep(1,nrow(soil))))
y_predictions$aeromonas = 
  y_predictions$aeromonas * z_predictions$aeromonas
## cholera
z_predictions$cholera = 
  z_samples$cholera[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$cholera[,"aeromonas"] * y_predictions$aeromonas #first term will be repeated correctly
z_predictions$cholera = 
  1.0 * (matrix(runif(prod(dim(z_predictions$cholera))),
                nrow(z_predictions$cholera),
                ncol(z_predictions$cholera)) < 
           (1.0 / (1.0 + exp(-z_predictions$cholera) ) ) )
y_predictions$cholera =
  y_samples$cholera[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$cholera[,"aeromonas"] * y_predictions$aeromonas + 
  matrix(rnorm(nrow(y_samples$cholera) * nrow(soil),
               sd = y_samples$cholera[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$cholera =
  y_predictions$cholera * z_predictions$cholera
## eaec_aaic
z_predictions$eaec_aaic = 
  z_samples$eaec_aaic[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$eaec_aaic[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$eaec_aaic[,"cholera"] * y_predictions$cholera
z_predictions$eaec_aaic = 
  1.0 * (matrix(runif(prod(dim(z_predictions$eaec_aaic))),
                nrow(z_predictions$eaec_aaic),
                ncol(z_predictions$eaec_aaic)) < 
           1.0 / (1.0 + exp(-z_predictions$eaec_aaic)))
y_predictions$eaec_aaic =
  y_samples$eaec_aaic[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$eaec_aaic[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$eaec_aaic[,"cholera"] * y_predictions$cholera +
  matrix(rnorm(nrow(y_samples$eaec_aaic) * nrow(soil),
               sd = y_samples$eaec_aaic[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$eaec_aaic =
  y_predictions$eaec_aaic * z_predictions$eaec_aaic
## epec_bfpa
z_predictions$epec_bfpa = 
  z_samples$epec_bfpa[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$epec_bfpa[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$epec_bfpa[,"cholera"] * y_predictions$cholera +
  z_samples$epec_bfpa[,"eaec_aaic"] * y_predictions$eaec_aaic
z_predictions$epec_bfpa = 
  1.0 * (matrix(runif(prod(dim(z_predictions$epec_bfpa))),
                nrow(z_predictions$epec_bfpa),
                ncol(z_predictions$epec_bfpa)) < 
           1.0 / (1.0 + exp(-z_predictions$epec_bfpa)))
y_predictions$epec_bfpa =
  y_samples$epec_bfpa[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$epec_bfpa[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$epec_bfpa[,"cholera"] * y_predictions$cholera +
  y_samples$epec_bfpa[,"eaec_aaic"] * y_predictions$eaec_aaic +
  matrix(rnorm(nrow(y_samples$epec_bfpa) * nrow(soil),
               sd = y_samples$epec_bfpa[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$epec_bfpa =
  y_predictions$epec_bfpa * z_predictions$epec_bfpa
## etec_lt
z_predictions$etec_lt = 
  z_samples$etec_lt[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$etec_lt[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$etec_lt[,"cholera"] * y_predictions$cholera +
  z_samples$etec_lt[,"eaec_aaic"] * y_predictions$eaec_aaic +
  z_samples$etec_lt[,"epec_bfpa"] * y_predictions$epec_bfpa
z_predictions$etec_lt = 
  1.0 * (matrix(runif(prod(dim(z_predictions$etec_lt))),
                nrow(z_predictions$etec_lt),
                ncol(z_predictions$etec_lt)) < 
           1.0 / (1.0 + exp(-z_predictions$etec_lt)))
y_predictions$etec_lt =
  y_samples$etec_lt[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$etec_lt[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$etec_lt[,"cholera"] * y_predictions$cholera +
  y_samples$etec_lt[,"eaec_aaic"] * y_predictions$eaec_aaic +
  y_samples$etec_lt[,"epec_bfpa"] * y_predictions$epec_bfpa +
  matrix(rnorm(nrow(y_samples$etec_lt) * nrow(soil),
               sd = y_samples$etec_lt[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$etec_lt =
  y_predictions$etec_lt * z_predictions$etec_lt
## epec_eae
z_predictions$epec_eae = 
  z_samples$epec_eae[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  z_samples$epec_eae[,"aeromonas"] * y_predictions$aeromonas  +
  z_samples$epec_eae[,"cholera"] * y_predictions$cholera +
  z_samples$epec_eae[,"eaec_aaic"] * y_predictions$eaec_aaic +
  z_samples$epec_eae[,"epec_bfpa"] * y_predictions$epec_bfpa +
  z_samples$epec_eae[,"etec_lt"] * y_predictions$etec_lt
z_predictions$epec_eae = 
  1.0 * (matrix(runif(prod(dim(z_predictions$epec_eae))),
                nrow(z_predictions$epec_eae),
                ncol(z_predictions$epec_eae)) < 
           1.0 / (1.0 + exp(-z_predictions$epec_eae)))
y_predictions$epec_eae =
  y_samples$epec_eae[,"(Intercept)"] %x% matrix(1,1,nrow(soil)) + 
  y_samples$epec_eae[,"aeromonas"] * y_predictions$aeromonas +
  y_samples$epec_eae[,"cholera"] * y_predictions$cholera +
  y_samples$epec_eae[,"eaec_aaic"] * y_predictions$eaec_aaic +
  y_samples$epec_eae[,"epec_bfpa"] * y_predictions$epec_bfpa +
  y_samples$epec_eae[,"etec_lt"] * y_predictions$etec_lt +
  matrix(rnorm(nrow(y_samples$epec_eae) * nrow(soil),
               sd = y_samples$epec_eae[,"sigma"]),
         nrow(y_samples$eaec_aata),
         nrow(soil))
y_predictions$epec_eae =
  y_predictions$epec_eae * z_predictions$epec_eae

Below is the function to obtain the GOF statistics (one for the observed data and one for the predictive sample) for a given posterior draw.

gof_stat = function(iter){
  dt.soil_sim = 
    soil %>% 
    dplyr::select(all_of(names(z_fits)))
  
  for(j in names(z_fits)) dt.soil_sim[[j]] = y_predictions[[j]][iter,]
  dt.soil_sim %<>%
    bind_cols(dt.soil_sim %>% 
                mutate_all(function(x) ifelse(is.na(x),0, 1)) %>% 
                rename_all(function(x) paste0(x,"_binary")))
  
  PrGr0 = 
    mu = 
    Var = 
    Exp = 
    matrix(0.0,nrow(dt.soil_sim),NROW(n_pos))
  
  ## Compute GOF stat for simulated data
  for(j in 1:NROW(n_pos)){
    mm = model.matrix(formula(z_fits[[j]]), # Formula is the same for z_fits and y_fits
                      data = dt.soil_sim)
    PrGr0[,j] = 
      1.0 / drop(1.0 + exp(-mm %*% z_samples[[j]][iter,]))
    
    mu[,j] = mm %*% y_samples[[j]][iter,-ncol(y_samples[[j]])]
    
    Exp[,j] = PrGr0[,j] * mu[,j]
    
    Var[,j] = 
      PrGr0[,j] * y_samples[[j]][iter,ncol(y_samples[[j]])]^2 +
      PrGr0[,j] * (1.0 - PrGr0[,j]) * mu[,j]^2
  }
  
  GOF_sim = 
    sum( (as.matrix(dt.soil_sim %>% 
                      dplyr::select(all_of(names(z_fits)))) - Exp)^2 / Var )
  
  ## Compute GOF stat for observed data
  for(j in 1:NROW(n_pos)){
    mm = model.matrix(formula(z_fits[[j]]),
                      data = soil)
    PrGr0[,j] = 
      1.0 / drop(1.0 + exp(-mm %*% z_samples[[j]][iter,]))
    
    mu[,j] = mm %*% y_samples[[j]][iter,-ncol(y_samples[[j]])]
    
    Exp[,j] = PrGr0[,j] * mu[,j]
    
    Var[,j] = 
      PrGr0[,j] * y_samples[[j]][iter,ncol(y_samples[[j]])]^2 +
      PrGr0[,j] * (1.0 - PrGr0[,j]) * mu[,j]^2
  }
  
  GOF_obs = 
    sum( (as.matrix(soil %>% 
                      dplyr::select(all_of(names(z_fits)))) - Exp)^2 / Var )
  
  return(c(GOF_sim = GOF_sim,
           GOF_obs = GOF_obs))
}

Now to compute and plot.

GOF_stats = 
  sapply(1:nrow(z_samples[[1]]),gof_stat) %>% 
  t() %>% 
  as_tibble()
plot(GOF_stats,
     pch = 16,
     col = c("indianred","steelblue")[1L + (GOF_stats$GOF_sim < GOF_stats$GOF_obs)],
     bty = 'l',
     xlim = quantile(GOF_stats$GOF_sim,c(0.01,0.99)),
     ylim = quantile(GOF_stats$GOF_obs,c(0.01,0.99)))
legend('topright',
       bty = 'n',
       cex = 2,
       legend = paste0("Bayesian p-value = ",round(mean(GOF_stats$GOF_sim < GOF_stats$GOF_obs),2)))

Behavioral modeling

Model fitting

We consider four different models: Poisson, negative binomial, zero-inflated Poisson (ZIP), and zero-inflated negative binomial (ZINB). We then use Bayes factors to determine the best fitting model.

Set up:

beh_names = names(behavior)[-c(1:3)]

b_fits = list()
for(j in beh_names) b_fits[[j]] = list()

For each model, we need to compile the STAN code. After that, we can simply change the data being fed into that STAN model and run the HMC algorithm.

b_fits$hand_to_object$poisson = 
  brm(y ~ offset(log(time_min)) + (1|age),
      family = "poisson",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$poisson = 
    b_fits$hand_to_object$poisson %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
           cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$poisson)) %>% 
  unlist() %>% 
  max()
[1] 1.012148
b_fits$hand_to_object$negbinomial = 
  brm(y ~ offset(log(time_min)) + (1|age),
      family = "negbinomial",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$negbinomial = 
    b_fits$hand_to_object$negbinomial %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
      cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$negbinomial)) %>% 
  unlist() %>% 
  max()
[1] 1.009607
b_fits$hand_to_object$zip = 
  brm(bf(y ~ offset(log(time_min)) + (1|age),
         zi = ~ log(time_min) + (1|age)),
      family = "zero_inflated_poisson",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$zip = 
    b_fits$hand_to_object$zip %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
           cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$zip)) %>% 
  unlist() %>% 
  max()
[1] 1.058392
b_fits$hand_to_object$zinb = 
  brm(bf(y ~ offset(log(time_min)) + (1|age),
         zi = ~ log(time_min) + (1|age)),
      family = "zero_inflated_negbinomial",
      data = 
        behavior %>% 
        rename(y = hand_to_object),
      save_pars = save_pars(all = TRUE),
      iter = 10000,
      seed = 2024,
      cores = 4)
for(j in beh_names[-1]){
  b_fits[[j]]$zinb = 
    b_fits$hand_to_object$zinb %>% 
    update(newdata = 
             behavior %>% 
             rename(y = !!j),
           seed = 2024,
           cores = 4)
}

Check for convergence:

lapply(b_fits,
       function(X) rhat(X$zinb)) %>% 
  unlist() %>% 
  max()
[1] 1.009326

Model selection

We select the best model for each behavior via Bayes factors.

bf_by_behavior = list()
for(j in names(b_fits)){
  cat(paste0("\n--- ",j))
  bf_by_behavior[[j]] = 
    matrix(0.0,4,4,
           dimnames = list(c("poisson","negbinomial","zip","zinb"),
                           c("poisson","negbinomial","zip","zinb")))
  bf_by_behavior[[j]][lower.tri(bf_by_behavior[[j]])] = NA
  diag(bf_by_behavior[[j]]) = NA
  bf_by_behavior[[j]][1,2] = 
    bayes_factor(b_fits[[j]]$poisson,
                 b_fits[[j]]$negbinomial)$bf
  bf_by_behavior[[j]][1,3] = 
    bayes_factor(b_fits[[j]]$poisson,
                 b_fits[[j]]$zip)$bf
  bf_by_behavior[[j]][1,4] = 
    bayes_factor(b_fits[[j]]$poisson,
                 b_fits[[j]]$zinb)$bf
  bf_by_behavior[[j]][2,3] = 
    bayes_factor(b_fits[[j]]$negbinomial,
                 b_fits[[j]]$zip)$bf
  bf_by_behavior[[j]][2,4] = 
    bayes_factor(b_fits[[j]]$negbinomial,
                 b_fits[[j]]$zinb)$bf
  bf_by_behavior[[j]][3,4] = 
    bayes_factor(b_fits[[j]]$zip,
                 b_fits[[j]]$zinb)$bf
}
best_fits = list()

for(j in names(bf_by_behavior)){
  bf_by_behavior[[j]][lower.tri(bf_by_behavior[[j]])] = 
    1.0 / t(bf_by_behavior[[j]])[lower.tri(bf_by_behavior[[j]])]
  diag(bf_by_behavior[[j]]) = 1.0
  
  best_model_name = 
    rownames(bf_by_behavior[[j]])[which(apply(bf_by_behavior[[j]],
                                              1,
                                              function(x) all(x >= 1.0)))]
  best_fits[[j]] = 
    b_fits[[j]][[best_model_name]]
}

Below are the estimated parameters of the behavioral distributions. Note that for the zero probabilities for the ZIP and ZINB, the results are on the scale of the linear predictor, i.e., the untransformed regression coefficients. This is because the log(minutes observed) was used as a covariate rather than an offset for this component.

best_fit_summary = 
  tibble(Behavior = rep(gsub("_","-",names(best_fits)),
                        each = 3),
         Age = rep(c("B","C","T"), length(best_fits)),
         `Poi mean` = NA,
         `NB mean` = NA,
         `NB dispersion` = NA,
         `ZIP Poi mean` = NA,
         `ZIP zero prob Intercept` = NA,
         `ZIP zero prob logtime (min)` = NA,
         `ZINB NB mean` = NA,
         `ZINB NB dispersion` = NA,
         `ZINB zero prob Intercept` = NA,
         `ZINB zero prob logtime (min)` = NA)
for(i in 1:length(best_fits)){
  if(best_fits[[i]]$family$family == "poisson"){
    best_fit_summary$`Poi mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
  }
  if(best_fits[[i]]$family$family == "negbinomial"){
    best_fit_summary$`NB mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
    best_fit_summary$`NB dispersion`[3 * (i - 1) + 1:3] = 
      summary(best_fits[[i]])$spec_pars$Estimate
  }
  if(best_fits[[i]]$family$family == "zero_inflated_poisson"){
    best_fit_summary$`ZIP Poi mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
    best_fit_summary$`ZIP zero prob Intercept`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_Intercept"]
    best_fit_summary$`ZIP zero prob logtime (min)`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_logtime_min"]
  }
  if(best_fits[[i]]$family$family == "zero_inflated_negbinomial"){
    best_fit_summary$`ZINB NB mean`[3 * (i - 1) + 1:3] = 
      exp(coef(best_fits[[i]])$age[,"Estimate","Intercept"])
    best_fit_summary$`ZINB NB dispersion`[3 * (i - 1) + 1:3] = 
      summary(best_fits[[i]])$spec_pars$Estimate
    best_fit_summary$`ZINB zero prob Intercept`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_Intercept"]
    best_fit_summary$`ZINB zero prob logtime (min)`[3 * (i - 1) + 1:3] = 
      coef(best_fits[[i]])$age[,"Estimate","zi_logtime_min"]
  }
}

best_fit_summary %>% 
  flextable() %>% 
  merge_v(1) %>% 
  hline(1:3*3) %>% 
  colformat_double(digits = 3)

Behavior

Age

Poi mean

NB mean

NB dispersion

ZIP Poi mean

ZIP zero prob Intercept

ZIP zero prob logtime (min)

ZINB NB mean

ZINB NB dispersion

ZINB zero prob Intercept

ZINB zero prob logtime (min)

hand-to-object

B

0.367

2.338

C

0.443

2.338

T

0.361

2.338

hand-to-soil

B

0.187

0.854

C

0.207

0.854

T

0.186

0.854

mouth-to-object

B

0.062

0.319

C

0.027

0.319

T

0.042

0.319

geophagy

B

0.013

0.014

-1.523

C

0.002

1.164

-1.523

T

0.004

0.129

-1.523

mouth-to-hand

B

0.323

1.214

C

0.161

1.214

T

0.209

1.214

Monte Carlo simulation

First, we will draw new behavior counts in a 1hr period through the posterior predictive distributions.

set.seed(2024)
behavior_draws = list()

for(j in names(best_fits)){
  behavior_draws[[j]] = 
    best_fits[[j]] %>% 
    posterior_predict(newdata = tibble(age = c("B","T","C"),
                                       time_min = 60))
  colnames(behavior_draws[[j]]) = c("B","T","C")
}
behavior_draws_compiled = behavior_draws
for(j in names(behavior_draws_compiled)){
  behavior_draws_compiled[[j]] %<>% 
    as_tibble() %>% 
    mutate(Behavior = j) %>% 
    relocate(Behavior)
}
behavior_draws_compiled = 
  do.call(bind_rows,
          behavior_draws_compiled) %>% 
  pivot_longer(cols = B:C,
               names_to = "Age",
               values_to = "Count") %>% 
  mutate(Behavior = 
           case_match(Behavior,
                      "hand_to_object" ~ "Hand-to-object",
                      "hand_to_soil" ~ "Hand-to-soil",
                      "mouth_to_object" ~ "Mouth-to-object",
                      "geophagy" ~ "Geophagy",
                      "mouth_to_hand" ~ "Hand-to-mouth"),
         Age = 
           case_match(Age,
                      "B" ~ "Baby",
                      "T" ~ "Toddler",
                      "C" ~ "Child") %>% 
           factor(levels = c("Baby","Toddler","Child")))

behavior_summary = 
  behavior %>% 
  group_by(age) %>% 
  summarize(geophagy = sum(geophagy)/sum(time_min) * 60,
            hand_to_mouth = sum(mouth_to_hand)/sum(time_min) * 60,
            hand_to_object = sum(hand_to_object)/sum(time_min) * 60,
            hand_to_soil = sum(hand_to_soil)/sum(time_min) * 60,
            mouth_to_object = sum(mouth_to_object)/sum(time_min) * 60)

behavior_draws_compiled %>% 
  ggplot(aes(y = Count,
             x = Behavior,
             fill = Age)) +
  geom_boxplot(outliers = FALSE,
               col = "gray40") + 
  geom_point(data = data.frame(x = c(1:5 - 0.25,
                                     1:5,
                                     1:5 + 0.25),
                               y = c(unlist(behavior_summary[1,-1]),
                                     unlist(behavior_summary[3,-1]),
                                     unlist(behavior_summary[2,-1])),
                               Age = rep(c("Baby","Toddler","Child"),
                                         each = 5)),
             aes(y=y,x=x),
             col = "gray40",
             size = 3,
             pch = 8) +
  ylab("Count / hr") + 
  scale_fill_viridis_d() +
    theme(
      panel.background = element_rect(fill = "white"),
      panel.grid.major = element_line(color = "white"),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.text.y = element_text(size = 16),
      axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
      axis.title = element_text(size = 18, face = "bold"),
      axis.ticks = element_line(color = "black"),
      legend.background = element_rect(fill = "white"),
      legend.key = element_blank(),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18, face = "bold")
      )

Next we will draw values/fix constants from the extant literature.

n_sim = nrow(behavior_draws$mouth_to_hand)
set.seed(2024)

## Helper function to get mean and sd from empirical quantiles
find_mean_and_sd = function(probs,
                            obs_quantiles,
                            init_for_optim){
  
  objective_function = function(x){
    sum( (obs_quantiles - qnorm(probs,
                                mean = x[1],
                                sd = exp(x[2])))^2 )
  }
  
  optim(init_for_optim,
        objective_function)
}


## Transfer Efficiencies
trans_soil_hand =
  rlnorm(n_sim, meanlog = 0.11, sdlog = 2.0) #Ozkaynak et al
trans_object_hand =
  rtrunc(n_sim, "norm", a = 0, b =1, mean = 0.23, sd=0.22) # Julian, et al
trans_hand_mouth =
  rbeta(n_sim,shape1 = 5.2, shape2 = 2.6) # Kwong et al
trans_object_mouth =
  rbeta(n_sim, shape1 = 2, shape2 = 8)  # Ozkaynak et al

# Surface areas
child_hand_surf_area_cm2 = list()
child_hand_surf_area_cm2$B =
  rtri(n_sim, min = 74.59, max = 119.18, mode=98.31) #Kwong et al 2019
child_hand_surf_area_cm2$T =
  rtri(n_sim, min = 80.01, max = 177.38, mode=121.18) #Kwong et al 2019
child_hand_surf_area_cm2$C =
  rtri(n_sim, min = 139.63, max = 153.28, mode=146.47) #Kwong et al 2019
object_surf_area_cm2 =
  rexp(n_sim,rate=0.11) #Kwong et al 2019

# Other miscellaneous constants
soil_handful_g =
  0.001 * rbeta(n_sim, shape1 = 5.3, shape2=158.6) #Kwong, et al 2019

hand_loading_g_cm2 =
  0.001 * rtri(n_sim, min = 0, max = 0.95, mode=0.1) #Kwong et al 2019 

obj_loading_g_cm2 =
  0.001 * rlnorm(n_sim, meanlog = 0.11, sdlog = 2.0) #Ozkaynak et al # Treating objects like hands

temp = 
  find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
                   c(0.10,0.16,0.21,0.25,0.31,0.60),
                 c(0.21,0.05))
temp$convergence == 0
[1] TRUE
print(c(mean = temp$par[1],sd = exp(temp$par[2])))
     mean        sd 
0.2299022 0.1080376 
child_frac_hand_touch_soil =
  rtrunc(n_sim, 
         "norm", 
         a = 0, b =1, 
         mean = temp$par[1], 
         sd = exp(temp$par[2])) #Auyueng et al, 2008

temp = 
  find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
                   c(0.12,0.14,0.16,0.19,0.30,0.42),
                   c(0.16,log(0.05)))
temp$convergence == 0
[1] TRUE
print(c(mean = temp$par[1],sd = exp(temp$par[2])))
     mean        sd 
0.1933396 0.0730457 
child_frac_hand_touch_mouth =
  rtrunc(n_sim, 
         "norm", 
         a = 0, b =1, 
         mean = temp$par[1], 
         sd = exp(temp$par[2])) #Auyueng et al, 2008

temp = 
  find_mean_and_sd(c(5, 25, 50, 75, 95, 99)/100,
                   c(0.13,0.14,0.15,0.17,0.20,0.23),
                   c(0.15,log(0.05)))
temp$convergence == 0
[1] TRUE
print(c(mean = temp$par[1],sd = exp(temp$par[2])))
      mean         sd 
0.16015715 0.02538482 
frac_hand_touch_object =
  rtrunc(n_sim, 
         "norm", 
         a = 0, b =1, 
         mean = temp$par[1], 
         sd = exp(temp$par[2])) #Auyueng et al, 2008

max_obj_loading_g_cm2 =
  0.001 * runif(n_sim, 6, 8) #Ozkaynak et al # Treating objects like hands

Set up objects for exposure calculations and compute for each MC draw. Note that since soil samples are exchangeable, we can simply use the first column of each y_prediction matrix.

exposure_dose_per_hour_culture = exposure_dose_per_hour = list()
for(age in c("B","T","C")){
  exposure_dose_per_hour[[age]] = 
  exposure_dose_per_hour_culture[[age]] = 
    matrix(0.0, n_sim, length(y_fits),
            dimnames = list(NULL, names(y_fits)))
} 

for(age in names(exposure_dose_per_hour)){
  for(j in names(y_fits)){
    exposure_dose_per_hour[[age]][,j] = 
      ifelse(y_predictions[[j]][,1] == 0, 0, 10^y_predictions[[j]][,1]) * 
      ( (trans_soil_hand * 
           child_hand_surf_area_cm2[[age]] * 
           child_frac_hand_touch_soil * 
           child_frac_hand_touch_mouth * 
           trans_hand_mouth * 
           hand_loading_g_cm2 * 
           sapply(1:n_sim,function(i){
             min(behavior_draws$hand_to_soil[i,age],
                 behavior_draws$mouth_to_hand[i,age])
           })
         ) + 
          ( trans_object_hand * 
              child_hand_surf_area_cm2[[age]] * 
              frac_hand_touch_object * 
              child_frac_hand_touch_mouth * 
              trans_hand_mouth * 
              hand_loading_g_cm2 * 
              sapply(1:n_sim,function(i){
                min(behavior_draws$hand_to_object[i,age],
                    behavior_draws$mouth_to_hand[i,age])
              })
            ) + 
          ( trans_object_mouth * 
              object_surf_area_cm2 *
              sapply(1:n_sim,function(i){
                min(obj_loading_g_cm2,
                    max_obj_loading_g_cm2)
                }) *
              behavior_draws$mouth_to_object[,age]
            ) + 
          (soil_handful_g * 
             behavior_draws$geophagy[,age]))
  }
}

Results

Tabular form

exposure_results_table = list() 
for(j in names(exposure_dose_per_hour)){
  exposure_results_table[[j]] = 
    tibble(Organism = 
             names(y_fits),
           `Pr(+)` = 
             apply(exposure_dose_per_hour[[j]],2,function(x) mean(x > 0)),
           `Median | +` = 
             apply(exposure_dose_per_hour[[j]],2,function(x) median(x[which(x > 0)])),
           lower = 
             apply(exposure_dose_per_hour[[j]],2,function(x) quantile(x[which(x > 0)],probs = 0.025)),
           upper = 
             apply(exposure_dose_per_hour[[j]],2,function(x) quantile(x[which(x > 0)],probs = 0.975))) %>% 
    mutate(`Median | +` = round(log10(`Median | +`),2),
           `95% PI` = paste("(",
                            round(log10(lower),2),
                            ", ",
                            round(log10(upper),2),
                            ")",
                            sep = ""),.keep = "unused") %>% 
    rename_with( ~ paste(switch(j, B = "Baby", T = "Toddler", C = "Child"),
                         .x, sep = "_"))
}

exposure_results_table %>%
  do.call(what = bind_cols) %>% 
  select(-Toddler_Organism,-Child_Organism) %>% 
  rename(Organism = Baby_Organism) %>% 
  flextable() %>% 
  separate_header()

Organism

Baby

Toddler

Child

Pr(+)

Median | +

95% PI

Pr(+)

Median | +

95% PI

Pr(+)

Median | +

95% PI

eaec_aata

0.03005

2.96

(0.6, 5.02)

0.02945

2.95

(-0.13, 5.05)

0.02885

3.01

(-0.27, 5.01)

aeromonas

0.06715

0.46

(-2.99, 4.08)

0.06620

0.49

(-3.27, 4.11)

0.06565

0.58

(-3.49, 4.2)

cholera

0.11780

3.28

(1.08, 5.36)

0.11605

3.27

(0.72, 5.4)

0.11435

3.34

(0.62, 5.34)

eaec_aaic

0.17330

0.84

(-1.48, 5.84)

0.17070

0.88

(-2.16, 5.84)

0.16880

0.91

(-1.93, 5.58)

epec_bfpa

0.18140

0.97

(-1.96, 4.26)

0.17780

0.97

(-2.22, 4.37)

0.17660

1.04

(-2.33, 4.35)

etec_lt

0.27610

0.69

(-2.03, 3.44)

0.27210

0.71

(-2.24, 3.45)

0.26915

0.75

(-2.34, 3.5)

epec_eae

0.48060

1.68

(-1.22, 4.75)

0.47380

1.69

(-1.38, 4.87)

0.46845

1.73

(-1.54, 4.88)

Graphical form

exposure_results =  
  exposure_dose_per_hour$B %>% 
  as_tibble() %>% 
  mutate(Age = "Baby") %>% 
  bind_rows(exposure_dose_per_hour$T %>% 
              as_tibble() %>% 
              mutate(Age = "Toddler")) %>% 
  bind_rows(exposure_dose_per_hour$C %>% 
              as_tibble() %>% 
              mutate(Age = "Child")) %>% 
  pivot_longer(!Age,
               names_to = "Organism",
               values_to = "Concentration") %>% 
  mutate(Organism = 
           case_match(Organism,
                      "eaec_aata" ~ "EAEC aata",
                      "aeromonas" ~ "Aeromonas",
                      "cholera" ~ "Cholera",
                      "eaec_aaic" ~ "EAEC aaic",
                      "epec_bfpa" ~ "EPEC bfpa",
                      "etec_lt" ~ "ETEC LT",
                      "epec_eae" ~ "EPEC eae"))
exposure_results %>%  
  group_by(Age,Organism) %>% 
  summarize_if(is.numeric,
               list(`Pr(+)` = 
                      function(x) mean(x > 0))) %>% 
  ggplot(aes(y = `Pr(+)`,
             x = Organism,
             fill = Age)) + 
  geom_bar(position = 'dodge', stat = 'identity') + 
  scale_fill_viridis_d() +
    theme(
      panel.background = element_rect(fill = "white"),
      panel.grid.major = element_line(color = "white"),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.text.y = element_text(size = 16),
      axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
      axis.title = element_text(size = 18, face = "bold"),
      axis.ticks = element_line(color = "black"),
      legend.background = element_rect(fill = "white"),
      legend.key = element_blank(),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18, face = "bold")
      )

IQR_bounds =  
  exposure_results %>% 
  filter(!near(Concentration, 0)) %>% 
  mutate(Concentration = log10(Concentration)) %>% 
  group_by(Age,Organism) %>% 
  summarize(lower = quantile(Concentration,0.25) - 1.5 * IQR(Concentration),
            upper = quantile(Concentration,0.75) + 1.5 * IQR(Concentration))

exposure_results %>% 
  filter(!near(Concentration, 0)) %>% 
  mutate(Concentration = log10(Concentration)) %>% 
  ggplot(aes(y = Concentration,
             x = Organism,
             fill = Age)) +
  geom_violin(bounds = range(c(IQR_bounds$lower,
                               IQR_bounds$upper))) +
  scale_fill_viridis_d() +
  scale_y_continuous(breaks = c(-4:6),
                     labels = 10^c(-4:6)) +
  geom_hline(yintercept = c(-4:6), 
             color = "gray30",
             linetype = "dashed",
             alpha = 0.5) +
    theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(color = "white"),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "black"),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16,angle = -45, hjust = 0),
    axis.title = element_text(size = 18, face = "bold"),
    axis.ticks = element_line(color = "black"),
    legend.background = element_rect(fill = "white"),
    legend.key = element_blank(),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 18, face = "bold")
    )

Multipathogen exposure

Graphically display the probability of multipathogen exposure:

multipath_exposure = list() 
for(j in names(exposure_dose_per_hour)){
  multipath_exposure[[j]] = 
    tibble(Age = switch(j, B = "Baby", T = "Toddler", C = "Child"),
           `N pathogens` = 
             apply(exposure_dose_per_hour[[j]],1,function(x) sum(x>0))
    )
}
multipath_exposure %<>%
  bind_rows() %>% 
  mutate(`N pathogens` = ifelse(`N pathogens` >= 4, "4+",as.character(`N pathogens`))) %$% 
  table(Age, `N pathogens`) %>% 
  prop.table(1) %>% 
  as_tibble() %>% 
  rename(`Posterior prob` = n) %>% 
  mutate(`N pathogens` = factor(`N pathogens`, 
                                levels = rev(c("0","1","2","3","4+"))))
multipath_exposure %>%  
  mutate(Age = factor(Age, levels = c("Baby","Toddler","Child"))) %>% 
  ggplot(aes(x = `Posterior prob`,
             y = Age,
             fill = `N pathogens`)) + 
  geom_bar(stat = "identity",position = "stack") +
      theme(
      panel.background = element_rect(fill = "white"),
      panel.grid.major = element_line(color = "white"),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.text = element_text(size = 16),
      axis.title = element_text(size = 18, face = "bold"),
      axis.ticks = element_line(color = "black"),
      legend.background = element_rect(fill = "white"),
      legend.key = element_blank(),
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 18, face = "bold")
      )

Correlation of dose

exposure_corr = 
  exposure_dose_per_hour$B %>% 
  as_tibble() %>% 
  bind_rows(exposure_dose_per_hour$T %>% 
              as_tibble()) %>% 
  bind_rows(exposure_dose_per_hour$C %>% 
              as_tibble()) %>% 
  as.matrix() %>% 
  cor(method = "spearman")
rownames(exposure_corr) = 
  colnames(exposure_corr) %<>%
  case_match("eaec_aata" ~ "EAEC aata",
             "aeromonas" ~ "Aeromonas",
             "cholera" ~ "Cholera",
             "eaec_aaic" ~ "EAEC aaic",
             "epec_bfpa" ~ "EPEC bfpa",
             "etec_lt" ~ "ETEC LT",
             "epec_eae" ~ "EPEC eae")

corrplot(exposure_corr, 
         type = "upper", 
         order = "hclust",
         col = viridis_pal()(15), 
         method = "ellipse",
         tl.col = "black",
         addCoef.col = 'white')