Homework 02

Author

Alec L. Robitaille

Published

February 15, 2024

Setup

targets::tar_source('R')

Question 1

From the Howell1 dataset, consider only the people younger than 13 years old. Estimate the causal association between age and weight. Assume that age influences weight through two paths. First, age influences height, and height influences weight. Second, age directly influences weight through age- related changes in muscle growth and body proportions.

Draw the DAG that represents these causal relationships. And then write a generative simulation that takes age as an input and simulates height and weight, obeying the relationships in the DAG.

coords <- data.frame(
    name = c('H', 'A', 'W'),
    x =    c(2,    1,   1),
    y =    c(1,    2,   0)
)

dagify(
    W ~ H + A,
    H ~ A,
  coords = coords
) |> ggdag(seed = 2) + theme_dag()

n <- 1e2
max_age <- 13

simulate_hw <- function(n, max_age) {
    age <- runif(n, 0, max_age)
    
    # General vague height at 13 years old (cm) = 130
    beta_age_on_height <- 10
    height <- rnorm(n = n, mean = age * beta_age_on_height, 2)

    # General vague weight at 13 years old (kg) = 45
    # Betas chosen to hit 45 kg approximately at 13 years old
    beta_height_on_weight <- 0.2
    beta_age_on_weight <- 1.5
    weight <- (age * beta_age_on_weight) + 
        (beta_height_on_weight * height)
    
    data.table(age, height, weight)
}
ggplot(simulate_hw(n, max_age)) + 
    geom_point(aes(age, weight))

Question 2

Estimate the total causal effect of each year of growth on weight.

The total causal effect of age on weight is estimated by excluding the height variable to include both paths in the DAG from age to weight directly and indirectly through height (Lecture 4: slide 28).

Prior predictive simulation

# Function for preparing Howell data
data_Howell

function() {
    if (!'rethinking' %in% .packages()) {
        stop('please load the rethinking package')
    }
    if (!'data.table' %in% .packages()) {
        stop('please load the data.table package')
    }

    data(Howell1)
    DT <- data.table::data.table(Howell1)
    DT[, sex := .GRP, by = male]

    DT[, scale_height := scale(height)]
    DT[, scale_weight := scale(weight)]

    DT[, scale_height_div_mean := height / mean(height)]
    DT[, scale_weight_div_mean := weight / mean(weight)]

    return(DT)
}

# Load data
DT <- tar_read(Howell_lte_13)

# Print priors used
tar_read(h02_q02_brms_prior)

             prior     class coef group resp dpar nlpar   lb   ub source
 normal(22.5, 0.5) Intercept                            <NA> <NA>   user
    normal(3, 0.5)         b                            <NA> <NA>   user
    exponential(1)     sigma                            <NA> <NA>   user

# Load model
tar_load(h02_q02_brms_sample_prior)
h02_q02_brms_sample_prior

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: weight ~ age 
   Data: h02_q02_brms_data (Number of observations: 157) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept     4.74      3.10    -1.40    10.63 1.00     2994     2474
age           2.99      0.51     2.02     3.99 1.00     3047     2383

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     0.99      0.98     0.02     3.59 1.00     2148     1209

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

# Read N draws from the priors and append expected predictions
n_draws <- 50
q02_draws <- h02_q02_brms_sample_prior |>
    add_epred_draws(newdata = unique(DT[, .(age)]),
                                    ndraws = n_draws)

# Plot prior expectations for relationship between age and weight
ggplot(q02_draws) + 
    geom_line(aes(age, .epred, group = .draw), alpha = 0.5) +
    labs(x = 'Age', y = 'Weight')

Analyse the data

# Load model
tar_load(h02_q02_brms_sample)
h02_q02_brms_sample

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: weight ~ age 
   Data: h02_q02_brms_data (Number of observations: 157) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept     8.55      0.41     7.77     9.37 1.00     3004     2831
age           1.35      0.05     1.24     1.46 1.00     3259     2290

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     2.83      0.20     2.47     3.27 1.00     2708     2595

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

# Tidy draws
q02_draws <- h02_q02_brms_sample |> 
    tidy_draws()

q02_newdata <- data_grid(DT, age = seq_range(age, 20))
q02_epred <- h02_q02_brms_sample |> 
    epred_draws(q02_newdata)

# Total causal effect of each year of growth on weight
g1 <- ggplot(q02_draws) + 
    stat_halfeye(aes(b_age)) + 
    labs(x = 'Increase in weight per year of growth', y = '')

g2 <- ggplot(q02_epred) + 
    stat_ribbon(aes(age, .epred), alpha = 0.5) +
    scale_fill_grey(start = 0.8, end = 0.2) + 
    labs(x = 'Age', y = 'Weight')

g1 / g2

Question 3

The data in data(Oxboys) (rethinking package) are growth records for 26 boys measured over 9 periods. I want you to model their growth. Specifically, model the increments in growth from one period (Occasion in the data table) to the next. Each increment is simply the difference between height in one occasion and height in the previous occasion. Since none of these boys shrunk during the study, all of the growth increments are greater than zero. Estimate the posterior distribution of these increments. Constrain the distribution so it is always positive—it should not be possible for the model to think that boys can shrink from year to year. Finally compute the posterior distribution of the total growth over all 9 occasions.

Prior predictive simulation

# Function for preparing Oxboys data
data_Oxboys

function() {
    if (!'rethinking' %in% .packages()) {
        stop('please load the rethinking package')
    }
    if (!'data.table' %in% .packages()) {
        stop('please load the data.table package')
    }

    data("Oxboys")
    DT <- data.table::data.table(Oxboys)

    DT[, diff_height := height - shift(height), by = Subject]

    DT[, occasion_factor := factor(Occasion)]

    return(DT)
}

# Load data
DT <- tar_read(prep_Oxboys)

# Print priors used
tar_read(h02_q03_brms_prior)

          prior     class coef group resp dpar nlpar   lb   ub source
   normal(5, 2) Intercept                            <NA> <NA>   user
   normal(3, 1)         b                            <NA> <NA>   user
 exponential(1)     sigma                            <NA> <NA>   user

# Load model
tar_load(h02_q03_brms_sample_prior)
h02_q03_brms_sample_prior

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: diff_height | trunc(lb = 0) ~ occasion_factor 
   Data: h02_q03_brms_data (Number of observations: 208) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                 Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept            2.38      2.01    -1.56     6.43 1.00     7337     2974
occasion_factor3     3.01      1.02     1.02     4.99 1.00     6519     2990
occasion_factor4     3.01      1.00     1.02     4.94 1.00     8041     2922
occasion_factor5     3.00      1.01     1.06     5.00 1.00     7578     2782
occasion_factor6     3.01      0.97     1.17     4.93 1.00     6750     3239
occasion_factor7     3.00      1.01     1.03     4.94 1.00     7103     3230
occasion_factor8     3.00      0.97     1.07     4.92 1.00     7255     3110
occasion_factor9     2.98      1.01     1.01     4.99 1.00     6316     2813

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     1.00      1.00     0.02     3.62 1.00     4453     1966

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

# Read N draws from the priors and append expected predictions
n_draws <- 100
q02_newdata <- data_grid(DT, age = seq_range(age, 20))

q03_newdata <- na.omit(DT)[, .(occasion_factor = droplevels(unique(occasion_factor)))]
q03_epred_prior <- h02_q03_brms_sample_prior |>
    add_epred_draws(newdata = q03_newdata, ndraws = n_draws)

# Plot prior expectations for relationship between occasion and height
ggplot(q03_epred_prior) + 
    stat_halfeye(aes(occasion_factor, .epred), alpha = 0.5) +
    labs(x = 'Occasion', y = 'Difference in height')

Warning: Computation failed in `stat_slabinterval()`.
Caused by error in `seq.default()`:
! 'to' must be a finite number

Analyse the data

# Load model
tar_load(h02_q03_brms_sample)
h02_q03_brms_sample

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: diff_height | trunc(lb = 0) ~ occasion_factor 
   Data: h02_q03_brms_data (Number of observations: 208) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                 Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept            0.82      0.27     0.21     1.29 1.01      677      732
occasion_factor3     0.56      0.31    -0.01     1.21 1.01     1042     1299
occasion_factor4     1.03      0.32     0.47     1.70 1.01      940     1098
occasion_factor5     0.09      0.32    -0.50     0.75 1.00     1184     1608
occasion_factor6     0.42      0.32    -0.18     1.06 1.01     1119     1379
occasion_factor7     1.46      0.32     0.90     2.15 1.01      918     1014
occasion_factor8     1.21      0.32     0.62     1.90 1.01      935     1074
occasion_factor9     0.71      0.31     0.13     1.37 1.01      940     1060

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     0.93      0.08     0.79     1.10 1.00     1180     1385

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

# Expectation draws
q03_epred <- h02_q03_brms_sample |>
    add_epred_draws(newdata = q03_newdata, ndraws = n_draws)

# Plot posterior expectations for relationship between occasion and height
ggplot(q03_epred) + 
    stat_halfeye(aes(occasion_factor, .epred), alpha = 0.5) +
    labs(x = 'Occasion', y = 'Difference in height')

# Tidy draws
q03_draws  <- h02_q03_brms_sample |>
    tidy_draws() |>
    data.table()

# Calculate total growth
q03_draws[, total_growth := rowSums(.SD), .SDcols = patterns('b_occasion')]

ggplot(q03_draws) + 
    stat_halfeye(aes(total_growth)) + 
    labs(x = 'Total growth over 9 occasions', y = '')