Homework 02

Author

Alec L. Robitaille

Published

February 5, 2023

source('R/packages.R')

Question 1

From the Howell1 dataset, consider only the people younger than 13 years old. Estimate the causal association between age and weight. Assume that age influences weight through two paths. First, age influences height, and height influences weight. Second, age directly influences weight through age-related changes in muscle growth and body proportions.

Draw the DAG that represents these causal relationships. And then write a generative simulation that takes age as an input and simulates height and weight, obeying the relationships in the DAG.

coords <- data.frame(
    name = c('H', 'A', 'W'),
    x =    c(1,    1,   2),
    y =    c(0,    1,   0)
)
dagify(
    W ~ H + A,
    H ~ A,
  coords = coords
) |> ggdag(seed = 2) + theme_dag()

# Recall: https://speakerdeck.com/rmcelreath/statistical-rethinking-2023-lecture-04?slide=22

# Set number of individuals
n_individuals <- 1e3

# Some numbers to guide the simulation
# 0 years: 50cm, 3.5kg
# 12 years: 150cm, 45kg
simulate_heights <- function(n) {
    beta_age_height <- 7
    beta_height_weight <- 3
    beta_age_weight <- 2.5
    
    age <- runif(n, 0, 12)
    height <- rnorm(n, beta_age_height * age, 2)
    weight <- rnorm(
        n, 
        (beta_height_weight * height) + (beta_age_weight * age), 
        2
    )
    
    return(data.table(
        age = age,
        height = height,
        weight = weight
    ))  
}


DT <- simulate_heights(20)

ggplot(DT) + 
    geom_point(aes(age, weight))

Question 2

Use a linear regression to estimate the total causal effect of each year of growth on weight.

# Simulate to determine appropriate priors
# Here just expecting a model defined for W ~ A, with alpha + beta_age * age
simulate_weights <- function(n, mean_alpha, sd_alpha, mean_age, sd_age, sigma_rate) {
    age <- runif(n, 0, 12)

    alpha <- rnorm(n, mean_alpha, sd_alpha)
    beta_age <- rnorm(n, mean_age, sd_age)
    sigma <- rexp(n, sigma_rate)
    
    weight <- rnorm(n, beta_age * age, sigma)
    
    return(data.table(
        age = age,
        weight = weight
    ))  
}


ggplot(simulate_weights(
    n = 1e3, 
    mean_alpha = 4, 
    sd_alpha = 0.5,
    mean_age = 4, 
    sd_age = 1, 
    sigma_rate = 1)
) + 
    geom_point(aes(age, weight))

# Data
source('R/data_Howell1.R')

# height: Height in cm
# weight: Weight in kg
# age: Age in years
DT <- data_Howell()[age < 12]
# Model
m <- quap(
    alist(
        weight ~ dnorm(mu, sigma),
        mu <- alpha + beta_age * age,
        alpha ~ dnorm(4, 0.5),
        beta_age ~ dnorm(4, 1),
        sigma ~ dexp(1)
    ),
    data = DT
)

precis(m)
             mean         sd     5.5%    94.5%
alpha    6.360395 0.30637164 5.870754 6.850036
beta_age 1.479591 0.05682131 1.388780 1.570402
sigma    2.315786 0.15042059 2.075385 2.556187
tar_load(m_h02_q02)
m_h02_q02$formula
weight ~ age 
m_h02_q02$prior
          prior     class coef group resp dpar nlpar lb ub       source
   normal(4, 1)         b                                          user
   normal(4, 1)         b  age                             (vectorized)
 normal(4, 0.5) Intercept                                          user
 exponential(1)     sigma                             0            user
posterior_summary(m_h02_q02)
               Estimate   Est.Error        Q2.5       Q97.5
b_Intercept    5.096988  0.69264976    3.564935    6.282286
b_age          1.322800  0.08676978    1.158969    1.498257
sigma          3.281210  0.44570929    2.565650    4.304295
lprior      -113.486825 14.72751678 -139.469285  -81.376835
lp__        -453.857198  1.22352857 -456.988103 -452.413963

Question 3

Now suppose the causal association between age and weight might be different for boys and girls. Use a single linear regression, with a categorical variable for sex, to estimate the total causal effect of age on weight separately for boys and girls. How do girls and boys differ? Provide one or more posterior contrasts as a summary.

coords <- data.frame(
    name = c('H', 'A', 'W', 'S'),
    x =    c(1,    1,   2,    2),
    y =    c(0,    1,   0,    1)
)
dagify(
    W ~ H + A + S,
    H ~ A + S,
  coords = coords
) |> ggdag(seed = 2) + theme_dag()

# Group index for sex
DT[, sex := .GRP, by = male]

# Set up key of sex factor levels and labels
key <- unique(DT[, .(sex, sex_char = fifelse(male == 1, 'male', 'female'))])
# Model
m <- quap(
    alist(
        weight ~ dnorm(mu, sigma),
        mu <- alpha[sex] + beta_age[sex] * age,
        alpha[sex] ~ dnorm(4, 0.5),
        beta_age[sex] ~ dnorm(4, 1),
        sigma ~ dexp(1)
    ),
    data = DT
)

# Set up the options for predictions
DT_pred <- data.table(
    age = seq(0, 12, 0.5)
)[, variable := paste0('V', .I)]


# Simulate where sex = 1, and where sex = 2
sim_sex_1 <- sim(m, DT_pred[, .(age, sex = 1)])
sim_sex_2 <- sim(m, DT_pred[, .(age, sex = 2)])

# Posterior of differences
sim_diff <- sim_sex_1 - sim_sex_2

# Melt and tidy
melt_diff <- melt(as.data.table(sim_diff))
DT_diff <- melt_diff[DT_pred, on = 'variable']


# Plot
ggplot(DT_diff, aes(age, value)) +
    stat_lineribbon() +
    scale_fill_scico_d(
        direction = -1,
        palette = 'nuuk',
        end = 0.9) +
    labs(
        y = 'predicted difference in weight',
        title = paste('Posterior predicted difference in weight between',
                                    key[sex == 1, sex_char], 'and',
                                    key[sex == 2, sex_char])
    )

# Note: differences with link
# Predict where sex = 1, and where sex = 2
link_sex_1 <- link(m, DT_pred[, .(age, sex = 1)])
link_sex_2 <- link(m, DT_pred[, .(age, sex = 2)])

# Posterior of differences
link_diff <- link_sex_1 - link_sex_2

# Melt and tidy
melt_diff <- melt(as.data.table(link_diff))
DT_diff_link <- melt_diff[DT_pred, on = 'variable']


# Plot
ggplot(DT_diff_link, aes(age, value)) +
    stat_lineribbon() +
    scale_fill_scico_d(
        direction = -1,
        palette = 'nuuk',
        end = 0.9) +
    labs(
        y = 'predicted difference in weight',
        title = paste('Predicted difference in weight between',
                                    key[sex == 1, sex_char], 'and',
                                    key[sex == 2, sex_char]),
        subtitle = 'Note: uncertainty is not incorporated into predictions with "link"'
    )

# Model
tar_load(m_h02_q03)

m_h02_q03$formula
weight ~ age + sex 
m_h02_q03$prior
          prior     class coef group resp dpar nlpar lb ub       source
   normal(4, 1)         b                                          user
   normal(4, 1)         b  age                             (vectorized)
   normal(4, 1)         b  sex                             (vectorized)
 normal(4, 0.5) Intercept                                          user
 exponential(1)     sigma                             0            user
# Set up the options for predictions
DT_pred <- CJ(
    age = seq(0, 12, 0.5)
)


# Simulate where sex = 1, and where sex = 2
post_pred_sex_1 <- predicted_draws(m_h02_q03, DT_pred[, .(age, sex = 1)])
post_pred_sex_2 <- predicted_draws(m_h02_q03, DT_pred[, .(age, sex = 2)])

# Predicted table
DT_diff <- data.table(
    predict_sex_1 = post_pred_sex_1$.prediction,
    predict_sex_2 = post_pred_sex_2$.prediction,
    age = post_pred_sex_1$age
)

# Posterior of differences
DT_diff[, diff := predict_sex_1 - predict_sex_2]


# Plot
ggplot(DT_diff, aes(age, diff)) +
    stat_lineribbon() +
    scale_fill_scico_d(direction = -1,
                                         palette = 'nuuk',
                                         end = 0.9) +
    labs(
        y = 'predicted difference in weight',
        title = paste('Posterior predicted difference in weight between',
                                    key[sex == 1, sex_char], 'and',
                                    key[sex == 2, sex_char])
    )

Details on the difference between sim and link are in the textbook around pages 105-110, the Overthinking box “How does link work?”, code 4.58-4.62, Rethinking box “two kinds of uncertainty”.

  • link takes the model formula to return what the average line would predict about your new data
  • sim takes the model formula and uncertainty to return the posterior distribution of lines would predict about your new data

Question 4 - optional challenge

The data in data(Oxboys) (rethinking package) are growth records for 26 boys measured over 9 periods. I want you to model their growth. Specifically, model the increments in growth from one period (Occasion in the data table) to the next. Each increment is simply the difference between height in one occasion and height in the previous occasion. Since none of these boys shrunk during the study, all of the growth increments are greater than zero. Estimate the posterior distribution of these increments. Constrain the distribution so it is always positive — it should not be possible for the model to think that boys can shrink from year to year. Finally compute the posterior distribution of the total growth over all 9 occasions.

# Get data and prepare it
data(Oxboys)

DT_ox <- data.table(Oxboys)

setorder(DT_ox, Occasion)
setnames(DT_ox, colnames(DT_ox), make_clean_names(colnames(DT_ox)))
DT_ox[, diff_height := height - shift(height), by = subject]
DT_ox[, index_occasion := occasion - 1]


# Simulate to evaluate priors
n <- 100
sim_prior <- CJ(occasion = unique(DT_ox$occasion),
                                row_id = seq.int(n))
sim_prior[, diff_height := rlnorm(.N, 0.1, 0.3)]

ggplot(sim_prior, aes(factor(occasion), diff_height)) +
    stat_slab() +
    labs(x = 'ocassion', y = 'difference in height (cm)')

m <- quap(
    alist(
        diff_height ~ dlnorm(mu, sigma),
        mu <- alpha[index_occasion],
        alpha[index_occasion] ~ dnorm(0, 0.1),
        sigma ~ dexp(1)
    ),
    data = DT_ox[!is.na(diff_height)]
)

plot(precis(m, 2))

sims <- sim(m, data.table(index_occasion = seq.int(8)))

# Posterior distribution of total growth 
dens(apply(sims, 1, sum))