Data analysis

1 Overview of this file

Visualizing distributions
Specifying models
Checking assumptions
Tables and figures of model output
Specifying more models

1.1 Setting up

Let’s load packages/libraries.

Code

# these are mostly for data management/wrangling and visualization
library(tidyverse) # for most things
library(foreign) # for reading SPSS files
library(readxl) # read MS Excel files
library(showtext) # get fonts
library(glue) # simplifies mixing text and code in figures and tables
library(arrow) # support for efficient file formats
library(grateful) # create table+references for packages used in a project
library(styler) # only a one-time installation (it is an Rstudio plugin)
library(car) # for car::recode only
library(skimr) # data skimming
library(lubridate) # for handling dates in data
library(janitor) # for many things in data cleaning

# these are mostly for data analysis and visualization
library(gtsummary)
library(scales)
library(visdat)
library(psych)
library(lme4)
library(nlme)
library(broom.mixed)
library(patchwork)
library(easystats)
library(mice)
library(modelsummary)
library(ggdist)
library(kableExtra)
library(formattable)
library(ggrepel)
library(ggrain)
library(sjPlot)

Read the data that we prepared previously.

Code

df.long <- read_csv("data/dataLong.csv")

Define a ggplot theme theme_ki(), a standard table function, kbl_ki(), and a color palette based on KI’s design guide, ki_color_palette.

Code

source("ki.R") # this reads an external file and loads whatever is in it

2 Visualizations

2.1 Histogram

Code

df.long %>% 
  filter(Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_histogram(binwidth = 4) +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Control group",
       x = "",
       y = "Number of respondents")

Code

df.long %>% 
  filter(!Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_histogram(binwidth = 4) +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Intervention group",
       x = "",
       y = "Number of respondents")

2.2 Density

Code

df.long %>% 
  filter(Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_density() +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Control group",
       x = "",
       y = "Density of respondents")

Code

df.long %>% 
  filter(!Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_density() +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Intervention group",
       x = "",
       y = "Density of respondents")

2.3 stat_halfeye

Code

df.long %>% 
  filter(measure == "DEP") %>% 
  ggplot(aes(x = value, fill = Group)) +
  stat_halfeye() +
  facet_grid(Group~time) +
  theme_ki() +
  #scale_fill_manual(values = ki_color_palette) +
  labs(title = "Depression outcomes over time",
       x = "",
       y = "Density of respondents")

What do these distributions look like to you?

2.4 Box + violin

Code

df.long %>% 
  mutate(time = as.factor(time)) %>% 
  ggplot(aes(x = time, y = value, fill = Group)) +
  geom_violin(position = position_dodge(0.9),
              alpha = 0.9) +
  geom_boxplot(position = position_dodge(0.9),
               color = "white",
               width = .2,
               notch = TRUE,
               outlier.shape = NA) +
  facet_wrap(~measure,
             ncol = 1) +
  theme_ki() +
  labs(title = "Outcomes over time",
       x = "Time point",
       y = "Distribution of outcome measurements")

Notice the notch in the boxplot. This is from the documentation (?geom_boxplot): > If FALSE (default) make a standard box plot. If TRUE, make a notched box plot. Notches are used to compare groups; if the notches of two boxes do not overlap, this suggests that the medians are significantly different.

2.5 Individuals over time

Using the package ggrain we can get this sweet figure that combines a boxplot, a half/split violin plot, jittered points for individuals, and lines between individuals across time points!

Code

df.long %>%
  filter(measure == "DEP") %>% 
  ggplot(aes(x = time, y = value, group = time, fill = Group, color = Group)) +
  geom_rain(
    boxplot.args = list(color = "black", outlier.shape = NA),
    id.long.var = "id"
  ) +
  scale_fill_brewer(palette = "Dark2",
                    aesthetics = c("color","fill")) +
  theme_ki() +
  facet_wrap(~Group,
             nrow = 2) +
  labs(title = "Depression over time")

And a slightly different version - can you see what is different and how it relates to the code?

Code

df.long %>%
  filter(measure == "DEP") %>% 
  ggplot(aes(x = time, y = value, group = time, fill = Group, color = factor(id))) +
  geom_rain(
    alpha = .6,
    boxplot.args = list(color = "black", outlier.shape = NA),
    id.long.var = "id"
  ) +
  scale_fill_brewer(palette = "Dark2") +
  scale_color_viridis_d(guide = "none") +
  theme_ki() +
  facet_wrap(~Group,
             nrow = 2) +
  labs(title = "Depression over time")

2.6 Mean+SD

Code

df.long %>%
  filter(measure == "DEP") %>% 
  group_by(time,Group) %>% 
  reframe(Mean = mean(value, na.rm = T),
          SD = sd(value, na.rm = T)) %>% 
  ggplot(aes(x = time, 
             y = Mean, 
             group = Group, 
             color = Group)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1.3) +
  theme_ki() +
  scale_y_continuous(limits = c(0,42)) +
  geom_ribbon(aes(ymin = Mean-SD,
                  ymax = Mean+SD,
                  fill = Group),
              alpha = 0.1,
              linetype = 0) +
  labs(y = "Mean Depression Score",
       x = "Time point",
       caption = "Note. Shaded area indicates one standard deviation.") +
  theme(plot.caption = element_text(hjust = 0))

2.6.1 Error-bar

Code

df.long %>%
  filter(measure == "DEP") %>%
  group_by(time, Group) %>%
  reframe(
    Mean = mean(value, na.rm = T),
    SD = sd(value, na.rm = T)
  ) %>%
  ggplot(aes(
    x = time,
    y = Mean,
    group = Group,
    color = Group
  )) +
  geom_point(
    size = 3,
    position = position_dodge(.5)
  ) +
  geom_line(
    linewidth = 1.3,
    position = position_dodge(.5)
  ) +
  theme_ki() +
  scale_y_continuous(limits = c(0, 42)) +
  geom_errorbar(
    aes(
      ymin = Mean - SD,
      ymax = Mean + SD
    ),
    position = position_dodge(.5), width = .2
  ) +
  labs( # title = "Depression over time by group",
    y = "Mean Depression Score",
    x = "Time point",
    caption = "Note. Error bars indicate one standard deviation."
  ) +
  theme(plot.caption = element_text(hjust = 0))

3 Linear model 1

We will start with DEP as outcome and fit a linear model.

3.1 Data wrangling

First, we’ll split our measure variable into three separate variables (while retaining time as its own variable), using pivot_wider().

Second, we’ll remove the last time point in this first model, in order to make our data fit a linear model better.

Both of these transformations are made in the code chunk below.

Code

df.model1 <- df.long %>% 
  filter(!time == "3") %>% # since time is a character variable in df.long we need ""
  pivot_wider(names_from = "measure",
              values_from = "value") %>% 
  rename(Depression = DEP,
         Anxiety = ANX,
         Stress = STRESS) %>% 
  mutate(time = as.integer(time))

3.2 Model specification

Now we can specify and fit a model:

Code

m1 <- lmer(data = df.model1,
         Depression ~ time + Group + time*Group + (1 | id),
         REML = TRUE)

Exercise

Can you decipher the syntax (before looking below)?

3.3 Syntax for `lmer`

The general trick is that the formula follows the form - dependent ~ independent | grouping. The grouping is generally a random factor, you can include fixed factors without any grouping and you can have additional random factors without any fixed factor (an intercept-only model). A + between factors indicates no interaction, a * indicates interaction.

For random factors, you have three basic variants:

Intercepts only by random factor: (1 | random.factor)

Slopes only by random factor: (0 + fixed.factor | random.factor)

Intercepts and slopes by random factor: (1 + fixed.factor | random.factor)

Note that variant 3 has the slope and the intercept calculated in the same grouping, i.e. at the same time. If we want the slope and the intercept calculated independently, i.e. without any assumed correlation between the two, we need a fourth variant:

Intercept and slope, separately, by random factor: (1 | random.factor) + (0 + fixed.factor | random.factor). An alternative way to write this is using the double-bar notation fixed.factor + (fixed.factor || random.factor).

Reference links for formulas:

3.4 Check assumptions

Our linear mixed model makes a lot of assumptions about the structure of data. It is important that we investigate this before looking at the output of the model, which may be entirely misleading if assumptions are not met.

Code

check_model(m1)

3.5 Table with results

Code

tab_model(m1, show.se = TRUE)

Table 1: Linear Mixed Model 1 summary statistics
	Depression
Predictors	Estimates	std. Error	CI	p
(Intercept)	16.91	1.39	14.17 – 19.65	<0.001
time	-0.74	0.57	-1.86 – 0.38	0.196
Group [MiCBT]	-0.75	2.02	-4.72 – 3.22	0.709
time × Group [MiCBT]	-2.72	0.84	-4.38 – -1.06	0.001
Random Effects
σ²	31.09
τ₀₀_id	80.29
ICC	0.72
N _id	106
Observations	282
Marginal R² / Conditional R²	0.055 / 0.736

See https://cran.r-project.org/web/packages/sjPlot/vignettes/tab_model_estimates.html for more details on generating tables with sjPlot functions. One drawback with this package is that it only generates HTML-tables, which don’t work well with PDF and Word output formats.

3.5.1 gtsummary

An alternative is gtsummary, which is more flexible regarding the output, but may need a bit more work. See https://www.danieldsjoberg.com/gtsummary/index.html for examples.

Code

tbl_regression(m1)

Table 2: Linear Mixed Model 1 summary statistics
Characteristic	Beta	95% CI¹
time	-0.74	-1.9, 0.38
Group
Control	—	—
MiCBT	-0.75	-4.7, 3.2
time * Group
time * MiCBT	-2.7	-4.4, -1.1
¹ CI = Confidence Interval

3.5.2 summary

You can also get “raw” output.

Code

summary(m1)

Linear mixed model fit by REML ['lmerMod']
Formula: Depression ~ time + Group + time * Group + (1 | id)
   Data: df.model1

REML criterion at convergence: 1974.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3880 -0.5395 -0.0722  0.4928  3.3691 

Random effects:
 Groups   Name        Variance Std.Dev.
 id       (Intercept) 80.29    8.960   
 Residual             31.09    5.576   
Number of obs: 282, groups:  id, 106

Fixed effects:
                Estimate Std. Error t value
(Intercept)      16.9085     1.3922  12.145
time             -0.7405     0.5711  -1.297
GroupMiCBT       -0.7535     2.0160  -0.374
time:GroupMiCBT  -2.7173     0.8442  -3.219

Correlation of Fixed Effects:
            (Intr) time   GrMCBT
time        -0.372              
GroupMiCBT  -0.691  0.257       
tim:GrpMCBT  0.252 -0.676 -0.366

3.5.3 tidy

More friendly formatted with tidy() makes it easy to create a simple table.

Code

tidy(m1) %>% 
  mutate_if(is.numeric, round, 2) %>% 
  kbl_ki() # note that this function is specified in the file "ki.R"

effect	group	term	estimate	std.error	statistic
fixed	NA	(Intercept)	16.91	1.39	12.14
fixed	NA	time	-0.74	0.57	-1.30
fixed	NA	GroupMiCBT	-0.75	2.02	-0.37
fixed	NA	time:GroupMiCBT	-2.72	0.84	-3.22
ran_pars	id	sd__(Intercept)	8.96	NA	NA
ran_pars	Residual	sd__Observation	5.58	NA	NA

3.5.4 glance

And some additional summary stats.

Code

glance(m1) %>% 
  kbl_ki()

nobs	sigma	logLik	AIC	BIC	REMLcrit	df.residual
282	5.576221	-987.2302	1986.46	2008.312	1974.46	276

3.6 Figures

3.6.1 Predicted response

This uses functions from easystats. You can look at the link for other examples.

Code

estimate_expectation(m1, data = "grid") %>% 
  plot() +
  theme_ki()

3.6.2 Plot parameters

Code

plot(parameters(m1))

3.7 Report

This function autogenerates text describing the model output.

Code

report(m1)

We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
to predict Depression with time and Group (formula: Depression ~ time + Group +
time * Group). The model included id as random effect (formula: ~1 | id). The
model's total explanatory power is substantial (conditional R2 = 0.74) and the
part related to the fixed effects alone (marginal R2) is of 0.06. The model's
intercept, corresponding to time = 0 and Group = Control, is at 16.91 (95% CI
[14.17, 19.65], t(276) = 12.14, p < .001). Within this model:

  - The effect of time is statistically non-significant and negative (beta =
-0.74, 95% CI [-1.86, 0.38], t(276) = -1.30, p = 0.196; Std. beta = -0.06, 95%
CI [-0.14, 0.03])
  - The effect of Group [MiCBT] is statistically non-significant and negative
(beta = -0.75, 95% CI [-4.72, 3.22], t(276) = -0.37, p = 0.709; Std. beta =
-0.30, 95% CI [-0.65, 0.04])
  - The effect of time × Group [MiCBT] is statistically significant and negative
(beta = -2.72, 95% CI [-4.38, -1.06], t(276) = -3.22, p = 0.001; Std. beta =
-0.21, 95% CI [-0.33, -0.08])

Standardized parameters were obtained by fitting the model on a standardized
version of the dataset. 95% Confidence Intervals (CIs) and p-values were
computed using a Wald t-distribution approximation.

4 More linear models…

What happens if we define time as a factor and add time point 3?

4.1 Other possible additional models

Add random slopes?

Add pre-measurement covariate?

5 Marginal effects

https://twitter.com/stephenjwild/status/1666056019993034755

This package is really great and has nice documentation: https://vincentarelbundock.github.io/marginaleffects/

6 Generalized Linear Models

This series of blog posts on GLM’s and causal inference is awesome:

https://solomonkurz.netlify.app/blog/2023-04-12-boost-your-power-with-baseline-covariates/

7 Baseline covariates

We can create a similar example to the blog post linked above. It makes a comparison of post-measurements between intervention and control groups, and adds pre-measurement as a covariate. We can use the last time point in our data

First some data wrangling to get our pre/post measurement into wide format.

Code

df.bc <- df.long %>% 
  mutate(time = as.integer(time)) %>% 
  filter(time %in% c(0,3), 
         measure == "DEP") %>%
  pivot_wider(names_from = c("measure","time"),
              values_from = "value")

We’ll add a mean centered pre-measurement variable, just like in Kurz’ blog post, and test both in our models.

Code

df.bc <- df.bc %>% 
  mutate(pre_centered = DEP_0 - mean(DEP_0, na.rm = T))

7.1 Model spec

Then we can specify our models.

Code

bc1 <- lm(data = df.bc,
          DEP_3 ~ Group)

bc2 <- lm(data = df.bc,
          DEP_3 ~ Group + DEP_0)

bc3 <- lm(data = df.bc,
          DEP_3 ~ Group + pre_centered)

7.2 Check assumptions

Code

check_model(bc3)

7.3 Output

Code

tab_model(bc1,bc2,bc3,
          show.se = TRUE)

	DEP 3				DEP 3				DEP 3
Predictors	Estimates	std. Error	CI	p	Estimates	std. Error	CI	p	Estimates	std. Error	CI	p
(Intercept)	14.04	1.39	11.27 – 16.82	<0.001	5.51	1.76	2.00 – 9.02	0.002	13.97	1.14	11.71 – 16.22	<0.001
Group [MiCBT]	-2.44	2.05	-6.51 – 1.62	0.236	-2.81	1.68	-6.15 – 0.52	0.097	-2.81	1.68	-6.15 – 0.52	0.097
DEP 0					0.51	0.08	0.35 – 0.68	<0.001
pre centered									0.51	0.08	0.35 – 0.68	<0.001
Observations	86				85				85
R² / R² adjusted	0.017 / 0.005				0.344 / 0.328				0.344 / 0.328

We can see that the standard error does indeed shrink, and the mean centered pre-measurement produces a correct intercept value. Nice!

1 Overview of this file

1.1 Setting up

2 Visualizations

2.1 Histogram

2.2 Density

2.3 stat_halfeye

2.4 Box + violin

2.5 Individuals over time

2.6 Mean+SD

2.6.1 Error-bar

3 Linear model 1

3.1 Data wrangling

3.2 Model specification

3.3 Syntax for lmer

3.4 Check assumptions

3.5 Table with results

3.5.1 gtsummary

3.5.2 summary

3.5.3 tidy

3.5.4 glance

3.6 Figures

3.6.1 Predicted response

3.6.2 Plot parameters

3.7 Report

4 More linear models…

4.1 Other possible additional models

5 Marginal effects

6 Generalized Linear Models

7 Baseline covariates

7.1 Model spec

7.2 Check assumptions

7.3 Output

3.3 Syntax for `lmer`