A brief comparison of test equating methods
Magnus Johansson, PhD
2026-02-04
Source:vignettes/articles/comparison.Rmd
comparison.Rmd
library(dplyr)
library(tidyr)
library(ggplot2)
library(leunbachR)
library(equate)
library(SNSequate)
library(knitr)
library(mirt)
library(kequate)
set.seed(1234) # for reproducibility of bootstrap results
select <- dplyr::select
d3a_sum <- read.delim("data/data3a_item.csv", sep = ",") %>%
select(a01:a10,b01:b10) %>%
mutate(a_sum = rowSums(across(c(a01:a10))),
b_sum = rowSums(across(c(b01:b10)))) %>%
select(a_sum,b_sum)
d3a <- read.delim("data/data3a_item.csv", sep = ",") %>%
select(a01:a10,b01:b10)
head(d3a)## a01 a02 a03 a04 a05 a06 a07 a08 a09 a10 b01 b02 b03 b04 b05 b06 b07 b08 b09
## 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0
## 2 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0
## 3 1 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0 0 0
## 5 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0
## 6 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 0 1 1
## b10
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 1
While the Leunbach method allows for test equating based on observed sum scores under the assumption that the underlying data fits a Rasch model adequately, most other methods require item-level response data.
We will first focus on comparing IRT true score equating to Leunbach, but also include some other methods.
Direct equating
mod1 <- mirt(d3a[,1:10], 1, 'Rasch', verbose = FALSE)
mod2 <- mirt(d3a[,11:20], 1, 'Rasch', verbose = FALSE)
coef_ls1 <- list(a = rep(1,10),
b = coef(mod1, simplify=TRUE, IRTpars = TRUE)$item[,'b'],
c = rep(0,10))
coef_ls2 <- list(a = rep(1,10),
b = coef(mod2, simplify=TRUE, IRTpars = TRUE)$item[,'b'],
c = rep(0,10))
req1 <- irt.eq(n_items = 10, coef_ls1, coef_ls2, method="TS", A=1, B=0) # true score
rx <- freqtab(rowSums(d3a[,1:10]), scales = 0:10)
ry <- freqtab(rowSums(d3a[,11:20]), scales = 0:10)
req2 <- equate(rx, ry, type = "i") # identity
req3 <- equate(rx, ry, type = "l") # linear
req4 <- equate(rx, ry, type = "e", smooth = "loglin", degrees = 3) # equipercentile with loglinear smoothing
# Leunbach model
lfit <- leunbach_ipf(d3a_sum)
lboot <- leunbach_bootstrap(lfit, n_cores = 4, nsim = 100)
leq <- get_equating_table(lboot)
# summary table
max_score <- 10
eq_table <- data.frame(identity = pmin(pmax(req2$conc$yx, 0), max_score),
linear = pmin(pmax(req3$conc$yx, 0), max_score),
equiperc_loglin3 = pmin(pmax(req4$conc$yx, 0), max_score),
IRT_truescore = req1$tau_y,
leunbach_expected = leq$expected,
IRT_thetaequivalent = req1$theta_equivalent,
leunbach_theta = leq$log_theta
) %>%
round(2)
kable(eq_table)| identity | linear | equiperc_loglin3 | IRT_truescore | leunbach_expected | IRT_thetaequivalent | leunbach_theta |
|---|---|---|---|---|---|---|
| 0 | 0.36 | 0.00 | 0.00 | 0.00 | NA | -5.00 |
| 1 | 1.30 | 0.96 | 0.91 | 0.79 | -2.18 | -1.98 |
| 2 | 2.23 | 2.10 | 1.93 | 1.90 | -1.49 | -1.33 |
| 3 | 3.16 | 3.19 | 2.99 | 3.07 | -0.94 | -0.86 |
| 4 | 4.10 | 4.20 | 4.04 | 4.16 | -0.45 | -0.45 |
| 5 | 5.03 | 5.14 | 5.08 | 5.11 | 0.02 | -0.07 |
| 6 | 5.97 | 6.03 | 6.10 | 5.95 | 0.49 | 0.29 |
| 7 | 6.90 | 6.88 | 7.09 | 6.71 | 0.98 | 0.66 |
| 8 | 7.84 | 7.64 | 8.08 | 7.53 | 1.51 | 1.13 |
| 9 | 8.77 | 8.42 | 9.06 | 8.80 | 2.19 | 2.01 |
| 10 | 9.71 | 9.38 | 10.00 | 10.00 | NA | 5.00 |
eq_table %>%
select(!c(leunbach_theta,IRT_thetaequivalent)) %>%
pivot_longer(!identity) %>%
ggplot(aes(x = identity, y = value, color = name, shape = name, linetype = name)) +
geom_point(size = 2.5) +
geom_line() +
scale_x_continuous(breaks = c(0:10), minor_breaks = NULL) +
scale_y_continuous(breaks = c(0:10)) +
scale_color_viridis_d() +
theme_bw() +
labs(title = "Test equating on sum score scale",
color = "Equating model", shape = "Equating model", linetype = "Equating model",
y = "Predicted Y value", x = "X value")
eq_table %>%
select(c(identity,leunbach_theta,IRT_thetaequivalent)) %>%
pivot_longer(!identity) %>%
filter(identity %in% c(1:9)) %>%
ggplot(aes(x = identity, y = value, color = name, shape = name)) +
geom_point(size = 3) +
geom_line() +
scale_x_continuous(breaks = c(0:10), minor_breaks = NULL) +
scale_y_continuous(breaks = c(-2:2)) +
scale_color_viridis_d() +
theme_bw() +
labs(title = "Test equating on theta scale",
color = "Equating model", shape = "Equating model",
y = "Predicted Y value", x = "X value", caption = "Note: Min and max scores are excluded, since these need to be imputed for both methods.") +
theme(plot.caption = element_text(face = "italic", hjust = 0))
Kernel equating
We’ll apply the equivalent groups design, following the
kequate package vignette (Andersson,
Bränberg, and Wiberg 2022). First, we fit two separate
generalized linear models (GLM) using the poisson distribution for the
counts of each score. The kequate vignette suggests using
AIC to evaluate model fit (lower values are better) and finding the
optimal number of moments to include in the model specification. I have
opted for using B-splines instead for slightly improved model fit, and
adjusting the degrees of freedom based on AIC. The glm()
output objects are then used by the kequate() function.
Notably, like Leunbach, kernel equating also allows for equating using only sum scores.
glm_a <- glm(count ~ splines::bs(total, df = 3),
family = "poisson", data = rx, x = TRUE)
glm_b <- glm(count ~ splines::bs(total, df = 3),
family = "poisson", data = ry, x = TRUE)Bias correction following Kosmidis et al. (2020).
library(brglm2)
glm_a2 <- update(glm_a, method = "brglmFit")
glm_b2 <- update(glm_b, method = "brglmFit")
eg_eq <- kequate(
design = "EG",
r = glm_a2, s = glm_b2,
x = c(0:10), y = c(0:10)
)
data.frame(
predicted = c(eg_eq@equating$eqYx, leq$expected),
Model = c(rep("Kernel equating",11),rep("Leunbach",11)),
sumscore = c(0:10,0:10)
) %>%
ggplot(aes(x = sumscore, y = predicted, color = Model, shape = Model)) +
geom_point(size = 3) +
geom_line() +
scale_x_continuous(breaks = c(0:10), minor_breaks = NULL) +
#scale_y_continuous(breaks = c(-2:2)) +
scale_color_viridis_d() +
theme_bw() +
labs(title = "Test equating expected scores",
color = "Equating model", shape = "Equating model",
y = "Predicted Y value", x = "X value") +
theme(plot.caption = element_text(face = "italic", hjust = 0))
Standard error of equating
In order to make reasonable comparisons between kernel equating and Leunbach, we need to use the expected values for estimating standard errors of equating (SEE) in the Leunbach bootstrap, rather than the rounded values (default). It should be noted that the methods of estimating SEE are different for these two equating models.
lboot2 <- leunbach_bootstrap(lfit, n_cores = 4, nsim = 100, see_type = "expected")
data.frame(score = c(0:10,0:10),
see = c(lboot2[["see_1to2"]],eg_eq@equating$SEEYx),
model = c(rep("Leunbach",11),rep("Kernel\n(GLM poisson)",11))
) %>% ggplot(aes(x=score,y=see, color = model)) +
geom_point(size = 3) +
geom_line() +
theme_bw() +
scale_x_continuous(breaks = c(0:10), minor_breaks = NULL) +
scale_color_viridis_d(option = "G", end = 0.8) +
labs(title = "Standard error of equating (SEE)",
x = "Sum score", y = "SEE", color = "Equating method")