Raw File
simulate_simpson.R
#' Simpson's paradox dataset simulation
#'
#' Simpson's paradox, or the Yule-Simpson effect, is a phenomenon in probability
#' and statistics, in which a trend appears in several different groups of data
#' but disappears or reverses when these groups are combined.
#'
#' Each group is generated with [simulate_correlation()] and then shifted: the
#' `i`-th group is moved by `difference * i` along `V1` and by
#' `difference * i * -sign(r)` along `V2`. The group centres therefore follow a
#' trend whose direction is opposite to the within-group correlation `r`.
#'
#' @param n The number of observations for each group to be generated (minimum
#'   4).
#' @param groups Number of groups (groups can be participants, clusters,
#'   anything). Must be a single non-negative number; fractional values are
#'   truncated and `0` yields an empty data frame.
#' @param difference Difference between groups, i.e., the step by which each
#'   successive group is shifted along `V1` and (in the opposite direction of
#'   `r`) along `V2`.
#' @param group_prefix The prefix of the group name (e.g., "G_1", "G_2", "G_3",
#'   ...). It is used literally; the group index appended to it is zero-padded
#'   to the width of the largest index.
#' @inheritParams simulate_correlation
#'
#' @return A data frame with `n` rows per group, containing the columns
#'   returned by [simulate_correlation()] (with `V1` and `V2` shifted per
#'   group) and a character column `Group` holding the group label.
#'
#' @examples
#' data <- simulate_simpson(n = 10, groups = 5, r = 0.5)
#'
#' if (require("ggplot2")) {
#'   ggplot(data, aes(x = V1, y = V2)) +
#'     geom_point(aes(color = Group)) +
#'     geom_smooth(aes(color = Group), method = "lm") +
#'     geom_smooth(method = "lm")
#' }
#' @export
simulate_simpson <- function(n = 100,
                             r = 0.5,
                             groups = 3,
                             difference = 1,
                             group_prefix = "G_") {
  if (n <= 3) {
    stop("The number of observation `n` should be higher than 3", call. = FALSE)
  }
  if (!is.numeric(groups) || length(groups) != 1L || is.na(groups) ||
    groups < 0) {
    stop("`groups` should be a single non-negative number", call. = FALSE)
  }

  n_groups <- as.integer(trunc(groups))

  # `seq_len()` (unlike `1:groups`) yields nothing for zero groups, but
  # `do.call(rbind, list())` would return NULL, so handle that case up front.
  if (n_groups == 0L) {
    return(data.frame())
  }

  # Zero-pad indices to the width of the largest one so labels sort properly.
  # Only the index goes through `sprintf()`; the prefix is pasted on verbatim
  # so that a `%` in `group_prefix` cannot be mistaken for a format directive.
  index_format <- paste0("%0", nchar(sprintf("%d", n_groups)), "d")

  # Groups are shifted against the sign of the within-group correlation, which
  # is what produces the reversal once the groups are pooled.
  direction <- -sign(r)

  group_data <- lapply(seq_len(n_groups), function(i) {
    dat <- simulate_correlation(n = n, r = r)
    dat$V1 <- dat$V1 + difference * i
    dat$V2 <- dat$V2 + difference * (i * direction)
    dat$Group <- paste0(group_prefix, sprintf(index_format, i))
    dat
  })

  # Bind once at the end instead of growing a data frame inside a loop.
  do.call(rbind, group_data)
}
back to top