# Revision 85d0a0463621c30063752c5577e591be0de5ae60 authored by Dominique Makowski on 26 July 2021, 08:40:08 UTC, committed by cran-robot on 26 July 2021, 08:40:08 UTC
# 1 parent 601edcd
# Raw file: simulate_simpson.R
#' Simpson's paradox dataset simulation
#'
#' Simpson's paradox, or the Yule-Simpson effect, is a phenomenon in probability
#' and statistics, in which a trend appears in several different groups of data
#' but disappears or reverses when these groups are combined.
#'
#' Each group is simulated separately with [simulate_correlation()] and then
#' shifted: `V1` is moved by `difference * i` and `V2` by
#' `difference * i * -sign(r)`, where `i` is the index of the group. The groups
#' are therefore laid out in the direction opposite to the within-group
#' correlation (when `r` is 0, `V2` is not shifted at all).
#'
#' @param n The number of observations for each group to be generated (minimum
#'   4).
#' @param groups Number of groups (groups can be participants, clusters,
#'   anything). Must be a single number of at least 1; fractional values are
#'   truncated.
#' @param difference Difference between groups.
#' @param group_prefix The prefix of the group name (e.g., "G_1", "G_2", "G_3",
#'   ...). It is used literally, and the group index is zero-padded to the
#'   number of digits of `groups`.
#' @inheritParams simulate_correlation
#'
#' @return A data frame stacking the simulated data of all groups, with the
#'   variables `V1` and `V2` and a character column `Group` holding the group
#'   label.
#'
#' @examples
#' data <- simulate_simpson(n = 10, groups = 5, r = 0.5)
#'
#' if (require("ggplot2")) {
#'   ggplot(data, aes(x = V1, y = V2)) +
#'     geom_point(aes(color = Group)) +
#'     geom_smooth(aes(color = Group), method = "lm") +
#'     geom_smooth(method = "lm")
#' }
#'
#' @export
simulate_simpson <- function(n = 100,
                             r = 0.5,
                             groups = 3,
                             difference = 1,
                             group_prefix = "G_") {
  if (n <= 3) {
    stop("The number of observation `n` should be higher than 3")
  }
  # `1:groups` would count *down* for groups < 1 (e.g. 1:0 is c(1, 0)) and
  # silently simulate the wrong number of groups, so reject such input early.
  if (!is.numeric(groups) || length(groups) != 1L ||
    is.na(groups) || groups < 1) {
    stop("The number of groups `groups` should be at least 1")
  }

  n_groups <- trunc(groups)
  # Width used to zero-pad the group index so that labels sort correctly.
  pad_width <- nchar(format(n_groups, scientific = FALSE))

  group_data <- lapply(seq_len(n_groups), function(i) {
    dat <- simulate_correlation(n = n, r = r)
    # Shift the groups along V1, and along V2 in the direction opposite to the
    # within-group correlation, which is what creates the paradox.
    dat$V1 <- dat$V1 + difference * i
    dat$V2 <- dat$V2 + difference * (i * -sign(r))
    # Pad the index separately so the prefix is never parsed as a format string.
    dat$Group <- paste0(
      group_prefix,
      formatC(i, width = pad_width, format = "d", flag = "0")
    )
    dat
  })

  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, group_data)
}
# (end of file)