#' Simpson's paradox dataset simulation #' #' Simpson's paradox, or the Yule-Simpson effect, is a phenomenon in probability #' and statistics, in which a trend appears in several different groups of data #' but disappears or reverses when these groups are combined. #' #' @param n The number of observations for each group to be generated (minimum #' 4). #' @param groups Number of groups (groups can be participants, clusters, #' anything). #' @param difference Difference between groups. #' @param group_prefix The prefix of the group name (e.g., "G_1", "G_2", "G_3", ...). #' @inheritParams simulate_correlation #' #' @return A dataset. #' #' @examples #' data <- simulate_simpson(n = 10, groups = 5, r = 0.5) #' #' if (require("ggplot2")) { #' ggplot(data, aes(x = V1, y = V2)) + #' geom_point(aes(color = Group)) + #' geom_smooth(aes(color = Group), method = "lm") + #' geom_smooth(method = "lm") #' } #' @export simulate_simpson <- function(n = 100, r = 0.5, groups = 3, difference = 1, group_prefix = "G_") { if (n <= 3) { stop("The number of observation `n` should be higher than 3", call. = FALSE) } data <- data.frame() for (i in 1:groups) { dat <- simulate_correlation(n = n, r = r) dat$V1 <- dat$V1 + difference * i # (i * -sign(r)) dat$V2 <- dat$V2 + difference * (i * -sign(r)) dat$Group <- sprintf(paste0(group_prefix, "%0", nchar(trunc(abs(groups))), "d"), i) data <- rbind(data, dat) } data }