swh:1:snp:2c68a6c5a8af2f06ac2c0225927f25b54fd1f9d0
Raw File
Tip revision: 2565fc870cd7f0a64d857ff89e682dc9344dc7c1 authored by Dominique Makowski on 12 February 2020, 04:10:16 UTC
version 0.5.2
Tip revision: 2565fc8
simulate_data.R
#' Data Simulation
#'
#' Simulate data with specific characteristics.
#'
#' @param n The number of observations to be generated.
#' @param r A value or vector corresponding to the desired correlation coefficients.
#' @param d A value or vector corresponding to the desired difference between the groups.
#' @param mean A value or vector corresponding to the mean of the variables.
#' @param sd A value or vector corresponding to the SD of the variables.
#' @param names A character vector of desired variable names.
#' @param ... Arguments passed to or from other methods.
#' @examples
#'
#' # Correlation --------------------------------
#' data <- simulate_correlation(r = 0.5)
#' plot(data$V1, data$V2)
#' cor.test(data$V1, data$V2)
#' summary(lm(V2 ~ V1, data = data))
#'
#' # Specify mean and SD
#' data <- simulate_correlation(r = 0.5, n = 50, mean = c(0, 1), sd = c(0.7, 1.7))
#' cor.test(data$V1, data$V2)
#' round(c(mean(data$V1), sd(data$V1)), 1)
#' round(c(mean(data$V2), sd(data$V2)), 1)
#' summary(lm(V2 ~ V1, data = data))
#'
#' # Generate multiple variables
#' cor_matrix <- matrix(c(
#'   1.0, 0.2, 0.4,
#'   0.2, 1.0, 0.3,
#'   0.4, 0.3, 1.0
#' ),
#' nrow = 3
#' )
#'
#' data <- simulate_correlation(r = cor_matrix, names = c("y", "x1", "x2"))
#' cor(data)
#' summary(lm(y ~ x1, data = data))
#'
#' # t-test --------------------------------
#' data <- simulate_ttest(n = 30, d = 0.3)
#' plot(data$V1, data$V0)
#' round(c(mean(data$V1), sd(data$V1)), 1)
#' diff(t.test(data$V1 ~ data$V0)$estimate)
#' summary(lm(V1 ~ V0, data = data))
#' summary(glm(V0 ~ V1, data = data, family = "binomial"))
#' @export
simulate_correlation <- function(n = 100, r = 0.5, mean = 0, sd = 1, names = NULL, ...) {
  if (!requireNamespace("MASS", quietly = TRUE)) {
    stop("Package 'MASS' required for this function to work. Please install it by running `install.packages('MASS')`.")
  }

  # Define matrix
  if (is.matrix(r)) {
    if (isSymmetric(r)) {
      if (any(r > 1)) {
        stop("'r' should only contain values between -1 and 1.")
      } else {
        sigma <- r
      }
    } else {
      stop("'r' should be a symetric matrix (relative to the diagonal).")
    }
  } else if (length(r) == 1) {
    if (abs(r) > 1) {
      stop("'r' should only contain values between -1 and 1.")
    } else {
      sigma <- matrix(c(1, r, r, 1), nrow = 2)
    }
  } else {
    stop("'r' should be a value (e.g., r = 0.5) or a square matrix.")
  }


  # Get data
  data <- MASS::mvrnorm(
    n = n,
    mu = rep_len(0, ncol(sigma)), # Means of variables
    Sigma = sigma,
    empirical = TRUE
  )

  # Adjust scale
  if (any(sd != 1)) {
    data <- t(t(data) * rep_len(sd, ncol(sigma)))
  }

  # Adjust mean
  if (any(mean != 0)) {
    data <- t(t(data) + rep_len(mean, ncol(sigma)))
  }

  data <- as.data.frame(data)

  # Rename
  if (!is.null(names)) {
    if (length(names) == ncol(data)) {
      names(data) <- names
    }
  }
  data
}



#' @rdname simulate_correlation
#' @export
simulate_ttest <- function(n = 100, d = 0.5, names = NULL, ...) {
  x <- distribution_normal(n, 0, 1) # Continuous variables
  z <- 0 + d * x # Linear combination
  pr <- 1 / (1 + exp(-z)) # Pass it through an inverse logit function
  y <- distribution_binomial(n, 1, pr, random = 3) # Bernoulli response variable

  data <- data.frame(y = as.factor(y), x = x)
  names(data) <- paste0("V", 0:(ncol(data) - 1))

  if (!is.null(names)) {
    if (length(names) == ncol(data)) {
      names(data) <- names
    }
  }
  data
}
back to top