Content - 9bb662e571834525d76fec4bc634639f1780b5f0 - 5324d50/R/check_prior.R

swh:1:snp:cf20732a2a91717cec7759cd75c5d1a72600b018

Tip revision: 85d0a0463621c30063752c5577e591be0de5ae60 authored by Dominique Makowski on 26 July 2021, 08:40:08 UTC
version 0.10.5
Tip revision: 85d0a04
check_prior.R
#' Check if Prior is Informative
#'
#' Performs a simple test to check whether the prior is informative to the posterior. This idea, and the accompanying heuristics, were discussed in \href{https://statmodeling.stat.columbia.edu/2019/08/10/}{this blogpost}.
#'
#' @param method Can be \code{"gelman"} or \code{"lakeland"}. For the
#'   \code{"gelman"} method, if the SD of the posterior is more than 0.1 times
#'   the SD of the prior, then the prior is considered as informative. For the
#'   \code{"lakeland"} method, the prior is considered as informative if the
#'   posterior falls within the 95\% HDI of the prior.
#' @param simulate_priors Should prior distributions be simulated using
#'   \code{\link{simulate_prior}} (default; faster) or sampled via
#'   \code{\link{unupdate}} (slower, more accurate).
#' @inheritParams effective_sample
#' @inheritParams hdi
#'
#' @return A data frame with two columns: The parameter names and the quality
#'   of the prior (which might be \code{"informative"}, \code{"uninformative"})
#'   or \code{"not determinable"} if the prior distribution could not be
#'   determined).
#'
#' @examples
#' \dontrun{
#' library(bayestestR)
#' if (require("rstanarm")) {
#'   model <- stan_glm(mpg ~ wt + am, data = mtcars, chains = 1, refresh = 0)
#'   check_prior(model, method = "gelman")
#'   check_prior(model, method = "lakeland")
#'
#'   # An extreme example where both methods diverge:
#'   model <- stan_glm(mpg ~ wt,
#'     data = mtcars[1:3, ],
#'     prior = normal(-3.3, 1, FALSE),
#'     prior_intercept = normal(0, 1000, FALSE),
#'     refresh = 0
#'   )
#'   check_prior(model, method = "gelman")
#'   check_prior(model, method = "lakeland")
#'   plot(si(model)) # can provide visual confirmation to the Lakeland method
#' }
#' }
#' @references https://statmodeling.stat.columbia.edu/2019/08/10/
#' @export
check_prior <- function(model, method = "gelman", simulate_priors = TRUE, ...) {
  UseMethod("check_prior")
}





#' @export
check_prior.brmsfit <- function(model,
                                method = "gelman",
                                simulate_priors = TRUE,
                                effects = c("fixed", "random", "all"),
                                component = c("conditional", "zi", "zero_inflated", "all"),
                                parameters = NULL,
                                verbose = TRUE,
                                ...) {

  # check arguments
  effects <- match.arg(effects)
  component <- match.arg(component)

  posteriors <- insight::get_parameters(
    model,
    effects = effects,
    component = component,
    parameters = parameters
  )

  if (isTRUE(simulate_priors)) {
    priors <- simulate_prior(
      model,
      effects = effects,
      component = component,
      parameters = parameters,
      verbose = verbose
    )
  } else {
    priors <- unupdate(model, verbose = FALSE)
    priors <- insight::get_parameters(
      priors,
      effects = effects,
      component = component,
      parameters = parameters
    )
  }

  .check_prior(priors, posteriors, method,
    verbose = verbose,
    cleaned_parameters = insight::clean_parameters(model)
  )
}

#' @export
check_prior.stanreg <- check_prior.brmsfit

#' @export
check_prior.blavaan <- check_prior.brmsfit


#' @keywords internal
.check_prior <- function(priors, posteriors, method = "gelman", verbose = TRUE, cleaned_parameters = NULL) {

  # sanity check for matching parameters. Some weird priors like
  # rstanarm's R2 prior might cause problems

  if (!is.null(cleaned_parameters) && ncol(priors) != ncol(posteriors)) {

    ## TODO for now only fixed effects
    if ("Effects" %in% colnames(cleaned_parameters)) {
      cleaned_parameters <- cleaned_parameters[cleaned_parameters$Effects == "fixed", ]
    }

    # rename cleaned parameters, so they match name of prior parameter column
    cp <- cleaned_parameters$Cleaned_Parameter
    cp <- gsub("(.*)(\\.|\\[)\\d+(\\.|\\])", "\\1", cp)
    cp[cp == "Intercept"] <- "(Intercept)"
    cleaned_parameters$Cleaned_Parameter <- cp
    colnames(priors)[colnames(priors) == "Intercept"] <- "(Intercept)"

    # at this point, the colnames of "posteriors" should match "cp$Parameter",
    # while colnames of "priors" should match "cp$Cleaned_Parameter". To ensure
    # that ncol of priors is the same as ncol of posteriors, we now duplicate
    # prior columns and match them with the posteriors

    if (ncol(posteriors) > ncol(priors)) {
      matched_columns <- stats::na.omit(match(cleaned_parameters$Cleaned_Parameter, colnames(priors)))
      matched_column_names <- stats::na.omit(match(colnames(priors), cleaned_parameters$Cleaned_Parameter))
      priors <- priors[matched_columns]
    } else {
      matched_columns <- stats::na.omit(match(colnames(priors), cleaned_parameters$Cleaned_Parameter))
      matched_column_names <- stats::na.omit(match(cleaned_parameters$Cleaned_Parameter, colnames(priors)))
      priors <- priors[matched_columns]
    }
    colnames(priors) <- cleaned_parameters$Parameter[matched_column_names]
  }

  # still different ncols?
  if (ncol(priors) != ncol(posteriors)) {
    common_columns <- intersect(colnames(priors), colnames(posteriors))
    priors <- priors[common_columns]
    posteriors <- posteriors[common_columns]
    if (verbose) {
      warning("Parameters and priors could not be fully matched. Only returning results for parameters with matching priors.", call. = FALSE)
    }
  }


  # for priors whose distribution cannot be simulated, prior values are
  # all NA. Catch those, and warn user
  all_missing <- sapply(priors, function(i) {
    all(is.na(i))
  })

  if (any(all_missing) && verbose) {
    warning("Some priors could not be simulated.", call. = FALSE)
  }

  .gelman <- function(prior, posterior) {
    if (all(is.na(prior))) {
      "not determinable"
    } else if (stats::sd(posterior, na.rm = TRUE) > 0.1 * stats::sd(prior, na.rm = TRUE)) {
      "informative"
    } else {
      "uninformative"
    }
  }

  .lakeland <- function(prior, posterior) {
    if (all(is.na(prior))) {
      "not determinable"
    } else {
      hdi <- hdi(prior, ci = .95)
      r <- rope(posterior, ci = 1, range = c(hdi$CI_low, hdi$CI_high))
      if (as.numeric(r) > 0.99) {
        "informative"
      } else {
        "misinformative"
      }
    }
  }

  if (method == "gelman") {
    result <- mapply(.gelman, priors, posteriors)
  } else if (method == "lakeland") {
    result <- mapply(.lakeland, priors, posteriors)
  } else {
    stop("method should be 'gelman' or 'lakeland'.")
  }

  data.frame(
    Parameter = names(posteriors),
    Prior_Quality = unname(result),
    stringsAsFactors = FALSE
  )
}