We are hiring ! See our job offers.
Revision 06a935fcd0daca8842f0d3138ff354753033c0ec authored by Thomas Nagler on 11 May 2021, 23:50:12 UTC, committed by cran-robot on 11 May 2021, 23:50:12 UTC
1 parent 950ccc0
Raw File
kde1d.R
#' Univariate kernel density estimation for bounded and unbounded support
#'
#' Discrete variables are convoluted with the uniform distribution (see, Nagler,
#' 2017). If a variable should be treated as discrete, declare it as
#' [ordered()].
#'
#' @param x vector of length \eqn{n}.
#' @param mult numeric; the actual bandwidth used is \eqn{bw*mult}.
#' @param xmin lower bound for the support of the density.
#' @param xmax upper bound for the support of the density.
#' @param bw bandwidth parameter; has to be a positive number or \code{NULL};
#'   the latter calls [`KernSmooth::dpik()`].
#' @param bw_min minimum value for the bandwidth.
#' @param ... unused.
#'
#' @return An object of class \code{kde1d}.
#'
#' @details If \code{xmin} or \code{xmax} are finite, the density estimate will
#'   be 0 outside of \eqn{[xmin, xmax]}. Mirror-reflection is used to correct
#'   for boundary bias. Discrete variables are convoluted with the uniform
#'   distribution (see, Nagler, 2017).
#'
#' @seealso \code{\link{dkde1d}}, \code{\link{pkde1d}}, \code{\link{qkde1d}},
#'   \code{\link{rkde1d}} \code{\link{plot.kde1d}} , \code{\link{lines.kde1d}}
#'
#' @references Nagler, T. (2017). *A generic approach to nonparametric function
#'   estimation with mixed data.* [arXiv:1704.07457](https://arxiv.org/abs/1704.07457)
#'
#' @examples
#' data(wdbc, package = "kdecopula")  # load data
#' fit <- kde1d(wdbc[, 5])            # estimate density
#' dkde1d(1000, fit)                  # evaluate density estimate
#'
#' @importFrom KernSmooth dpik
#' @importFrom MASS bandwidth.nrd
#' @importFrom cctools cont_conv
#' @export
kde1d <- function(x, mult = 1, xmin = -Inf, xmax = Inf, bw = NULL, bw_min = 0, ...) {
    if (missing(x) & !is.null(list(...)$data))  # for backwards compatibility
        x <- list(...)$data

    ## check/complete function call
    stopifnot(NCOL(x) == 1)
    if (is.null(xmin))
        xmin <- NaN
    if (is.null(xmax))
        xmax <- NaN
    if (!is.finite(xmin))
        xmin <- NaN
    if (!is.finite(xmax))
        xmax <- NaN
    if (!is.nan(xmax) & !is.nan(xmin)) {
        if (xmin > xmax)
            stop("'xmin' is larger than 'xmax'")
        if (any(x < xmin) || any(x > xmax))
            stop("Not all data are contained in the interval [xmin, xmax].")
    } else if (!is.nan(xmin)) {
        if (any(x < xmin))
            stop("Not all data are larger than xmin.")
    } else if (!is.nan(xmax)) {
        if (any(x > xmax))
            stop("Not all data are samller than xmax")
    }

    ## make continuous if discrete
    if (!is.ordered(x) & is.factor(x))
        stop("Factors not allowed; use kdevine() or cctools::cckde().")
    lvls <- levels(x)
    x_cc <- cctools::cont_conv(x, ...)

    ## bandwidth selection
    if (is.null(bw))
        bw <- NA
    if (is.na(bw)) {
        # plug in method
        bw <- try(KernSmooth::dpik(x_cc))
        # if it fails: normal rule of thumb
        if (inherits(bw, "try-error"))
            bw <- MASS::bandwidth.nrd(x_cc)
        # if everything fails:
        if (all(bw == 0))
            bw <- 1
    }
    # bound from below
    if (length(attr(x_cc, "i_disc")) == 1) {
        # for discrete use 1 - theta
        bw <- max(bw, 0.5 - attr(x_cc, "theta"))
    } else {
        # for continuous use bw_min
        bw <- max(bw, bw_min)
    }

    ## return kde1d object
    res <- list(x_cc = x_cc,
                levels = lvls,
                xmin = xmin,
                xmax = xmax,
                bw   = bw * mult)
    class(res) <- "kde1d"
    res
}


#' Working with a kde1d object
#'
#' The density, cdf, or quantile function of a kernel density estimate are
#' evaluated at arbitrary points with \code{\link{dkde1d}}, \code{\link{pkde1d}},
#' and \code{\link{qkde1d}} respectively.
#'
#' @aliases pkde1d, qkde1d, rkde1d
#'
#' @param x vector of evaluation points.
#' @param obj a \code{kde1d} object.
#'
#'
#' @return The density or cdf estimate evaluated at \code{x}.
#'
#' @seealso
#' \code{\link{kde1d}}
#'
#' @examples
#' data(wdbc)  # load data
#' fit <- kde1d(wdbc[, 5])  # estimate density
#' dkde1d(1000, fit)        # evaluate density estimate
#' pkde1d(1000, fit)        # evaluate corresponding cdf
#' qkde1d(0.5, fit)         # quantile function
#' hist(rkde1d(100, fit))   # simulate
#'
#' @useDynLib kdevine
#' @importFrom Rcpp evalCpp
#' @importFrom cctools expand_as_numeric
#' @export
dkde1d <- function(x, obj) {
    if (is.data.frame(x))
        x <- x[[1]]
    if (!is.ordered(x))
        stopifnot(!is.factor(x))
    x <- cctools::expand_as_numeric(x)
    f <- eval_kde1d(sort(obj$x_cc), x, obj$xmin, obj$xmax, obj$bw)
    if (length(attr(obj$x_cc, "i_disc") == 1)) {
        # for discrete variables we can normalize
        x_all_num <- expand_as_numeric(as.ordered(obj$levels))
        f_all <- eval_kde1d(sort(obj$x_cc), x_all_num, obj$xmin, obj$xmax, obj$bw)
        f <- f / sum(f_all)
    }

    f
}

#' @rdname dkde1d
#' @export
pkde1d <- function(x, obj) {
    if (is.data.frame(x))
        x <- x[[1]]
    if (!is.ordered(x))
        stopifnot(!is.factor(x))
    x <- cctools::expand_as_numeric(x)
    if (length(attr(obj$x_cc, "i_disc") == 1)) {
        # for discrete variables we have to add the missing probability mass
        x_all_num <- expand_as_numeric(as.ordered(obj$levels))
        f_all <- dkde1d(x_all_num, obj)
        p <- sapply(x, function(y) sum(f_all[x_all_num <= y]))
    } else {
        p <- eval_pkde1d(sort(obj$x_cc), x, obj$xmin, obj$xmax, obj$bw)
    }

    p
}

#' @rdname dkde1d
#' @export
qkde1d <- function(x, obj) {
    if (is.data.frame(x))
        x <- x[[1]]
    stopifnot(all((x >= 0) & (x <= 1)))
    q <- eval_qkde1d(sort(obj$x_cc), x, obj$xmin, obj$xmax, obj$bw)

    ## for discrete variables compute quantile from the density
    if (length(attr(obj$x_cc, "i_disc") == 1)) {
        x_all_num <- expand_as_numeric(as.ordered(obj$levels))

        # pdf at all possible values of x
        dd <- eval_kde1d(sort(obj$x_cc), x_all_num, obj$xmin, obj$xmax, obj$bw)
        pp <- c(cumsum(dd)) / sum(dd)

        # generalized inverse
        q <- x_all_num[vapply(x, function(y) which(y <= pp)[1], integer(1))]
        q <- ordered(obj$levels[q + 1], levels = obj$levels)
    }

    q
}

#' @param n integer; number of observations.
#' @param quasi logical; the default (\code{FALSE}) returns pseudo-random
#' numbers, use \code{TRUE} for quasi-random numbers (generalized Halton, see
#' \code{\link[qrng:ghalton]{ghalton}}).
#'
#' @rdname dkde1d
#' @importFrom qrng ghalton
#' @importFrom stats runif
#' @export
rkde1d <- function(n, obj, quasi = FALSE) {
    # simulate (psuedo/quasi) uniform random variables
    if (!quasi) {
        U <- runif(n)
    } else {
        U <- ghalton(n, d = 1)
    }
    # simulated data from KDE is the quantile transform of U
    qkde1d(U, obj)
}

#' Plotting kde1d objects
#'
#' @aliases lines.kde1d
#' @method plot kde1d
#'
#' @param x \code{kde1d} object.
#' @param ... further arguments passed to \code{\link{plot.default}}.
#'
#' @seealso
#' \code{\link{kde1d}}
#' \code{\link{lines.kde1d}}
#'
#' @examples
#' data(wdbc)  # load data
#' fit <- kde1d(wdbc[, 7])  # estimate density
#' plot(fit)  # plot density estimate
#'
#' fit2 <- kde1d(as.ordered(wdbc[, 1])) # discrete variable
#' plot(fit2, col = 2)
#'
#' @importFrom graphics plot
#' @importFrom utils modifyList
#' @export
plot.kde1d <- function(x, ...) {
    p.l <- if (is.nan(x$xmin)) min(x$x_cc) - x$bw else x$xmin
    p.u <- if (is.nan(x$xmax)) max(x$x_cc) + x$bw else x$xmax
    ev <- seq(p.l, p.u, l = 100)
    plot_type <- "l"  # for continuous variables, use a line plot
    if (length(attr(x$x_cc, "i_disc")) == 1) {
        ev <- as.ordered(x$levels)
        plot_type <- "h"  # for discrete variables, use a histrogram
    }
    fhat <- dkde1d(expand_as_numeric(ev), x)

    pars <- list(x = ev,
                 y = fhat,
                 type = plot_type,
                 xlab = "x",
                 ylab = "density",
                 ylim = c(0, 1.1 * max(fhat)))

    do.call(plot, modifyList(pars, list(...)))
}

#' @method lines kde1d
#'
#' @rdname plot.kde1d
#' @importFrom graphics lines
#' @importFrom utils modifyList
#' @export
lines.kde1d <- function(x, ...) {
    if (length(attr(x$x_cc, "i_disc") == 1))
        stop("lines does not work for discrete estimates.")
    p.l <- if (is.nan(x$xmin)) min(x$x_cc) - x$bw else x$xmin
    p.u <- if (is.nan(x$xmax)) max(x$x_cc) + x$bw else x$xmax
    ev <- seq(p.l, p.u, l = 100)

    fhat <- dkde1d(ev, x)

    pars <- list(x = ev, y = fhat)
    do.call(lines, modifyList(pars, list(...)))
}



#' #' Bandwidth selection for kde1d
#' #'
#' #' @param data data vector.
#' #' @param xmin lower bound for the support of the density.
#' #' @param xmax upper bound for the support of the density.
#' #' @param n.subs number of subsamples used to estimate the cross-validation
#' #' criterion.
#' #'
#' #' @importFrom stats integrate mad sd optimize
#' bw_kde1d <- function(data, xmin, xmax, n.subs) {
#'     n <- length(data)
#'     cK <- 1
#'     if (missing(xmin))
#'         xmin <- NaN
#'     if (missing(xmax))
#'         xmax <- NaN
#'     ## bandwidth selection
#'     if (is.nan(xmin) & is.nan(xmax)) {
#'         ## for unbounded data calll hpi from ks package
#'         bw <- hpi(data)
#'     } else {
#'         ## LSCV for bounded data
#'         subs <- sample(n.subs)
#'         M1 <- data[subs] %*% t(rep(1, n.subs))
#'         M2 <- rep(1, n.subs) %*% t(data[subs])
#'         difs <- M1 - M2
#'         sums <- M1 + M2
#'         diag(difs) <- rep(NA, n.subs)
#'         lsc <- function(bw) {
#'             K <- kern_gauss(difs / bw) / bw
#'             if (!is.nan(xmin))
#'                 K <- K + kern_gauss((2 * xmin - sums) / bw) / bw
#'             if (!is.nan(xmax))
#'                 K <- K + kern_gauss((2 * xmax - sums) / bw) / bw
#'             lowr <-    if (is.list(data))
#'                 data <- data[[1]]
#'             if (is.nan(xmin)) min(data) - bw else xmin
#'             upr  <- if (is.nan(xmax)) max(data) + bw else xmax
#'             integrand <- function(x) eval_kde1d(sort(data), x, xmin, xmax, bw)^2
#'
#'             tmp1 <- try(integrate(integrand, lowr, upr, rel.tol = 5e-2)$value,
#'                         silent = TRUE)
#'             tmp2 <- 2 * sum(K, na.rm = TRUE) / (n.subs*(n.subs-1))
#'
#'             if ("try-error" %in% class(tmp1)) Inf else tmp1 - tmp2
#'         }
#'         # starting parameter with normal reference rule
#'         bw.start <- cK * min(sd(data), mad(data)) * n^(-1/5)
#'         if (bw.start == 0)
#'             bw.start = 0.5
#'         opt <- suppressWarnings(optimize(lsc,
#'                                          lower = bw.start / 5,
#'                                          upper = bw.start * 5))
#'         bw <- opt$minimum * (n / n.subs)^(-1/5)
#'
#'         # correction for discreteness
#'         bw <- min(bw * (1 + log(n/length(levels(as.factor(data))))),
#'                   0.5 * diff(range(data)))
#'     }
#'
#'     ## return results
#'     bw
#' }
back to top