https://github.com/cran/kdevine
Tip revision: 950ccc0c0c9d3f6c25ac1cfbddbf65faf191659d authored by Thomas Nagler on 17 December 2018, 15:00:03 UTC
version 0.4.2
version 0.4.2
Tip revision: 950ccc0
kde1d.R
#' Univariate kernel density estimation for bounded and unbounded support
#'
#' Discrete variables are convoluted with the uniform distribution (see, Nagler,
#' 2017). If a variable should be treated as discrete, declare it as
#' [ordered()].
#'
#' @param x vector of length \eqn{n}.
#' @param mult numeric; the actual bandwidth used is \eqn{bw*mult}.
#' @param xmin lower bound for the support of the density.
#' @param xmax upper bound for the support of the density.
#' @param bw bandwidth parameter; has to be a positive number or \code{NULL};
#' the latter calls [`KernSmooth::dpik()`].
#' @param bw_min minimum value for the bandwidth.
#' @param ... unused.
#'
#' @return An object of class \code{kde1d}.
#'
#' @details If \code{xmin} or \code{xmax} are finite, the density estimate will
#' be 0 outside of \eqn{[xmin, xmax]}. Mirror-reflection is used to correct
#' for boundary bias. Discrete variables are convoluted with the uniform
#' distribution (see, Nagler, 2017).
#'
#' @seealso \code{\link{dkde1d}}, \code{\link{pkde1d}}, \code{\link{qkde1d}},
#' \code{\link{rkde1d}} \code{\link{plot.kde1d}} , \code{\link{lines.kde1d}}
#'
#' @references Nagler, T. (2017). *A generic approach to nonparametric function
#' estimation with mixed data.* [arXiv:1704.07457](https://arxiv.org/abs/1704.07457)
#'
#' @examples
#' data(wdbc, package = "kdecopula") # load data
#' fit <- kde1d(wdbc[, 5]) # estimate density
#' dkde1d(1000, fit) # evaluate density estimate
#'
#' @importFrom KernSmooth dpik
#' @importFrom MASS bandwidth.nrd
#' @importFrom cctools cont_conv
#' @export
kde1d <- function(x, mult = 1, xmin = -Inf, xmax = Inf, bw = NULL, bw_min = 0, ...) {
if (missing(x) & !is.null(list(...)$data)) # for backwards compatibility
x <- list(...)$data
## check/complete function call
stopifnot(NCOL(x) == 1)
if (is.null(xmin))
xmin <- NaN
if (is.null(xmax))
xmax <- NaN
if (!is.finite(xmin))
xmin <- NaN
if (!is.finite(xmax))
xmax <- NaN
if (!is.nan(xmax) & !is.nan(xmin)) {
if (xmin > xmax)
stop("'xmin' is larger than 'xmax'")
if (any(x < xmin) || any(x > xmax))
stop("Not all data are contained in the interval [xmin, xmax].")
} else if (!is.nan(xmin)) {
if (any(x < xmin))
stop("Not all data are larger than xmin.")
} else if (!is.nan(xmax)) {
if (any(x > xmax))
stop("Not all data are samller than xmax")
}
## make continuous if discrete
if (!is.ordered(x) & is.factor(x))
stop("Factors not allowed; use kdevine() or cctools::cckde().")
lvls <- levels(x)
x_cc <- cctools::cont_conv(x, ...)
## bandwidth selection
if (is.null(bw))
bw <- NA
if (is.na(bw)) {
# plug in method
bw <- try(KernSmooth::dpik(x_cc))
# if it fails: normal rule of thumb
if (inherits(bw, "try-error"))
bw <- MASS::bandwidth.nrd(x_cc)
# if everything fails:
if (all(bw == 0))
bw <- 1
}
# bound from below
if (length(attr(x_cc, "i_disc")) == 1) {
# for discrete use 1 - theta
bw <- max(bw, 0.5 - attr(x_cc, "theta"))
} else {
# for continuous use bw_min
bw <- max(bw, bw_min)
}
## return kde1d object
res <- list(x_cc = x_cc,
levels = lvls,
xmin = xmin,
xmax = xmax,
bw = bw * mult)
class(res) <- "kde1d"
res
}
#' Working with a kde1d object
#'
#' The density, cdf, or quantile function of a kernel density estimate are
#' evaluated at arbitrary points with \code{\link{dkde1d}}, \code{\link{pkde1d}},
#' and \code{\link{qkde1d}} respectively.
#'
#' @aliases pkde1d, qkde1d, rkde1d
#'
#' @param x vector of evaluation points.
#' @param obj a \code{kde1d} object.
#'
#'
#' @return The density or cdf estimate evaluated at \code{x}.
#'
#' @seealso
#' \code{\link{kde1d}}
#'
#' @examples
#' data(wdbc) # load data
#' fit <- kde1d(wdbc[, 5]) # estimate density
#' dkde1d(1000, fit) # evaluate density estimate
#' pkde1d(1000, fit) # evaluate corresponding cdf
#' qkde1d(0.5, fit) # quantile function
#' hist(rkde1d(100, fit)) # simulate
#'
#' @useDynLib kdevine
#' @importFrom Rcpp evalCpp
#' @importFrom cctools expand_as_numeric
#' @export
dkde1d <- function(x, obj) {
if (is.data.frame(x))
x <- x[[1]]
if (!is.ordered(x))
stopifnot(!is.factor(x))
x <- cctools::expand_as_numeric(x)
f <- eval_kde1d(sort(obj$x_cc), x, obj$xmin, obj$xmax, obj$bw)
if (length(attr(obj$x_cc, "i_disc") == 1)) {
# for discrete variables we can normalize
x_all_num <- expand_as_numeric(as.ordered(obj$levels))
f_all <- eval_kde1d(sort(obj$x_cc), x_all_num, obj$xmin, obj$xmax, obj$bw)
f <- f / sum(f_all)
}
f
}
#' @rdname dkde1d
#' @export
pkde1d <- function(x, obj) {
if (is.data.frame(x))
x <- x[[1]]
if (!is.ordered(x))
stopifnot(!is.factor(x))
x <- cctools::expand_as_numeric(x)
if (length(attr(obj$x_cc, "i_disc") == 1)) {
# for discrete variables we have to add the missing probability mass
x_all_num <- expand_as_numeric(as.ordered(obj$levels))
f_all <- dkde1d(x_all_num, obj)
p <- sapply(x, function(y) sum(f_all[x_all_num <= y]))
} else {
p <- eval_pkde1d(sort(obj$x_cc), x, obj$xmin, obj$xmax, obj$bw)
}
p
}
#' @rdname dkde1d
#' @export
qkde1d <- function(x, obj) {
if (is.data.frame(x))
x <- x[[1]]
stopifnot(all((x >= 0) & (x <= 1)))
q <- eval_qkde1d(sort(obj$x_cc), x, obj$xmin, obj$xmax, obj$bw)
## for discrete variables compute quantile from the density
if (length(attr(obj$x_cc, "i_disc") == 1)) {
x_all_num <- expand_as_numeric(as.ordered(obj$levels))
# pdf at all possible values of x
dd <- eval_kde1d(sort(obj$x_cc), x_all_num, obj$xmin, obj$xmax, obj$bw)
pp <- c(cumsum(dd)) / sum(dd)
# generalized inverse
q <- x_all_num[vapply(x, function(y) which(y <= pp)[1], integer(1))]
q <- ordered(obj$levels[q + 1], levels = obj$levels)
}
q
}
#' @param n integer; number of observations.
#' @param quasi logical; the default (\code{FALSE}) returns pseudo-random
#' numbers, use \code{TRUE} for quasi-random numbers (generalized Halton, see
#' \code{\link[qrng:ghalton]{ghalton}}).
#'
#' @rdname dkde1d
#' @importFrom qrng ghalton
#' @importFrom stats runif
#' @export
rkde1d <- function(n, obj, quasi = FALSE) {
# simulate (psuedo/quasi) uniform random variables
if (!quasi) {
U <- runif(n)
} else {
U <- ghalton(n, d = 1)
}
# simulated data from KDE is the quantile transform of U
qkde1d(U, obj)
}
#' Plotting kde1d objects
#'
#' @aliases lines.kde1d
#' @method plot kde1d
#'
#' @param x \code{kde1d} object.
#' @param ... further arguments passed to \code{\link{plot.default}}.
#'
#' @seealso
#' \code{\link{kde1d}}
#' \code{\link{lines.kde1d}}
#'
#' @examples
#' data(wdbc) # load data
#' fit <- kde1d(wdbc[, 7]) # estimate density
#' plot(fit) # plot density estimate
#'
#' fit2 <- kde1d(as.ordered(wdbc[, 1])) # discrete variable
#' plot(fit2, col = 2)
#'
#' @importFrom graphics plot
#' @importFrom utils modifyList
#' @export
plot.kde1d <- function(x, ...) {
p.l <- if (is.nan(x$xmin)) min(x$x_cc) - x$bw else x$xmin
p.u <- if (is.nan(x$xmax)) max(x$x_cc) + x$bw else x$xmax
ev <- seq(p.l, p.u, l = 100)
plot_type <- "l" # for continuous variables, use a line plot
if (length(attr(x$x_cc, "i_disc")) == 1) {
ev <- as.ordered(x$levels)
plot_type <- "h" # for discrete variables, use a histrogram
}
fhat <- dkde1d(expand_as_numeric(ev), x)
pars <- list(x = ev,
y = fhat,
type = plot_type,
xlab = "x",
ylab = "density",
ylim = c(0, 1.1 * max(fhat)))
do.call(plot, modifyList(pars, list(...)))
}
#' @method lines kde1d
#'
#' @rdname plot.kde1d
#' @importFrom graphics lines
#' @importFrom utils modifyList
#' @export
lines.kde1d <- function(x, ...) {
if (length(attr(x$x_cc, "i_disc") == 1))
stop("lines does not work for discrete estimates.")
p.l <- if (is.nan(x$xmin)) min(x$x_cc) - x$bw else x$xmin
p.u <- if (is.nan(x$xmax)) max(x$x_cc) + x$bw else x$xmax
ev <- seq(p.l, p.u, l = 100)
fhat <- dkde1d(ev, x)
pars <- list(x = ev, y = fhat)
do.call(lines, modifyList(pars, list(...)))
}
#' #' Bandwidth selection for kde1d
#' #'
#' #' @param data data vector.
#' #' @param xmin lower bound for the support of the density.
#' #' @param xmax upper bound for the support of the density.
#' #' @param n.subs number of subsamples used to estimate the cross-validation
#' #' criterion.
#' #'
#' #' @importFrom stats integrate mad sd optimize
#' bw_kde1d <- function(data, xmin, xmax, n.subs) {
#' n <- length(data)
#' cK <- 1
#' if (missing(xmin))
#' xmin <- NaN
#' if (missing(xmax))
#' xmax <- NaN
#' ## bandwidth selection
#' if (is.nan(xmin) & is.nan(xmax)) {
#' ## for unbounded data calll hpi from ks package
#' bw <- hpi(data)
#' } else {
#' ## LSCV for bounded data
#' subs <- sample(n.subs)
#' M1 <- data[subs] %*% t(rep(1, n.subs))
#' M2 <- rep(1, n.subs) %*% t(data[subs])
#' difs <- M1 - M2
#' sums <- M1 + M2
#' diag(difs) <- rep(NA, n.subs)
#' lsc <- function(bw) {
#' K <- kern_gauss(difs / bw) / bw
#' if (!is.nan(xmin))
#' K <- K + kern_gauss((2 * xmin - sums) / bw) / bw
#' if (!is.nan(xmax))
#' K <- K + kern_gauss((2 * xmax - sums) / bw) / bw
#' lowr <- if (is.list(data))
#' data <- data[[1]]
#' if (is.nan(xmin)) min(data) - bw else xmin
#' upr <- if (is.nan(xmax)) max(data) + bw else xmax
#' integrand <- function(x) eval_kde1d(sort(data), x, xmin, xmax, bw)^2
#'
#' tmp1 <- try(integrate(integrand, lowr, upr, rel.tol = 5e-2)$value,
#' silent = TRUE)
#' tmp2 <- 2 * sum(K, na.rm = TRUE) / (n.subs*(n.subs-1))
#'
#' if ("try-error" %in% class(tmp1)) Inf else tmp1 - tmp2
#' }
#' # starting parameter with normal reference rule
#' bw.start <- cK * min(sd(data), mad(data)) * n^(-1/5)
#' if (bw.start == 0)
#' bw.start = 0.5
#' opt <- suppressWarnings(optimize(lsc,
#' lower = bw.start / 5,
#' upper = bw.start * 5))
#' bw <- opt$minimum * (n / n.subs)^(-1/5)
#'
#' # correction for discreteness
#' bw <- min(bw * (1 + log(n/length(levels(as.factor(data))))),
#' 0.5 * diff(range(data)))
#' }
#'
#' ## return results
#' bw
#' }