Revision 94a7e298b1a50d93e8a9ccb813a070f7b30f3da1 authored by Christian Thiele on 21 March 2018, 08:27:24 UTC, committed by cran-robot on 21 March 2018, 08:27:24 UTC
0 parent
cutpointr.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cutpointr.R
\name{cutpointr}
\alias{cutpointr}
\alias{cutpointr.default}
\alias{cutpointr.numeric}
\title{Determine and evaluate optimal cutpoints}
\usage{
cutpointr(...)
\method{cutpointr}{default}(data, x, class, subgroup = NULL,
method = maximize_metric, metric = sum_sens_spec, pos_class = NULL,
neg_class = NULL, direction = NULL, boot_runs = 0,
use_midpoints = FALSE, break_ties = c, na.rm = FALSE,
allowParallel = FALSE, silent = FALSE, tol_metric = 1e-06, ...)
\method{cutpointr}{numeric}(x, class, subgroup = NULL,
method = maximize_metric, metric = sum_sens_spec, pos_class = NULL,
neg_class = NULL, direction = NULL, boot_runs = 0,
use_midpoints = FALSE, break_ties = median, na.rm = FALSE,
allowParallel = FALSE, silent = FALSE, tol_metric = 1e-06, ...)
}
\arguments{
\item{...}{Further optional arguments that will be passed to method.
minimize_metric and maximize_metric pass ... to metric.}
\item{data}{A data.frame with the data needed for x, class and subgroup.}
\item{x}{The variable name without quotes to be used for classification,
e.g. predictions, or an expression. The raw vector of values if the data argument
is unused.}
\item{class}{The variable name without quotes indicating class membership
or an expression. The raw vector of values if the data argument is unused.}
\item{subgroup}{An additional covariate that identifies subgroups or the raw data if
data = NULL. Separate optimal cutpoints will be determined per group.
Numeric, character and factor are allowed.}
\item{method}{(function) A function for determining cutpoints. Can
be user-supplied or one of the built-in methods. See details.}
\item{metric}{(function) The function for computing a metric when using
maximize_metric or minimize_metric as method and for the
out-of-bag values during bootstrapping. A way of internally validating the performance.
User defined functions can be supplied, see details.}
\item{pos_class}{(optional) The value of class that indicates the positive class.}
\item{neg_class}{(optional) The value of class that indicates the negative class.}
\item{direction}{(character, optional) Use ">=" or "<=" to indicate whether x
is supposed to be larger or smaller for the positive class.}
\item{boot_runs}{(numerical) If positive, this number of bootstrap samples
will be used to assess the variability and the out-of-sample performance.}
\item{use_midpoints}{(logical) If TRUE (default FALSE) the returned optimal
cutpoint will be the mean of the optimal cutpoint and the next highest
observation (for direction = ">=") or the next lowest observation
(for direction = "<=") which avoids biasing the optimal cutpoint.}
\item{break_ties}{If multiple cutpoints are found, they can be summarized using
this function, e.g. mean or median. To return all cutpoints use c as the function.}
\item{na.rm}{(logical) Set to TRUE (default FALSE) to keep only complete
cases of x, class and subgroup (if specified). Missing values with
na.rm = FALSE will raise an error.}
\item{allowParallel}{(logical) If TRUE, the bootstrapping will be parallelized
using foreach. A local cluster, for example, should be started manually
beforehand.}
\item{silent}{(logical) If TRUE suppresses all messages.}
\item{tol_metric}{All cutpoints will be returned that lead to a metric
value in the interval [m_max - tol_metric, m_max + tol_metric] where
m_max is the maximum achievable metric value. This can be used to return
multiple decent cutpoints and to avoid floating-point problems. Not supported
by all \code{method} functions, see details.}
}
\value{
A cutpointr object which is also a data.frame and tbl_df.
}
\description{
Using predictions (or e.g. biological marker values) and binary class labels, this function
will determine "optimal" cutpoints using various selectable methods. The
methods for cutpoint determination can be evaluated using bootstrapping. An
estimate of the cutpoint variability and the out-of-sample performance will then
be returned.
}
\details{
If \code{direction} and/or \code{pos_class} and \code{neg_class} are not given, the function will
assume that higher values indicate the positive class and use the class
with a higher median as the positive class.
Different methods can be selected for determining the optimal cutpoint via
the method argument. The package includes the following method functions:
\itemize{
\item \code{maximize_metric}: Maximize the metric function
\item \code{minimize_metric}: Minimize the metric function
\item \code{maximize_loess_metric}: Maximize the metric function after LOESS
smoothing
\item \code{minimize_loess_metric}: Minimize the metric function after LOESS
smoothing
\item \code{maximize_spline_metric}: Maximize the metric function after spline
smoothing
\item \code{minimize_spline_metric}: Minimize the metric function after spline
smoothing
\item \code{maximize_boot_metric}: Maximize the metric function as a summary of
the optimal cutpoints in bootstrapped samples
\item \code{minimize_boot_metric}: Minimize the metric function as a summary of
the optimal cutpoints in bootstrapped samples
\item \code{oc_youden_kernel}: Maximize the Youden-Index after kernel smoothing
the distributions of the two classes
\item \code{oc_youden_normal}: Maximize the Youden-Index parametrically
assuming normally distributed data in both classes
\item \code{oc_manual}: Specify the cutpoint manually
}
User-defined functions can be supplied to method, too. As a reference,
the code of all included method functions can be accessed by simply typing
their name. To define a new method function, create a function that may take
as input(s):
\itemize{
\item \code{data}: A \code{data.frame} or \code{tbl_df}
\item \code{x}: (character) The name of the predictor or independent variable
\item \code{class}: (character) The name of the class or dependent variable
\item \code{metric_func}: A function for calculating a metric, e.g. accuracy
\item \code{pos_class}: The positive class
\item \code{neg_class}: The negative class
\item \code{direction}: ">=" if the positive class has higher x values, "<=" otherwise
\item \code{tol_metric}: (numeric) In the built-in methods a tolerance around
the optimal metric value
\item \code{use_midpoints}: (logical) In the built-in methods whether to
use midpoints instead of exact optimal cutpoints
\item \code{...}: Further arguments
}
The \code{...} argument can be used to avoid an error if not all of the above
arguments are needed or in order to pass additional arguments to method.
The function should return a \code{data.frame} or \code{tbl_df} with
one row, the column "optimal_cutpoint", and an optional column with an arbitrary name
with the metric value at the optimal cutpoint.
Built-in metric functions include:
\itemize{
\item \code{accuracy}: Fraction correctly classified
\item \code{youden}: Youden- or J-Index = sensitivity + specificity - 1
\item \code{sum_sens_spec}: sensitivity + specificity
\item \code{sum_ppv_npv}: The sum of positive predictive value (PPV) and negative
predictive value (NPV)
\item \code{prod_sens_spec}: sensitivity * specificity
\item \code{prod_ppv_npv}: The product of positive predictive value (PPV) and
negative predictive value (NPV)
\item \code{cohens_kappa}: Cohen's Kappa
\item \code{abs_d_sens_spec}: The absolute difference between
sensitivity and specificity
\item \code{abs_d_ppv_npv}: The absolute difference between positive predictive
value (PPV) and negative predictive value (NPV)
\item \code{p_chisquared}: The p-value of a chi-squared test on the confusion
matrix of predictions and observations
\item \code{odds_ratio}: The odds ratio calculated as (TP / FP) / (FN / TN)
\item \code{risk_ratio}: The risk ratio (relative risk) calculated as
(TP / (TP + FN)) / (FP / (FP + TN))
\item positive and negative likelihood ratio calculated as
\code{plr} = true positive rate / false positive rate and
\code{nlr} = false negative rate / true negative rate
\item \code{misclassification_cost}: The sum of the misclassification cost of
false positives and false negatives fp * cost_fp + fn * cost_fn.
Additional arguments to cutpointr: \code{cost_fp}, \code{cost_fn}
\item \code{total_utility}: The total utility of true / false positives / negatives
calculated as utility_tp * TP + utility_tn * TN - cost_fp * FP - cost_fn * FN.
Additional arguments to cutpointr: \code{utility_tp}, \code{utility_tn},
\code{cost_fp}, \code{cost_fn}
\item \code{F1_score}: The F1-score (2 * TP) / (2 * TP + FP + FN)
}
Furthermore, the following functions are included which can be used as metric
functions but are more useful for plotting purposes, for example in
plot_cutpointr, or for defining new metric functions:
\code{tp}, \code{fp}, \code{tn}, \code{fn}, \code{tpr}, \code{fpr},
\code{tnr}, \code{fnr}, \code{false_omission_rate},
\code{false_discovery_rate}, \code{ppv}, \code{npv}, \code{precision},
\code{recall}, \code{sensitivity}, and \code{specificity}.
User defined metric functions can be created as well which can accept the following
inputs as vectors:
\itemize{
\item \code{tp}: Vector of true positives
\item \code{fp}: Vector of false positives
\item \code{tn}: Vector of true negatives
\item \code{fn}: Vector of false negatives
\item \code{...}: If the metric function is used in conjunction with any of the
maximize / minimize methods, further arguments can be passed
}
The function should return a numeric vector or a matrix or a \code{data.frame}
with one column. If the column is named,
the name will be included in the output and plots. Avoid using names that
are identical to the column names that are by default returned by \pkg{cutpointr}.
If \code{boot_runs} is positive, that number of bootstrap samples will be drawn
and the optimal cutpoint using \code{method} will be determined. Additionally,
as a way of internal validation, the function in \code{metric} will be used to
score the out-of-bag predictions using the cutpoints determined by
\code{method}. Various default metrics are always included in the bootstrap results.
If multiple optimal cutpoints are found, the column optimal_cutpoint becomes a
list that contains the vector(s) of the optimal cutpoints.
If \code{use_midpoints = TRUE} the mean of the optimal cutpoint and the next
highest or lowest possible cutpoint is returned, depending on \code{direction}.
The \code{tol_metric} argument can be used to avoid floating-point problems
that may lead to exclusion of cutpoints that achieve the optimally achievable
metric value. Additionally, by selecting a large tolerance multiple cutpoints
can be returned that lead to decent metric values in the vicinity of the
optimal metric value. \code{tol_metric} is passed to \code{method} and is only
supported by the maximization and minimization functions, i.e.
\code{maximize_metric}, \code{minimize_metric}, \code{maximize_loess_metric},
\code{minimize_loess_metric}, \code{maximize_spline_metric}, and
\code{minimize_spline_metric}. In \code{maximize_boot_metric} and
\code{minimize_boot_metric} multiple optimal cutpoints will be passed to the
\code{summary_func} of these two functions.
}
\examples{
library(cutpointr)
## Optimal cutpoint for dsi
data(suicide)
opt_cut <- cutpointr(suicide, dsi, suicide)
opt_cut
summary(opt_cut)
plot(opt_cut)
\dontrun{
## Predict class for new observations
predict(opt_cut, newdata = data.frame(dsi = 0:5))
## Supplying raw data, same result
cutpointr(x = suicide$dsi, class = suicide$suicide)
## direction, class labels, method and metric can be defined manually
## Again, same result
cutpointr(suicide, dsi, suicide, direction = ">=", pos_class = "yes",
method = maximize_metric, metric = youden)
## Optimal cutpoint for dsi, as before, but for the separate subgroups
opt_cut <- cutpointr(suicide, dsi, suicide, gender)
opt_cut
## Bootstrapping also works on individual subgroups
## low boot_runs for illustrative purposes
set.seed(30)
opt_cut <- cutpointr(suicide, dsi, suicide, gender, boot_runs = 5)
opt_cut
summary(opt_cut)
plot(opt_cut)
## Transforming variables (unrealistic, just to show the functionality)
opt_cut <- cutpointr(suicide, x = log(dsi + 1), class = suicide == "yes",
subgroup = dsi \%\% 2 == 0)
opt_cut
predict(opt_cut, newdata = data.frame(dsi = 1:3))
## Parallelized bootstrapping (requires the doParallel and doRNG packages to be attached)
cl <- makeCluster(2) # 2 cores
registerDoParallel(cl)
registerDoRNG(12) # Reproducible parallel loops using doRNG
opt_cut <- cutpointr(suicide, dsi, suicide, gender,
boot_runs = 10, allowParallel = TRUE)
stopCluster(cl)
opt_cut
plot(opt_cut)
## Robust cutpoint method using kernel smoothing for optimizing Youden-Index
opt_cut <- cutpointr(suicide, dsi, suicide, gender,
method = oc_youden_kernel)
opt_cut
}
}
\seealso{
Other main cutpointr functions: \code{\link{cutpointr_}},
\code{\link{multi_cutpointr}},
\code{\link{predict.cutpointr}}, \code{\link{roc}}
}
Computing file changes ...