https://github.com/cran/cutpointr
Raw File
Tip revision: 2900dc24d2c5a7d8fdb3f1abb1540fb704e51742 authored by Christian Thiele on 15 February 2021, 13:40:03 UTC
version 1.1.0
Tip revision: 2900dc2
cutpointr.html
<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />

<meta name="viewport" content="width=device-width, initial-scale=1" />

<meta name="author" content="Christian Thiele" />

<meta name="date" content="2021-02-15" />

<title>An introduction to cutpointr</title>

<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
document.addEventListener('DOMContentLoaded', function(e) {
  var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
  var i, h, a;
  for (i = 0; i < hs.length; i++) {
    h = hs[i];
    if (!/^h[1-6]$/i.test(h.tagName)) continue;  // it should be a header h1-h6
    a = h.attributes;
    while (a.length > 0) h.removeAttribute(a[0].name);
  }
});
</script>
<script>// Hide empty <a> tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) -->
// v0.0.1
// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020.

document.addEventListener('DOMContentLoaded', function() {
  const codeList = document.getElementsByClassName("sourceCode");
  for (var i = 0; i < codeList.length; i++) {
    var linkList = codeList[i].getElementsByTagName('a');
    for (var j = 0; j < linkList.length; j++) {
      if (linkList[j].innerHTML === "") {
        linkList[j].setAttribute('aria-hidden', 'true');
      }
    }
  }
});
</script>

<style type="text/css">
  /* Helper rules for inline code, text decoration, columns and task lists. */
  code {
    white-space: pre-wrap;
  }
  span.smallcaps {
    font-variant: small-caps;
  }
  span.underline {
    text-decoration: underline;
  }
  div.column {
    display: inline-block;
    vertical-align: top;
    width: 50%;
  }
  div.hanging-indent {
    margin-left: 1.5em;
    text-indent: -1.5em;
  }
  ul.task-list {
    list-style: none;
  }
</style>


<style type="text/css">code{white-space: pre;}</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
/* Empty rule. The inline script after this stylesheet looks for a rule whose
   selector is exactly "div.sourceCode" and, only if it sets color or
   background-color, re-inserts it as pre.sourceCode. With no declarations
   here, that script leaves this rule untouched. */
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */

</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    for (var j = 0; j < rules.length; j++) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") continue;
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') continue;
      // replace div.sourceCode by a pre.sourceCode rule
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>




<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
hr {
/* "border: none" clears all four sides; only the top edge is then drawn. */
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap; 
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
img {
/* White mat with a thin light-gray frame around every image. */
background-color: #FFFFFF;
padding: 2px;
border: 1px solid #CCCCCC;
border-radius: 3px;
margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }

code > span.kw { color: #555; font-weight: bold; } 
code > span.dt { color: #902000; } 
code > span.dv { color: #40a070; } 
code > span.bn { color: #d14; } 
code > span.fl { color: #d14; } 
code > span.ch { color: #d14; } 
code > span.st { color: #d14; } 
code > span.co { color: #888888; font-style: italic; } 
code > span.ot { color: #007020; } 
code > span.al { color: #ff0000; font-weight: bold; } 
code > span.fu { color: #900; font-weight: bold; } 
code > span.er { color: #a61717; background-color: #e3d2d2; } 
</style>




</head>

<body>




<h1 class="title toc-ignore">An introduction to cutpointr</h1>
<h4 class="author">Christian Thiele</h4>
<h4 class="date">2021-02-15</h4>



<p><strong>cutpointr</strong> is an R package for tidy calculation of “optimal” cutpoints. It supports several methods for calculating cutpoints and includes several metrics that can be maximized or minimized by selecting a cutpoint. Some of these methods are designed to be more robust than the simple empirical optimization of a metric. Additionally, <strong>cutpointr</strong> can automatically bootstrap the variability of the optimal cutpoints and return out-of-bag estimates of various performance metrics.</p>
<div id="installation" class="section level2">
<h2>Installation</h2>
<p>You can install <strong>cutpointr</strong> from CRAN using the menu in RStudio or simply:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1"></a><span class="kw">install.packages</span>(<span class="st">&quot;cutpointr&quot;</span>)</span></code></pre></div>
</div>
<div id="example" class="section level2">
<h2>Example</h2>
<p>For example, the optimal cutpoint for the included data set is 2 when maximizing the sum of sensitivity and specificity.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1"></a><span class="kw">library</span>(cutpointr)</span>
<span id="cb2-2"><a href="#cb2-2"></a><span class="kw">data</span>(suicide)</span>
<span id="cb2-3"><a href="#cb2-3"></a><span class="kw">head</span>(suicide)</span></code></pre></div>
<pre><code>##   age gender dsi suicide
## 1  29 female   1      no
## 2  26   male   0      no
## 3  26 female   0      no
## 4  27 female   0      no
## 5  28 female   0      no
## 6  53   male   2      no</code></pre>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1"></a>cp &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, </span>
<span id="cb4-2"><a href="#cb4-2"></a>                <span class="dt">method =</span> maximize_metric, <span class="dt">metric =</span> sum_sens_spec)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1"></a><span class="kw">summary</span>(cp)</span></code></pre></div>
<pre><code>## Method: maximize_metric 
## Predictor: dsi 
## Outcome: suicide 
## Direction: &gt;= 
## 
##     AUC   n n_pos n_neg
##  0.9238 532    36   496
## 
##  optimal_cutpoint sum_sens_spec    acc sensitivity specificity tp fn fp  tn
##                 2        1.7518 0.8647      0.8889      0.8629 32  4 68 428
## 
## Predictor summary: 
##     Data Min.   5% 1st Qu. Median      Mean 3rd Qu.  95% Max.       SD NAs
##  Overall    0 0.00       0      0 0.9210526       1 5.00   11 1.852714   0
##       no    0 0.00       0      0 0.6330645       0 4.00   10 1.412225   0
##      yes    0 0.75       4      5 4.8888889       6 9.25   11 2.549821   0</code></pre>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1"></a><span class="kw">plot</span>(cp)</span></code></pre></div>
<p><img src="" alt="Output of plot(cp) for the suicide data" style="display: block; margin: auto;" /></p>
<p>When considering the optimality of a cutpoint, we can only make a judgement based on the sample at hand. Thus, the estimated cutpoint may not be optimal within the population or on unseen data, which is why we sometimes put the “optimal” in quotation marks.</p>
<p><code>cutpointr</code> makes assumptions about the direction of the dependency between <code>class</code> and <code>x</code>, if <code>direction</code> and / or <code>pos_class</code> or <code>neg_class</code> are not specified. The same result as above can be achieved by manually defining <code>direction</code> and the positive / negative classes which is slightly faster, since the classes and direction don’t have to be determined:</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, <span class="dt">direction =</span> <span class="st">&quot;&gt;=&quot;</span>, <span class="dt">pos_class =</span> <span class="st">&quot;yes&quot;</span>,</span>
<span id="cb10-2"><a href="#cb10-2"></a>                     <span class="dt">neg_class =</span> <span class="st">&quot;no&quot;</span>, <span class="dt">method =</span> maximize_metric, <span class="dt">metric =</span> youden)</span></code></pre></div>
<p><code>opt_cut</code> is a data frame that contains the input data and the ROC curve (and optionally the bootstrap results) in a nested tibble. Methods for summarizing and plotting the data and results are included (e.g. <code>summary</code>, <code>plot</code>, <code>plot_roc</code>, <code>plot_metric</code>).</p>
<p>To inspect the optimization, the function of metric values per cutpoint can be plotted using <code>plot_metric</code>, if an optimization function was used that returns a metric column in the <code>roc_curve</code> column. For example, the <code>maximize_metric</code> and <code>minimize_metric</code> functions do so:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Metric values per cutpoint, as drawn by plot_metric(opt_cut)" style="display: block; margin: auto;" /></p>
<p>Predictions for new data can be made using <code>predict</code>:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1"></a><span class="kw">predict</span>(opt_cut, <span class="dt">newdata =</span> <span class="kw">data.frame</span>(<span class="dt">dsi =</span> <span class="dv">0</span><span class="op">:</span><span class="dv">5</span>))</span></code></pre></div>
<pre><code>## [1] &quot;no&quot;  &quot;no&quot;  &quot;yes&quot; &quot;yes&quot; &quot;yes&quot; &quot;yes&quot;</code></pre>
</div>
<div id="features" class="section level2">
<h2>Features</h2>
<ul>
<li>Calculation of optimal cutpoints in binary classification tasks</li>
<li>Tidy output, integrates well with functions from the tidyverse</li>
<li>Functions for plotting ROC curves, metric distributions and more</li>
<li>Bootstrapping for simulating the cutpoint variability and for obtaining out-of-bag estimates of various metrics (as a form of internal validation) with optional parallelisation</li>
<li>Multiple methods for calculating cutpoints</li>
<li>Multiple metrics can be chosen for maximization / minimization</li>
<li>Tidyeval</li>
</ul>
</div>
<div id="calculating-cutpoints" class="section level1">
<h1>Calculating cutpoints</h1>
<div id="method-functions-for-cutpoint-estimation" class="section level2">
<h2>Method functions for cutpoint estimation</h2>
<p>The included methods for calculating cutpoints are:</p>
<ul>
<li><code>maximize_metric</code>: Maximize the metric function</li>
<li><code>minimize_metric</code>: Minimize the metric function</li>
<li><code>maximize_loess_metric</code>: Maximize the metric function after LOESS smoothing</li>
<li><code>minimize_loess_metric</code>: Minimize the metric function after LOESS smoothing</li>
<li><code>maximize_spline_metric</code>: Maximize the metric function after spline smoothing</li>
<li><code>minimize_spline_metric</code>: Minimize the metric function after spline smoothing</li>
<li><code>maximize_gam_metric</code>: Maximize the metric function after smoothing via Generalized Additive Models</li>
<li><code>minimize_gam_metric</code>: Minimize the metric function after smoothing via Generalized Additive Models</li>
<li><code>maximize_boot_metric</code>: Bootstrap the optimal cutpoint when maximizing a metric</li>
<li><code>minimize_boot_metric</code>: Bootstrap the optimal cutpoint when minimizing a metric</li>
<li><code>oc_manual</code>: Specify the cutoff value manually</li>
<li><code>oc_mean</code>: Use the sample mean as the “optimal” cutpoint</li>
<li><code>oc_median</code>: Use the sample median as the “optimal” cutpoint</li>
<li><code>oc_youden_kernel</code>: Maximize the Youden-Index after kernel smoothing the distributions of the two classes</li>
<li><code>oc_youden_normal</code>: Maximize the Youden-Index parametrically assuming normally distributed data in both classes</li>
</ul>
</div>
<div id="metric-functions" class="section level2">
<h2>Metric functions</h2>
<p>The included metrics to be used with the minimization and maximization methods are:</p>
<ul>
<li><code>accuracy</code>: Fraction correctly classified</li>
<li><code>abs_d_sens_spec</code>: The absolute difference of sensitivity and specificity</li>
<li><code>abs_d_ppv_npv</code>: The absolute difference between positive predictive value (PPV) and negative predictive value (NPV)</li>
<li><code>roc01</code>: Distance to the point (0,1) on ROC space</li>
<li><code>cohens_kappa</code>: Cohen’s Kappa</li>
<li><code>sum_sens_spec</code>: sensitivity + specificity</li>
<li><code>sum_ppv_npv</code>: The sum of positive predictive value (PPV) and negative predictive value (NPV)</li>
<li><code>prod_sens_spec</code>: sensitivity * specificity</li>
<li><code>prod_ppv_npv</code>: The product of positive predictive value (PPV) and negative predictive value (NPV)</li>
<li><code>youden</code>: Youden- or J-Index = sensitivity + specificity - 1</li>
<li><code>odds_ratio</code>: (Diagnostic) odds ratio</li>
<li><code>risk_ratio</code>: risk ratio (relative risk)</li>
<li><code>p_chisquared</code>: The p-value of a chi-squared test on the confusion matrix</li>
<li><code>misclassification_cost</code>: The sum of the misclassification cost of false positives and false negatives. Additional arguments: cost_fp, cost_fn</li>
<li><code>total_utility</code>: The total utility of true / false positives / negatives. Additional arguments: utility_tp, utility_tn, cost_fp, cost_fn</li>
<li><code>F1_score</code>: The F1-score (2 * TP) / (2 * TP + FP + FN)</li>
<li><code>metric_constrain</code>: Maximize a selected metric given a minimal value of another selected metric</li>
<li><code>sens_constrain</code>: Maximize sensitivity given a minimal value of specificity</li>
<li><code>spec_constrain</code>: Maximize specificity given a minimal value of sensitivity</li>
<li><code>acc_constrain</code>: Maximize accuracy given a minimal value of sensitivity</li>
</ul>
<p>Furthermore, the following functions are included which can be used as metric functions but are more useful for plotting purposes, for example in <code>plot_cutpointr</code>, or for defining new metric functions: <code>tp</code>, <code>fp</code>, <code>tn</code>, <code>fn</code>, <code>tpr</code>, <code>fpr</code>, <code>tnr</code>, <code>fnr</code>, <code>false_omission_rate</code>, <code>false_discovery_rate</code>, <code>ppv</code>, <code>npv</code>, <code>precision</code>, <code>recall</code>, <code>sensitivity</code>, and <code>specificity</code>.</p>
<p>The inputs to the arguments <code>method</code> and <code>metric</code> are functions so that user-defined functions can easily be supplied instead of the built-in ones.</p>
</div>
<div id="separate-subgroups-and-bootstrapping" class="section level2">
<h2>Separate subgroups and bootstrapping</h2>
<p>Cutpoints can be separately estimated on subgroups that are defined by a third variable, <code>gender</code> in this case. Additionally, if <code>boot_runs</code> is larger than zero, <code>cutpointr</code> will carry out the usual cutpoint calculation on the full sample, just as before, and additionally on <code>boot_runs</code> bootstrap samples. This offers a way of gauging the out-of-sample performance of the cutpoint estimation method. If a subgroup is given, the bootstrapping is carried out separately for every subgroup, which is also reflected in the plots and output.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1"></a><span class="kw">set.seed</span>(<span class="dv">12</span>)</span>
<span id="cb14-2"><a href="#cb14-2"></a>opt_cut_b &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, <span class="dt">boot_runs =</span> <span class="dv">1000</span>)</span></code></pre></div>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1"></a>opt_cut_b</span></code></pre></div>
<pre><code>## # A tibble: 1 x 16
##   direction optimal_cutpoint method          sum_sens_spec      acc sensitivity
##   &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                   &lt;dbl&gt;    &lt;dbl&gt;       &lt;dbl&gt;
## 1 &gt;=                       2 maximize_metric       1.75179 0.864662    0.888889
##   specificity      AUC pos_class neg_class prevalence outcome predictor
##         &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;   &lt;chr&gt;    
## 1    0.862903 0.923779 yes       no         0.0676692 suicide dsi      
##   data               roc_curve                 boot                 
##   &lt;list&gt;             &lt;list&gt;                    &lt;list&gt;               
## 1 &lt;tibble [532 x 2]&gt; &lt;roc_cutpointr [13 x 10]&gt; &lt;tibble [1,000 x 23]&gt;</code></pre>
<p>The returned object has the additional column <code>boot</code> which is a nested tibble that includes the cutpoints per bootstrap sample along with the metric calculated using the function in <code>metric</code> and various default metrics. The metrics are suffixed by <code>_b</code> to indicate in-bag results or <code>_oob</code> to indicate out-of-bag results:</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1"></a>opt_cut<span class="op">$</span>boot</span></code></pre></div>
<pre><code>## [1] NA</code></pre>
<p>The summary and plots include additional elements that summarize or display the bootstrap results:</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1"></a><span class="kw">summary</span>(opt_cut)</span></code></pre></div>
<pre><code>## Method: maximize_metric 
## Predictor: dsi 
## Outcome: suicide 
## Direction: &gt;= 
## 
##     AUC   n n_pos n_neg
##  0.9238 532    36   496
## 
##  optimal_cutpoint youden    acc sensitivity specificity tp fn fp  tn
##                 2 0.7518 0.8647      0.8889      0.8629 32  4 68 428
## 
## Predictor summary: 
##     Data Min.   5% 1st Qu. Median      Mean 3rd Qu.  95% Max.       SD NAs
##  Overall    0 0.00       0      0 0.9210526       1 5.00   11 1.852714   0
##       no    0 0.00       0      0 0.6330645       0 4.00   10 1.412225   0
##      yes    0 0.75       4      5 4.8888889       6 9.25   11 2.549821   0</code></pre>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1"></a><span class="kw">plot</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Output of plot(opt_cut)" style="display: block; margin: auto;" /></p>
<div id="parallelized-bootstrapping" class="section level3">
<h3>Parallelized bootstrapping</h3>
<p>Using <code>foreach</code> and <code>doRNG</code> the bootstrapping can be parallelized easily. The <strong>doRNG</strong> package is being used to make the bootstrap sampling reproducible.</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1"></a><span class="kw">library</span>(doParallel)</span>
<span id="cb22-2"><a href="#cb22-2"></a><span class="kw">library</span>(doRNG) <span class="co"># provides registerDoRNG()</span></span>
<span id="cb22-3"><a href="#cb22-3"></a>cl &lt;-<span class="st"> </span><span class="kw">makeCluster</span>(<span class="dv">2</span>) <span class="co"># 2 cores</span></span>
<span id="cb22-4"><a href="#cb22-4"></a><span class="kw">registerDoParallel</span>(cl)</span>
<span id="cb22-5"><a href="#cb22-5"></a><span class="kw">registerDoRNG</span>(<span class="dv">12</span>) <span class="co"># Reproducible parallel loops using doRNG</span></span>
<span id="cb22-6"><a href="#cb22-6"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">pos_class =</span> <span class="st">&quot;yes&quot;</span>,</span>
<span id="cb22-7"><a href="#cb22-7"></a>                     <span class="dt">direction =</span> <span class="st">&quot;&gt;=&quot;</span>, <span class="dt">boot_runs =</span> <span class="dv">1000</span>, <span class="dt">allowParallel =</span> <span class="ot">TRUE</span>)</span>
<span id="cb22-8"><a href="#cb22-8"></a><span class="kw">stopCluster</span>(cl)</span>
<span id="cb22-9"><a href="#cb22-9"></a>opt_cut</span></code></pre></div>
</div>
</div>
</div>
<div id="more-robust-cutpoint-estimation-methods" class="section level1">
<h1>More robust cutpoint estimation methods</h1>
<div id="bootstrapped-cutpoints" class="section level2">
<h2>Bootstrapped cutpoints</h2>
<p>It has been shown that bagging can substantially improve performance of a wide range of types of models in regression as well as in classification tasks. This method is available for cutpoint estimation via the <code>maximize_boot_metric</code> and <code>minimize_boot_metric</code> functions. If one of these functions is used as <code>method</code>, <code>boot_cut</code> bootstrap samples are drawn, the cutpoint optimization is carried out in each one and a summary (e.g. the mean) of the resulting optimal cutpoints on the bootstrap samples is returned as the optimal cutpoint in <code>cutpointr</code>. Note that if bootstrap validation is run, i.e. if <code>boot_runs</code> is larger than zero, an outer bootstrap will be executed. In the bootstrap validation routine <code>boot_runs</code> bootstrap samples are generated and each one is again bootstrapped <code>boot_cut</code> times. This may lead to long run times, so activating the built-in parallelization may be advisable.</p>
<p>The advantages of bootstrapping the optimal cutpoint are that the procedure doesn’t possess parameters that have to be tuned, unlike the LOESS smoothing, that it doesn’t rely on assumptions, unlike the Normal method, and that it is applicable to any metric that can be used with <code>minimize_metric</code> or <code>maximize_metric</code>, unlike the Kernel method. Furthermore, just as Random Forests cannot be overfit by increasing the number of trees, the bootstrapped cutpoints cannot be overfit by running an excessive number of <code>boot_cut</code> repetitions.</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1"></a><span class="kw">set.seed</span>(<span class="dv">100</span>)</span>
<span id="cb23-2"><a href="#cb23-2"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, </span>
<span id="cb23-3"><a href="#cb23-3"></a>          <span class="dt">method =</span> maximize_boot_metric,</span>
<span id="cb23-4"><a href="#cb23-4"></a>          <span class="dt">boot_cut =</span> <span class="dv">200</span>, <span class="dt">summary_func =</span> mean,</span>
<span id="cb23-5"><a href="#cb23-5"></a>          <span class="dt">metric =</span> accuracy, <span class="dt">silent =</span> <span class="ot">TRUE</span>)</span></code></pre></div>
<pre><code>## # A tibble: 2 x 18
##   subgroup direction optimal_cutpoint method               accuracy      acc
##   &lt;chr&gt;    &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                   &lt;dbl&gt;    &lt;dbl&gt;
## 1 female   &gt;=                 5.73246 maximize_boot_metric 0.956633 0.956633
## 2 male     &gt;=                 8.41026 maximize_boot_metric 0.95     0.95    
##   sensitivity specificity      AUC pos_class neg_class prevalence outcome
##         &lt;dbl&gt;       &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;  
## 1    0.444444    0.994521 0.944647 yes       no         0.0688776 suicide
## 2    0.222222    1        0.861747 yes       no         0.0642857 suicide
##   predictor grouping data               roc_curve                boot 
##   &lt;chr&gt;     &lt;chr&gt;    &lt;list&gt;             &lt;list&gt;                   &lt;lgl&gt;
## 1 dsi       gender   &lt;tibble [392 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA   
## 2 dsi       gender   &lt;tibble [140 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA</code></pre>
</div>
<div id="loess-smoothing-for-selecting-a-cutpoint" class="section level2">
<h2>LOESS smoothing for selecting a cutpoint</h2>
<p>When using <code>maximize_metric</code> and <code>minimize_metric</code> the optimal cutpoint is selected by searching for the maximum or minimum of the metric function. For example, we may want to minimize the misclassification cost. Since false negatives (a suicide attempt was not anticipated) can be regarded as much more severe than false positives, we can set the cost of a false negative, <code>cost_fn</code>, for example to ten times the cost of a false positive.</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">method =</span> minimize_metric,</span>
<span id="cb25-2"><a href="#cb25-2"></a>                     <span class="dt">metric =</span> misclassification_cost, <span class="dt">cost_fp =</span> <span class="dv">1</span>, <span class="dt">cost_fn =</span> <span class="dv">10</span>)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
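<p><img src="" alt="Misclassification cost per cutpoint for each gender subgroup, as drawn by plot_metric(opt_cut)" style="display: block; margin: auto;" /></p>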
<p>As this “optimal” cutpoint may depend on minor differences between the possible cutoffs, smoothing of the function of metric values by cutpoint value might be desirable, especially in small samples. The <code>minimize_loess_metric</code> and <code>maximize_loess_metric</code> functions can be used to smooth the function so that the optimal cutpoint is selected based on the smoothed metric values. Options to modify the smoothing, which is implemented using <code>loess.as</code> from the <strong>fANCOVA</strong> package, include:</p>
<ul>
<li><code>criterion</code>: the criterion for automatic smoothing parameter selection: “aicc” denotes bias-corrected AIC criterion, “gcv” denotes generalized cross-validation.</li>
<li><code>degree</code>: the degree of the local polynomials to be used. It can be 0, 1 or 2.</li>
<li><code>family</code>: if “gaussian” fitting is by least-squares, and if “symmetric” a re-descending M estimator is used with Tukey’s biweight function.</li>
<li><code>user.span</code>: the user-defined parameter which controls the degree of smoothing.</li>
</ul>
<p>Using parameters for the LOESS smoothing of <code>criterion = &quot;aicc&quot;</code>, <code>degree = 2</code>, <code>family = &quot;symmetric&quot;</code>, and <code>user.span = 0.7</code> we get the following smoothed versions of the above metrics:</p>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, </span>
<span id="cb29-2"><a href="#cb29-2"></a>                     <span class="dt">method =</span> minimize_loess_metric,</span>
<span id="cb29-3"><a href="#cb29-3"></a>                     <span class="dt">criterion =</span> <span class="st">&quot;aicc&quot;</span>, <span class="dt">family =</span> <span class="st">&quot;symmetric&quot;</span>, </span>
<span id="cb29-4"><a href="#cb29-4"></a>                     <span class="dt">degree =</span> <span class="dv">2</span>, <span class="dt">user.span =</span> <span class="fl">0.7</span>,</span>
<span id="cb29-5"><a href="#cb29-5"></a>                     <span class="dt">metric =</span> misclassification_cost, <span class="dt">cost_fp =</span> <span class="dv">1</span>, <span class="dt">cost_fn =</span> <span class="dv">10</span>)</span></code></pre></div>
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Plot of the LOESS-smoothed misclassification cost by cutpoint value for each gender subgroup" style="display: block; margin: auto;" /></p>
<p>The optimal cutpoint for the female subgroup changes to 3. Note, though, that there are no reliable rules for selecting the “best” smoothing parameters. Notably, the LOESS smoothing is sensitive to the number of unique cutpoints. A large number of unique cutpoints generally leads to a more volatile curve of metric values by cutpoint value, even after smoothing. Thus, the curve tends to be undersmoothed in that scenario. The unsmoothed metric values are returned in <code>opt_cut$roc_curve</code> in the column <code>m_unsmoothed</code>.</p>
</div>
<div id="smoothing-via-generalized-additive-models-for-selecting-a-cutpoint" class="section level2">
<h2>Smoothing via Generalized Additive Models for selecting a cutpoint</h2>
<p>In a similar fashion, the function of metric values per cutpoint can be smoothed using Generalized Additive Models with smooth terms. Internally, <code>mgcv::gam</code> carries out the smoothing which can be customized via the arguments <code>formula</code> and <code>optimizer</code>, see <code>help(&quot;gam&quot;, package = &quot;mgcv&quot;)</code>. Most importantly, the GAM can be specified by altering the default formula, for example the smoothing function could be configured to apply cubic regression splines (<code>&quot;cr&quot;</code>) as the smooth term. As the <code>suicide</code> data has only very few unique cutpoints, it is not very suitable for showcasing the GAM smoothing, so we will use two classes of the <code>iris</code> data here. In this case, the purely empirical method and the GAM smoothing lead to identical cutpoints, but in practice the GAM smoothing tends to be more robust, especially with larger data. An attractive feature of the GAM smoothing is that the default values tend to work quite well and usually require no tuning, eliminating researcher degrees of freedom.</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1"></a><span class="kw">library</span>(ggplot2)</span>
<span id="cb31-2"><a href="#cb31-2"></a>exdat &lt;-<span class="st"> </span>iris</span>
<span id="cb31-3"><a href="#cb31-3"></a>exdat &lt;-<span class="st"> </span>exdat[exdat<span class="op">$</span>Species <span class="op">!=</span><span class="st"> &quot;setosa&quot;</span>, ]</span>
<span id="cb31-4"><a href="#cb31-4"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(exdat, Petal.Length, Species,</span>
<span id="cb31-5"><a href="#cb31-5"></a>                     <span class="dt">method =</span> minimize_gam_metric,</span>
<span id="cb31-6"><a href="#cb31-6"></a>                     <span class="dt">formula =</span> m <span class="op">~</span><span class="st"> </span><span class="kw">s</span>(x.sorted, <span class="dt">bs =</span> <span class="st">&quot;cr&quot;</span>),</span>
<span id="cb31-7"><a href="#cb31-7"></a>                     <span class="dt">metric =</span> abs_d_sens_spec)</span></code></pre></div>
<pre><code>## Assuming the positive class is virginica</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
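<p><img src="" alt="Plot of the GAM-smoothed absolute difference of sensitivity and specificity by cutpoint value for the iris data" style="display: block; margin: auto;" /></p>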
</div>
<div id="spline-smoothing-for-selecting-a-cutpoint" class="section level2">
<h2>Spline smoothing for selecting a cutpoint</h2>
<p>Again in the same fashion the function of metric values per cutpoint can be smoothed using smoothing splines. By default, the number of knots is automatically chosen using the <code>cutpoint_knots</code> function. That function uses <code>stats::.nknots.smspl</code>, which is the default in <code>stats::smooth.spline</code> to pick the number of knots.</p>
<p>Alternatively, the number of knots can be set manually and also the other smoothing parameters of <code>stats::smooth.spline</code> can be set as desired. For details see <code>?maximize_spline_metric</code>.</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, </span>
<span id="cb35-2"><a href="#cb35-2"></a>                     <span class="dt">method =</span> minimize_spline_metric, <span class="dt">spar =</span> <span class="fl">0.4</span>,</span>
<span id="cb35-3"><a href="#cb35-3"></a>                     <span class="dt">metric =</span> misclassification_cost, <span class="dt">cost_fp =</span> <span class="dv">1</span>, <span class="dt">cost_fn =</span> <span class="dv">10</span>)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## nknots: 10
## nknots: 10</code></pre>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
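<p><img src="" alt="Plot of the spline-smoothed misclassification cost by cutpoint value for each gender subgroup" style="display: block; margin: auto;" /></p>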
<div id="parametric-method-assuming-normality" class="section level3">
<h3>Parametric method assuming normality</h3>
<p>The Normal method in <code>oc_youden_normal</code> is a parametric method for maximizing the Youden-Index or equivalently the sum of <span class="math inline">\(Se\)</span> and <span class="math inline">\(Sp\)</span>. It relies on the assumption that the predictor for both the negative and positive observations is normally distributed. In that case it can be shown that</p>
<p><span class="math display">\[c^* = \frac{(\mu_P \sigma_N^2 - \mu_N \sigma_P^2) - \sigma_N \sigma_P \sqrt{(\mu_N - \mu_P)^2 + (\sigma_N^2 - \sigma_P^2) \log(\sigma_N^2 / \sigma_P^2)}}{\sigma_N^2 - \sigma_P^2}\]</span></p>
<p>provides the optimal cutpoint <span class="math inline">\(c^*\)</span> that maximizes the Youden-Index, where the negative class is normally distributed as <span class="math inline">\(N(\mu_N, \sigma_N^2)\)</span> and the positive class is independently normally distributed as <span class="math inline">\(N(\mu_P, \sigma_P^2)\)</span>. If <span class="math inline">\(\sigma_N\)</span> and <span class="math inline">\(\sigma_P\)</span> are equal, the expression can be simplified to <span class="math inline">\(c^* = \frac{\mu_N + \mu_P}{2}\)</span>. However, the <code>oc_youden_normal</code> method in cutpointr always assumes unequal standard deviations. Since this method does not select a cutpoint from the observed predictor values, it is questionable which values for <span class="math inline">\(Se\)</span> and <span class="math inline">\(Sp\)</span> should be reported. Here, the Youden-Index can be calculated as</p>
<p><span class="math display">\[J = \Phi(\frac{c^* - \mu_N}{\sigma_N}) - \Phi(\frac{c^* - \mu_P}{\sigma_P})\]</span></p>
<p>if the assumption of normality holds. However, since there exist several methods that do not select cutpoints from the available observations and to unify the reporting of metrics for these methods, <strong>cutpointr</strong> reports all metrics, e.g. <span class="math inline">\(Se\)</span> and <span class="math inline">\(Sp\)</span>, based on the empirical observations.</p>
<div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">method =</span> oc_youden_normal)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## # A tibble: 2 x 18
##   subgroup direction optimal_cutpoint method           sum_sens_spec      acc
##   &lt;chr&gt;    &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                    &lt;dbl&gt;    &lt;dbl&gt;
## 1 female   &gt;=                 2.47775 oc_youden_normal       1.71618 0.895408
## 2 male     &gt;=                 3.17226 oc_youden_normal       1.54453 0.864286
##   sensitivity specificity      AUC pos_class neg_class prevalence outcome
##         &lt;dbl&gt;       &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;  
## 1    0.814815    0.901370 0.944647 yes       no         0.0688776 suicide
## 2    0.666667    0.877863 0.861747 yes       no         0.0642857 suicide
##   predictor grouping data               roc_curve                boot 
##   &lt;chr&gt;     &lt;chr&gt;    &lt;list&gt;             &lt;list&gt;                   &lt;lgl&gt;
## 1 dsi       gender   &lt;tibble [392 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA   
## 2 dsi       gender   &lt;tibble [140 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA</code></pre>
</div>
<div id="nonparametric-kernel-method" class="section level3">
<h3>Nonparametric kernel method</h3>
<p>A nonparametric alternative is the Kernel method <span class="citation">(Fluss et al., 2005)</span>. Here, the empirical distribution functions are smoothed using the Gaussian kernel functions <span class="math inline">\(\hat{F}_N(t) = \frac{1}{n} \sum^n_{i=1} \Phi(\frac{t - y_i}{h_y})\)</span> and <span class="math inline">\(\hat{G}_P(t) = \frac{1}{m} \sum^m_{i=1} \Phi(\frac{t - x_i}{h_x})\)</span> for the negative and positive classes respectively. Following Silverman’s plug-in “rule of thumb” the bandwidths are selected as <span class="math inline">\(h_y = 0.9 * \min\{s_y, iqr_y/1.34\} * n^{-0.2}\)</span> and <span class="math inline">\(h_x = 0.9 * \min\{s_x, iqr_x/1.34\} * m^{-0.2}\)</span> where <span class="math inline">\(s\)</span> is the sample standard deviation and <span class="math inline">\(iqr\)</span> is the interquartile range. It has been demonstrated that AUC estimation is rather insensitive to the choice of the bandwidth procedure <span class="citation">(Faraggi &amp; Reiser, 2002)</span> and thus the plug-in bandwidth estimator has also been recommended for cutpoint estimation. The <code>oc_youden_kernel</code> function in <strong>cutpointr</strong> uses a Gaussian kernel and the direct plug-in method for selecting the bandwidths. The kernel smoothing is done via the <code>bkde</code> function from the <strong>KernSmooth</strong> package <span class="citation">(Wand, 2013)</span>.</p>
<p>Again, there is a way to calculate the Youden-Index from the results of this method <span class="citation">(Fluss et al., 2005)</span> which is</p>
<p><span class="math display">\[\hat{J} = \max_c \{\hat{F}_N(c) - \hat{G}_P(c) \}\]</span></p>
<p>but as before we prefer to report all metrics based on applying the cutpoint that was estimated using the Kernel method to the empirical observations.</p>
<div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">method =</span> oc_youden_kernel)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## # A tibble: 2 x 18
##   subgroup direction optimal_cutpoint method           sum_sens_spec      acc
##   &lt;chr&gt;    &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                    &lt;dbl&gt;    &lt;dbl&gt;
## 1 female   &gt;=                 1.18128 oc_youden_kernel       1.80812 0.885204
## 2 male     &gt;=                 1.31636 oc_youden_kernel       1.58694 0.807143
##   sensitivity specificity      AUC pos_class neg_class prevalence outcome
##         &lt;dbl&gt;       &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;  
## 1    0.925926    0.882192 0.944647 yes       no         0.0688776 suicide
## 2    0.777778    0.809160 0.861747 yes       no         0.0642857 suicide
##   predictor grouping data               roc_curve                boot 
##   &lt;chr&gt;     &lt;chr&gt;    &lt;list&gt;             &lt;list&gt;                   &lt;lgl&gt;
## 1 dsi       gender   &lt;tibble [392 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA   
## 2 dsi       gender   &lt;tibble [140 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA</code></pre>
</div>
</div>
</div>
<div id="additional-features" class="section level1">
<h1>Additional features</h1>
<div id="calculating-only-the-roc-curve" class="section level2">
<h2>Calculating only the ROC curve</h2>
<p>When running <code>cutpointr</code>, a ROC curve is by default returned in the column <code>roc_curve</code>. This ROC curve can be plotted using <code>plot_roc</code>. Alternatively, if only the ROC curve is desired and no cutpoint needs to be calculated, the ROC curve can be created using <code>roc()</code> and plotted using <code>plot_cutpointr</code>. The <code>roc</code> function, unlike <code>cutpointr</code>, does not determine <code>direction</code>, <code>pos_class</code> or <code>neg_class</code> automatically.</p>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1"></a>roc_curve &lt;-<span class="st"> </span><span class="kw">roc</span>(<span class="dt">data =</span> suicide, <span class="dt">x =</span> dsi, <span class="dt">class =</span> suicide,</span>
<span id="cb48-2"><a href="#cb48-2"></a>    <span class="dt">pos_class =</span> <span class="st">&quot;yes&quot;</span>, <span class="dt">neg_class =</span> <span class="st">&quot;no&quot;</span>, <span class="dt">direction =</span> <span class="st">&quot;&gt;=&quot;</span>)</span>
<span id="cb48-3"><a href="#cb48-3"></a><span class="kw">auc</span>(roc_curve)</span></code></pre></div>
<pre><code>## [1] 0.9237791</code></pre>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1"></a><span class="kw">head</span>(roc_curve)</span></code></pre></div>
<pre><code>## # A tibble: 6 x 9
##   x.sorted    tp    fp    tn    fn    tpr   tnr     fpr   fnr
##      &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt;  &lt;dbl&gt; &lt;dbl&gt;   &lt;dbl&gt; &lt;dbl&gt;
## 1      Inf     0     0   496    36 0      1     0       1    
## 2       11     1     0   496    35 0.0278 1     0       0.972
## 3       10     2     1   495    34 0.0556 0.998 0.00202 0.944
## 4        9     3     1   495    33 0.0833 0.998 0.00202 0.917
## 5        8     4     1   495    32 0.111  0.998 0.00202 0.889
## 6        7     7     1   495    29 0.194  0.998 0.00202 0.806</code></pre>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1"></a><span class="kw">plot_roc</span>(roc_curve)</span></code></pre></div>
<p><img src="" alt="ROC curve of the dsi predictor on the suicide data" style="display: block; margin: auto;" /></p>
</div>
<div id="midpoints" class="section level2">
<h2>Midpoints</h2>
<p>So far - which is the default in <code>cutpointr</code> - we have considered all unique values of the predictor as possible cutpoints. An alternative could be to use a sequence of equidistant values instead, for example in the case of the <code>suicide</code> data all integers in <span class="math inline">\([0, 10]\)</span>. However, with very sparse data and small intervals between the candidate cutpoints (i.e. a ‘dense’ sequence like <code>seq(0, 10, by = 0.01)</code>) this leads to the uninformative evaluation of large ranges of cutpoints that all result in the same metric value. A more elegant alternative, not only for the case of sparse data, that is supported by <strong>cutpointr</strong> is the use of a mean value of the optimal cutpoint and the next lowest (if <code>direction = &quot;&gt;=&quot;</code>) or the next highest (if <code>direction = &quot;&lt;=&quot;</code>) predictor value in the data. The result is an optimal cutpoint that is equal to the cutpoint that would be obtained using an infinitely dense sequence of candidate cutpoints and is thus usually more efficient computationally. This behavior can be activated by setting <code>use_midpoints = TRUE</code> (the default is <code>use_midpoints = FALSE</code>). If we use this setting, we obtain an optimal cutpoint of 1.5 for the complete sample on the <code>suicide</code> data instead of 2 when maximizing the sum of sensitivity and specificity.</p>
<p>Assume the following small data set:</p>
<div class="sourceCode" id="cb53"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb53-1"><a href="#cb53-1"></a>dat &lt;-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">outcome =</span> <span class="kw">c</span>(<span class="st">&quot;neg&quot;</span>, <span class="st">&quot;neg&quot;</span>, <span class="st">&quot;neg&quot;</span>, <span class="st">&quot;pos&quot;</span>, <span class="st">&quot;pos&quot;</span>, <span class="st">&quot;pos&quot;</span>, <span class="st">&quot;pos&quot;</span>),</span>
<span id="cb53-2"><a href="#cb53-2"></a>                  <span class="dt">pred    =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>, <span class="dv">3</span>, <span class="dv">8</span>, <span class="dv">11</span>, <span class="dv">11</span>, <span class="dv">12</span>))</span></code></pre></div>
<p>Since the distance of the optimal cutpoint (8) to the next lowest observation (3) is rather large we arrive at a range of possible cutpoints that all maximize the metric. In the case of this kind of sparseness it might for example be desirable to classify a new observation with a predictor value of 4 as belonging to the negative class. If <code>use_midpoints</code> is set to <code>TRUE</code>, the mean of the optimal cutpoint and the next lowest observation is returned as the optimal cutpoint, if direction is <code>&gt;=</code>. The mean of the optimal cutpoint and the next highest observation is returned as the optimal cutpoint, if <code>direction = &quot;&lt;=&quot;</code>.</p>
<div class="sourceCode" id="cb54"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(dat, <span class="dt">x =</span> pred, <span class="dt">class =</span> outcome, <span class="dt">use_midpoints =</span> <span class="ot">TRUE</span>)</span></code></pre></div>
<pre><code>## Assuming the positive class is pos</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb57-1"><a href="#cb57-1"></a><span class="kw">plot_x</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Distribution of the predictor values per class with the optimal cutpoint marked" style="display: block; margin: auto;" /></p>
<p>A simulation demonstrates more clearly that setting <code>use_midpoints = TRUE</code> avoids biasing the cutpoints. To simulate the bias of the metric functions, the predictor values of both classes were drawn from normal distributions with constant standard deviations of 10, a constant mean of the negative class of 100 and higher mean values of the positive class that are selected in such a way that optimal Youden-Index values of 0.2, 0.4, 0.6, and 0.8 result in the population. Samples of 9 different sizes were drawn and the cutpoints that maximize the Youden-Index were estimated. The simulation was repeated 10000 times. As can be seen by the mean error, <code>use_midpoints = TRUE</code> eliminates the bias that is introduced by otherwise selecting the value of an observation as the optimal cutpoint. If <code>direction = &quot;&gt;=&quot;</code>, as in this case, the observation that represents the optimal cutpoint is the highest possible cutpoint that leads to the optimal metric value and thus the biases are positive. The methods <code>oc_youden_normal</code> and <code>oc_youden_kernel</code> are always unbiased, as they don’t select a cutpoint based on the ROC-curve or the function of metric values per cutpoint.</p>
<pre><code>## 
## Attaching package: &#39;dplyr&#39;</code></pre>
<pre><code>## The following objects are masked from &#39;package:stats&#39;:
## 
##     filter, lag</code></pre>
<pre><code>## The following objects are masked from &#39;package:base&#39;:
## 
##     intersect, setdiff, setequal, union</code></pre>
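<p><img src="" alt="Simulation results: mean error of the estimated cutpoints by sample size, with and without use_midpoints" style="display: block; margin: auto;" /></p>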
</div>
<div id="finding-all-cutpoints-with-acceptable-performance" class="section level2">
<h2>Finding all cutpoints with acceptable performance</h2>
<p>By default, most packages only return the “best” cutpoint and disregard other cutpoints with quite similar performance, even if the performance differences are minuscule. <strong>cutpointr</strong> makes this process more explicit via the <code>tol_metric</code> argument. For example, if all cutpoints are of interest that achieve at least an accuracy within <code>0.05</code> of the optimally achievable accuracy, <code>tol_metric</code> can be set to <code>0.05</code> and also those cutpoints will be returned.</p>
<p>In the case of the <code>suicide</code> data and when maximizing the sum of sensitivity and specificity, empirically the cutpoints 1 and 2 lead to quite similar performances. If <code>tol_metric</code> is set to <code>0.05</code>, both will be returned.</p>
<div class="sourceCode" id="cb61"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb61-1"><a href="#cb61-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, <span class="dt">metric =</span> sum_sens_spec, </span>
<span id="cb61-2"><a href="#cb61-2"></a>                     <span class="dt">tol_metric =</span> <span class="fl">0.05</span>, <span class="dt">break_ties =</span> c)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## Multiple optimal cutpoints found, applying break_ties.</code></pre>
<div class="sourceCode" id="cb65"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb65-1"><a href="#cb65-1"></a><span class="kw">library</span>(tidyr)</span>
<span id="cb65-2"><a href="#cb65-2"></a>opt_cut <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb65-3"><a href="#cb65-3"></a><span class="st">    </span><span class="kw">select</span>(optimal_cutpoint, sum_sens_spec) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb65-4"><a href="#cb65-4"></a><span class="st">    </span><span class="kw">unnest</span>(<span class="dt">cols =</span> <span class="kw">c</span>(optimal_cutpoint, sum_sens_spec))</span></code></pre></div>
<pre><code>## # A tibble: 2 x 2
##   optimal_cutpoint sum_sens_spec
##              &lt;dbl&gt;         &lt;dbl&gt;
## 1                2          1.75
## 2                1          1.70</code></pre>
</div>
<div id="manual-and-mean-median-cutpoints" class="section level2">
<h2>Manual and mean / median cutpoints</h2>
<p>Using the <code>oc_manual</code> function the optimal cutpoint will not be determined based on, for example, a metric but is instead set manually using the <code>cutpoint</code> argument. This is useful for supplying and evaluating cutpoints that were found in the literature or in other external sources.</p>
<p>The <code>oc_manual</code> function could also be used to set the cutpoint to the sample mean using <code>cutpoint = mean(data$x)</code>. However, this may introduce bias into the bootstrap validation procedure, since the actual mean of the population is not known and thus the mean to be used as the cutpoint should be automatically determined in every resample. To do so, the <code>oc_mean</code> and <code>oc_median</code> functions can be used.</p>
<div class="sourceCode" id="cb67"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb67-1"><a href="#cb67-1"></a><span class="kw">set.seed</span>(<span class="dv">100</span>)</span>
<span id="cb67-2"><a href="#cb67-2"></a>opt_cut_manual &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, <span class="dt">method =</span> oc_manual, </span>
<span id="cb67-3"><a href="#cb67-3"></a>                       <span class="dt">cutpoint =</span> <span class="kw">mean</span>(suicide<span class="op">$</span>dsi), <span class="dt">boot_runs =</span> <span class="dv">1000</span>)</span>
<span id="cb67-4"><a href="#cb67-4"></a><span class="kw">set.seed</span>(<span class="dv">100</span>)</span>
<span id="cb67-5"><a href="#cb67-5"></a>opt_cut_mean &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, <span class="dt">method =</span> oc_mean, <span class="dt">boot_runs =</span> <span class="dv">1000</span>)</span></code></pre></div>
</div>
<div id="nonstandard-evaluation-via-tidyeval" class="section level2">
<h2>Nonstandard evaluation via tidyeval</h2>
<p>The arguments to <code>cutpointr</code> do not need to be enclosed in quotes. This is possible thanks to nonstandard evaluation of the arguments, which are evaluated on <code>data</code>.</p>
<p>Functions that use nonstandard evaluation are often not suitable for programming with. The use of nonstandard evaluation may lead to scoping problems and subsequent obvious as well as possibly subtle errors. <strong>cutpointr</strong> uses tidyeval internally and accordingly the same rules as for programming with <code>dplyr</code> apply. Arguments can be unquoted with <code>!!</code>:</p>
<div class="sourceCode" id="cb68"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb68-1"><a href="#cb68-1"></a>myvar &lt;-<span class="st"> &quot;dsi&quot;</span></span>
<span id="cb68-2"><a href="#cb68-2"></a><span class="kw">cutpointr</span>(suicide, <span class="op">!!</span>myvar, suicide)</span></code></pre></div>
</div>
<div id="roc-curve-and-optimal-cutpoint-for-multiple-variables" class="section level2">
<h2>ROC curve and optimal cutpoint for multiple variables</h2>
<p>Alternatively, we can map the standard evaluation version <code>cutpointr</code> to the column names. If <code>direction</code> and / or <code>pos_class</code> and <code>neg_class</code> are unspecified, these parameters will automatically be determined by <strong>cutpointr</strong> so that the AUC values for all variables will be <span class="math inline">\(&gt; 0.5\)</span>.</p>
<p>We could do this manually, e.g. using <code>purrr::map</code>, but to make this task more convenient <code>multi_cutpointr</code> can be used to achieve the same result. It maps multiple predictor columns to <code>cutpointr</code>, by default all numeric columns except for the class column.</p>
<div class="sourceCode" id="cb69"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb69-1"><a href="#cb69-1"></a>mcp &lt;-<span class="st"> </span><span class="kw">multi_cutpointr</span>(suicide, <span class="dt">class =</span> suicide, <span class="dt">pos_class =</span> <span class="st">&quot;yes&quot;</span>, </span>
<span id="cb69-2"><a href="#cb69-2"></a>                <span class="dt">use_midpoints =</span> <span class="ot">TRUE</span>, <span class="dt">silent =</span> <span class="ot">TRUE</span>) </span>
<span id="cb69-3"><a href="#cb69-3"></a><span class="kw">summary</span>(mcp)</span></code></pre></div>
<pre><code>## Method: maximize_metric 
## Predictor: age, dsi 
## Outcome: suicide 
## 
## Predictor: age 
## -------------------------------------------------------------------------------- 
##  direction    AUC   n n_pos n_neg
##         &lt;= 0.5257 532    36   496
## 
##  optimal_cutpoint sum_sens_spec    acc sensitivity specificity tp fn  fp tn
##              55.5        1.1154 0.1992      0.9722      0.1431 35  1 425 71
## 
## Predictor summary: 
##     Data Min. 5% 1st Qu. Median    Mean 3rd Qu.   95% Max.      SD NAs
##  Overall   18 19      24   28.0 34.1259   41.25 65.00   83 15.0542   0
##       no   18 19      24   28.0 34.2218   41.25 65.50   83 15.1857   0
##      yes   18 18      22   27.5 32.8056   41.25 54.25   69 13.2273   0
## 
## Predictor: dsi 
## -------------------------------------------------------------------------------- 
##  direction    AUC   n n_pos n_neg
##         &gt;= 0.9238 532    36   496
## 
##  optimal_cutpoint sum_sens_spec    acc sensitivity specificity tp fn fp  tn
##               1.5        1.7518 0.8647      0.8889      0.8629 32  4 68 428
## 
## Predictor summary: 
##     Data Min.   5% 1st Qu. Median   Mean 3rd Qu.  95% Max.     SD NAs
##  Overall    0 0.00       0      0 0.9211       1 5.00   11 1.8527   0
##       no    0 0.00       0      0 0.6331       0 4.00   10 1.4122   0
##      yes    0 0.75       4      5 4.8889       6 9.25   11 2.5498   0</code></pre>
</div>
<div id="accessing-data-roc_curve-and-boot" class="section level2">
<h2>Accessing <code>data</code>, <code>roc_curve</code>, and <code>boot</code></h2>
<p>The object returned by <code>cutpointr</code> is of the classes <code>cutpointr</code>, <code>tbl_df</code>, <code>tbl</code>, and <code>data.frame</code>. Thus, it can be handled like a usual data frame. The columns <code>data</code>, <code>roc_curve</code>, and <code>boot</code> consist of nested data frames, which means that these are list columns whose elements are data frames. They can either be accessed using <code>[</code> or by using functions from the tidyverse. If subgroups were given, the output contains one row per subgroup and the function that accesses the data should be mapped to every row or the data should be grouped by subgroup.</p>
<div class="sourceCode" id="cb71"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb71-1"><a href="#cb71-1"></a><span class="kw">set.seed</span>(<span class="dv">123</span>)</span>
<span id="cb71-2"><a href="#cb71-2"></a>opt_cut_b_g &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">boot_runs =</span> <span class="dv">1000</span>)</span></code></pre></div>
<div class="sourceCode" id="cb72"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb72-1"><a href="#cb72-1"></a><span class="co"># Using dplyr and tidyr</span></span>
<span id="cb72-2"><a href="#cb72-2"></a><span class="kw">library</span>(tidyr)</span>
<span id="cb72-3"><a href="#cb72-3"></a>opt_cut_b_g <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb72-4"><a href="#cb72-4"></a><span class="st">  </span><span class="kw">group_by</span>(subgroup) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb72-5"><a href="#cb72-5"></a><span class="st">  </span><span class="kw">select</span>(subgroup, boot) <span class="op">%&gt;%</span></span>
<span id="cb72-6"><a href="#cb72-6"></a><span class="st">  </span><span class="kw">unnest</span>(<span class="dt">cols =</span> boot) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb72-7"><a href="#cb72-7"></a><span class="st">  </span><span class="kw">summarise</span>(<span class="dt">sd_oc_boot =</span> <span class="kw">sd</span>(optimal_cutpoint),</span>
<span id="cb72-8"><a href="#cb72-8"></a>            <span class="dt">m_oc_boot  =</span> <span class="kw">mean</span>(optimal_cutpoint),</span>
<span id="cb72-9"><a href="#cb72-9"></a>            <span class="dt">m_acc_oob  =</span> <span class="kw">mean</span>(acc_oob))</span></code></pre></div>
<pre><code>## # A tibble: 2 x 4
##   subgroup sd_oc_boot m_oc_boot m_acc_oob
## * &lt;chr&gt;         &lt;dbl&gt;     &lt;dbl&gt;     &lt;dbl&gt;
## 1 female        0.766      2.17     0.880
## 2 male          1.51       2.92     0.806</code></pre>
</div>
<div id="adding-metrics-to-the-result-of-cutpointr-or-roc" class="section level2">
<h2>Adding metrics to the result of cutpointr() or roc()</h2>
<p>By default, the output of <code>cutpointr</code> includes the optimized metric and several other metrics. The <code>add_metric</code> function adds further metrics. Here, we’re adding the negative predictive value (NPV) and the positive predictive value (PPV) at the optimal cutpoint per subgroup:</p>
<div class="sourceCode" id="cb74"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb74-1"><a href="#cb74-1"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">metric =</span> youden, <span class="dt">silent =</span> <span class="ot">TRUE</span>) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb74-2"><a href="#cb74-2"></a><span class="st">    </span><span class="kw">add_metric</span>(<span class="kw">list</span>(ppv, npv)) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb74-3"><a href="#cb74-3"></a><span class="st">    </span><span class="kw">select</span>(subgroup, optimal_cutpoint, youden, ppv, npv)</span></code></pre></div>
<pre><code>## # A tibble: 2 x 5
##   subgroup optimal_cutpoint   youden      ppv      npv
##   &lt;chr&gt;               &lt;dbl&gt;    &lt;dbl&gt;    &lt;dbl&gt;    &lt;dbl&gt;
## 1 female                  2 0.808118 0.367647 0.993827
## 2 male                    3 0.625106 0.259259 0.982301</code></pre>
<p>In the same fashion, additional metric columns can be added to a <code>roc_cutpointr</code> object:</p>
<div class="sourceCode" id="cb76"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb76-1"><a href="#cb76-1"></a><span class="kw">roc</span>(<span class="dt">data =</span> suicide, <span class="dt">x =</span> dsi, <span class="dt">class =</span> suicide, <span class="dt">pos_class =</span> <span class="st">&quot;yes&quot;</span>,</span>
<span id="cb76-2"><a href="#cb76-2"></a>    <span class="dt">neg_class =</span> <span class="st">&quot;no&quot;</span>, <span class="dt">direction =</span> <span class="st">&quot;&gt;=&quot;</span>) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb76-3"><a href="#cb76-3"></a><span class="st">  </span><span class="kw">add_metric</span>(<span class="kw">list</span>(cohens_kappa, F1_score)) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb76-4"><a href="#cb76-4"></a><span class="st">  </span><span class="kw">select</span>(x.sorted, tp, fp, tn, fn, cohens_kappa, F1_score) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb76-5"><a href="#cb76-5"></a><span class="st">  </span><span class="kw">head</span>()</span></code></pre></div>
<pre><code>## # A tibble: 6 x 7
##   x.sorted    tp    fp    tn    fn cohens_kappa F1_score
##      &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt;        &lt;dbl&gt;    &lt;dbl&gt;
## 1      Inf     0     0   496    36       0        0     
## 2       11     1     0   496    35       0.0506   0.0541
## 3       10     2     1   495    34       0.0931   0.103 
## 4        9     3     1   495    33       0.138    0.15  
## 5        8     4     1   495    32       0.182    0.195 
## 6        7     7     1   495    29       0.301    0.318</code></pre>
</div>
<div id="user-defined-functions" class="section level2">
<h2>User-defined functions</h2>
<div id="method" class="section level3">
<h3>method</h3>
<p>User-defined functions can be supplied to <code>method</code>, which is the function that is responsible for returning the optimal cutpoint. To define a new method function, create a function that may take as input(s):</p>
<ul>
<li><code>data</code>: A <code>data.frame</code> or <code>tbl_df</code></li>
<li><code>x</code>: (character) The name of the predictor variable</li>
<li><code>class</code>: (character) The name of the class variable</li>
<li><code>metric_func</code>: A function for calculating a metric, e.g. accuracy. Note that the method function does not necessarily have to accept this argument</li>
<li><code>pos_class</code>: The positive class</li>
<li><code>neg_class</code>: The negative class</li>
<li><code>direction</code>: <code>&quot;&gt;=&quot;</code> if the positive class has higher x values, <code>&quot;&lt;=&quot;</code> otherwise</li>
<li><code>tol_metric</code>: (numeric) In the built-in methods, all cutpoints will be returned that lead to a metric value in the interval [m_max - tol_metric, m_max + tol_metric] where m_max is the maximum achievable metric value. This can be used to return multiple decent cutpoints and to avoid floating-point problems.</li>
<li><code>use_midpoints</code>: (logical) In the built-in methods, if TRUE (default FALSE) the returned optimal cutpoint will be the mean of the optimal cutpoint and the next highest observation (for direction = “&gt;=”) or the next lowest observation (for direction = “&lt;=”) which avoids biasing the optimal cutpoint.</li>
<li><code>...</code>: Further arguments that are passed to <code>metric</code> or that can be captured inside of <code>method</code></li>
</ul>
<p>The function should return a data frame or tibble with one row, the column <code>optimal_cutpoint</code>, and an optional column with an arbitrary name with the metric value at the optimal cutpoint.</p>
<p>For example, a function for choosing the cutpoint as the mean of the independent variable could look like this:</p>
<div class="sourceCode" id="cb78"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb78-1"><a href="#cb78-1"></a>mean_cut &lt;-<span class="st"> </span><span class="cf">function</span>(data, x, ...) {</span>
<span id="cb78-2"><a href="#cb78-2"></a>    oc &lt;-<span class="st"> </span><span class="kw">mean</span>(data[[x]])</span>
<span id="cb78-3"><a href="#cb78-3"></a>    <span class="kw">return</span>(<span class="kw">data.frame</span>(<span class="dt">optimal_cutpoint =</span> oc))</span>
<span id="cb78-4"><a href="#cb78-4"></a>}</span></code></pre></div>
<p>If a <code>method</code> function does not return a metric column, the default <code>sum_sens_spec</code>, the sum of sensitivity and specificity, is returned as the extra metric column in addition to accuracy, sensitivity and specificity.</p>
<p>Some <code>method</code> functions that make use of the additional arguments (that are captured by <code>...</code>) are already included in <strong>cutpointr</strong>, see the list at the top. Since these functions are arguments to <code>cutpointr</code> their code can be accessed by simply typing their name, see for example <code>oc_youden_normal</code>.</p>
</div>
<div id="metric" class="section level3">
<h3>metric</h3>
<p>User-defined <code>metric</code> functions can be used as well. They are mainly useful in conjunction with <code>method = maximize_metric</code>, <code>method = minimize_metric</code>, or one of the other minimization and maximization functions. In case of a different <code>method</code> function, <code>metric</code> will only be used as the main out-of-bag metric when plotting the result. The <code>metric</code> function should accept the following inputs as vectors:</p>
<ul>
<li><code>tp</code>: Vector of true positives</li>
<li><code>fp</code>: Vector of false positives</li>
<li><code>tn</code>: Vector of true negatives</li>
<li><code>fn</code>: Vector of false negatives</li>
<li><code>...</code>: Further arguments</li>
</ul>
<p>The function should return a numeric vector, a matrix, or a <code>data.frame</code> with one column. If the column is named, the name will be included in the output and plots. Avoid using names that are identical to the column names that are by default returned by <strong>cutpointr</strong>, as such names will be prefixed by <code>metric_</code> in the output. The inputs (<code>tp</code>, <code>fp</code>, <code>tn</code>, and <code>fn</code>) are vectors. The code of the included metric functions can be accessed by simply typing their name.</p>
<p>For example, this is the <code>misclassification_cost</code> metric function:</p>
<div class="sourceCode" id="cb79"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb79-1"><a href="#cb79-1"></a>misclassification_cost</span></code></pre></div>
<pre><code>## function (tp, fp, tn, fn, cost_fp = 1, cost_fn = 1, ...) 
## {
##     misclassification_cost &lt;- cost_fp * fp + cost_fn * fn
##     misclassification_cost &lt;- matrix(misclassification_cost, 
##         ncol = 1)
##     colnames(misclassification_cost) &lt;- &quot;misclassification_cost&quot;
##     return(misclassification_cost)
## }
## &lt;bytecode: 0x000000001f68f520&gt;
## &lt;environment: namespace:cutpointr&gt;</code></pre>
</div>
</div>
</div>
<div id="plotting" class="section level1">
<h1>Plotting</h1>
<p><strong>cutpointr</strong> includes several convenience functions for plotting data from a <code>cutpointr</code> object. These include:</p>
<ul>
<li><code>plot_cutpointr</code>: General purpose plotting function for cutpointr or roc_cutpointr objects</li>
<li><code>plot_cut_boot</code>: Plot the bootstrapped distribution of optimal cutpoints</li>
<li><code>plot_metric</code>: If <code>maximize_metric</code> or <code>minimize_metric</code> was used this function plots all possible cutoffs on the x-axis vs. the respective metric values on the y-axis. If bootstrapping was run, a confidence interval based on the bootstrapped distribution of metric values at each cutpoint can be displayed. To display no confidence interval set <code>conf_lvl = 0</code>.</li>
<li><code>plot_metric_boot</code>: Plot the distribution of out-of-bag metric values</li>
<li><code>plot_precision_recall</code>: Plot the precision recall curve</li>
<li><code>plot_sensitivity_specificity</code>: Plot all cutpoints vs. sensitivity and specificity</li>
<li><code>plot_roc</code>: Plot the ROC curve</li>
<li><code>plot_x</code>: Plot the distribution of the predictor variable</li>
</ul>
<div class="sourceCode" id="cb81"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb81-1"><a href="#cb81-1"></a><span class="kw">plot_cut_boot</span>(opt_cut_b_g)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb82"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb82-1"><a href="#cb82-1"></a><span class="kw">plot_metric</span>(opt_cut_b_g, <span class="dt">conf_lvl =</span> <span class="fl">0.9</span>)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb83"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb83-1"><a href="#cb83-1"></a><span class="kw">plot_metric_boot</span>(opt_cut_b_g)</span></code></pre></div>
<pre><code>## Warning: Removed 12 rows containing non-finite values (stat_density).</code></pre>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb85"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb85-1"><a href="#cb85-1"></a><span class="kw">plot_precision_recall</span>(opt_cut_b_g)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb86"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb86-1"><a href="#cb86-1"></a><span class="kw">plot_sensitivity_specificity</span>(opt_cut_b_g)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb87"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb87-1"><a href="#cb87-1"></a><span class="kw">plot_roc</span>(opt_cut_b_g)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<p>All plot functions, except for the standard plot method that returns a composed plot, return <code>ggplot</code> objects that can be further modified. For example, changing labels, title, and the theme can be achieved this way:</p>
<div class="sourceCode" id="cb88"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb88-1"><a href="#cb88-1"></a>p &lt;-<span class="st"> </span><span class="kw">plot_x</span>(opt_cut_b_g)</span>
<span id="cb88-2"><a href="#cb88-2"></a>p <span class="op">+</span><span class="st"> </span><span class="kw">ggtitle</span>(<span class="st">&quot;Distribution of dsi&quot;</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_minimal</span>() <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">&quot;Depression score&quot;</span>)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div id="flexible-plotting-function" class="section level2">
<h2>Flexible plotting function</h2>
<p>Using <code>plot_cutpointr</code> any metric can be chosen to be plotted on the x- or y-axis and results of <code>cutpointr()</code> as well as <code>roc()</code> can be plotted. If a <code>cutpointr</code> object is to be plotted, it is thus irrelevant which <code>metric</code> function was chosen for cutpoint estimation. Any metric that can be calculated based on the ROC curve can be subsequently plotted as only the true / false positives / negatives over all cutpoints are needed. That way, not only the above plots can be produced, but also any combination of two metrics (or metric functions) and / or cutpoints. The built-in metric functions as well as user-defined functions or anonymous functions can be supplied to <code>xvar</code> and <code>yvar</code>. If bootstrapping was run, confidence intervals can be plotted around the y-variable. This is especially useful if the cutpoints, available in the <code>cutpoints</code> function, are placed on the x-axis. Note that confidence intervals can only be correctly plotted if the values of <code>xvar</code> are constant across bootstrap samples. For example, confidence intervals for TPR by FPR (a ROC curve) cannot be plotted easily, as the values of the false positive rate vary per bootstrap sample.</p>
<div class="sourceCode" id="cb89"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb89-1"><a href="#cb89-1"></a><span class="kw">plot_cutpointr</span>(opt_cut_b, <span class="dt">xvar =</span> cutpoints, <span class="dt">yvar =</span> sum_sens_spec, <span class="dt">conf_lvl =</span> <span class="fl">0.9</span>)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb90"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb90-1"><a href="#cb90-1"></a><span class="kw">plot_cutpointr</span>(opt_cut_b, <span class="dt">xvar =</span> fpr, <span class="dt">yvar =</span> tpr, <span class="dt">aspect_ratio =</span> <span class="dv">1</span>, <span class="dt">conf_lvl =</span> <span class="dv">0</span>)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<div class="sourceCode" id="cb91"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb91-1"><a href="#cb91-1"></a><span class="kw">plot_cutpointr</span>(opt_cut_b, <span class="dt">xvar =</span> cutpoint, <span class="dt">yvar =</span> tp, <span class="dt">conf_lvl =</span> <span class="fl">0.9</span>) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>()</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
</div>
<div id="manual-plotting" class="section level2">
<h2>Manual plotting</h2>
<p>Since <code>cutpointr</code> returns a <code>data.frame</code> with the original data, bootstrap results, and the ROC curve in nested tibbles, these data can be conveniently extracted and plotted manually. The relevant nested tibbles are in the columns <code>data</code>, <code>roc_curve</code> and <code>boot</code>. The following is an example of accessing and plotting the grouped data.</p>
<div class="sourceCode" id="cb92"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb92-1"><a href="#cb92-1"></a>opt_cut_b_g <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb92-2"><a href="#cb92-2"></a><span class="st">    </span><span class="kw">select</span>(data, subgroup) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb92-3"><a href="#cb92-3"></a><span class="st">    </span><span class="kw">unnest</span>(<span class="dt">cols =</span> data) <span class="op">%&gt;%</span><span class="st"> </span></span>
<span id="cb92-4"><a href="#cb92-4"></a><span class="st">    </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> suicide, <span class="dt">y =</span> dsi)) <span class="op">+</span><span class="st"> </span></span>
<span id="cb92-5"><a href="#cb92-5"></a><span class="st">    </span><span class="kw">geom_boxplot</span>(<span class="dt">alpha =</span> <span class="fl">0.3</span>) <span class="op">+</span><span class="st"> </span><span class="kw">facet_grid</span>(<span class="op">~</span>subgroup)</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
</div>
</div>
<div id="benchmarks" class="section level1">
<h1>Benchmarks</h1>
<p>To offer a comparison to established solutions, <strong>cutpointr</strong> will be benchmarked against <code>optimal.cutpoints</code> from the <strong>OptimalCutpoints</strong> package, <strong>ThresholdROC</strong> and custom functions based on the <strong>ROCR</strong> and <strong>pROC</strong> packages. By generating data of different sizes the benchmarks will offer a comparison of the scalability of the different solutions.</p>
<p>Using <code>prediction</code> and <code>performance</code> from the <strong>ROCR</strong> package and <code>roc</code> from the <strong>pROC</strong> package, we can write functions for computing the cutpoint that maximizes the sum of sensitivity and specificity. <strong>pROC</strong> has a built-in function to optimize a few metrics:</p>
<div class="sourceCode" id="cb93"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb93-1"><a href="#cb93-1"></a><span class="co"># Return cutpoint that maximizes the sum of sensitivity and specificity</span></span>
<span id="cb93-2"><a href="#cb93-2"></a><span class="co"># ROCR package</span></span>
<span id="cb93-3"><a href="#cb93-3"></a>rocr_sensspec &lt;-<span class="st"> </span><span class="cf">function</span>(x, class) {</span>
<span id="cb93-4"><a href="#cb93-4"></a>    pred &lt;-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">prediction</span>(x, class)</span>
<span id="cb93-5"><a href="#cb93-5"></a>    perf &lt;-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">&quot;sens&quot;</span>, <span class="st">&quot;spec&quot;</span>)</span>
<span id="cb93-6"><a href="#cb93-6"></a>    sens &lt;-<span class="st"> </span><span class="kw">slot</span>(perf, <span class="st">&quot;y.values&quot;</span>)[[<span class="dv">1</span>]]</span>
<span id="cb93-7"><a href="#cb93-7"></a>    spec &lt;-<span class="st"> </span><span class="kw">slot</span>(perf, <span class="st">&quot;x.values&quot;</span>)[[<span class="dv">1</span>]]</span>
<span id="cb93-8"><a href="#cb93-8"></a>    cut &lt;-<span class="st"> </span><span class="kw">slot</span>(perf, <span class="st">&quot;alpha.values&quot;</span>)[[<span class="dv">1</span>]]</span>
<span id="cb93-9"><a href="#cb93-9"></a>    cut[<span class="kw">which.max</span>(sens <span class="op">+</span><span class="st"> </span>spec)]</span>
<span id="cb93-10"><a href="#cb93-10"></a>}</span>
<span id="cb93-11"><a href="#cb93-11"></a></span>
<span id="cb93-12"><a href="#cb93-12"></a><span class="co"># pROC package</span></span>
<span id="cb93-13"><a href="#cb93-13"></a>proc_sensspec &lt;-<span class="st"> </span><span class="cf">function</span>(x, class) {</span>
<span id="cb93-14"><a href="#cb93-14"></a>    r &lt;-<span class="st"> </span>pROC<span class="op">::</span><span class="kw">roc</span>(class, x, <span class="dt">algorithm =</span> <span class="dv">2</span>, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>), <span class="dt">direction =</span> <span class="st">&quot;&lt;&quot;</span>)</span>
<span id="cb93-15"><a href="#cb93-15"></a>    pROC<span class="op">::</span><span class="kw">coords</span>(r, <span class="st">&quot;best&quot;</span>, <span class="dt">ret=</span><span class="st">&quot;threshold&quot;</span>, <span class="dt">transpose =</span> <span class="ot">FALSE</span>)[<span class="dv">1</span>]</span>
<span id="cb93-16"><a href="#cb93-16"></a>}</span></code></pre></div>
<p>The benchmarking will be carried out using the <strong>microbenchmark</strong> package and randomly generated data. The values of the <code>x</code> predictor variable are drawn from a normal distribution which leads to a lot more unique values than were encountered before in the <code>suicide</code> data. Accordingly, the search for an optimal cutpoint is much more demanding, if all possible cutpoints are evaluated.</p>
<p>Benchmarks are run for sample sizes of 100, 1000, 1e4, 1e5, 1e6, and 1e7. For low sample sizes <strong>cutpointr</strong> is slower than the other solutions. While this should be of low practical importance, <strong>cutpointr</strong> scales more favorably with increasing sample size. The speed disadvantage in small samples that leads to the lower limit of around 25ms is mainly due to the nesting of the original data and the results that makes the compact output of <code>cutpointr</code> possible. This observation is emphasized by the fact that <code>cutpointr::roc</code> is also quite fast in small samples. For sample sizes &gt; 1e5 <strong>cutpointr</strong> is a little faster than the functions based on <strong>ROCR</strong> and <strong>pROC</strong>. Both of these solutions are generally faster than <strong>OptimalCutpoints</strong> and <strong>ThresholdROC</strong> with the exception of small samples. <strong>OptimalCutpoints</strong> and <strong>ThresholdROC</strong> had to be excluded from benchmarks with more than 1e4 observations due to high memory requirements and/or excessive run times, rendering the use of these packages in larger samples impractical.</p>
<div class="sourceCode" id="cb94"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb94-1"><a href="#cb94-1"></a><span class="co"># ROCR package</span></span>
<span id="cb94-2"><a href="#cb94-2"></a>rocr_roc &lt;-<span class="st"> </span><span class="cf">function</span>(x, class) {</span>
<span id="cb94-3"><a href="#cb94-3"></a>    pred &lt;-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">prediction</span>(x, class)</span>
<span id="cb94-4"><a href="#cb94-4"></a>    perf &lt;-<span class="st"> </span>ROCR<span class="op">::</span><span class="kw">performance</span>(pred, <span class="st">&quot;sens&quot;</span>, <span class="st">&quot;spec&quot;</span>)</span>
<span id="cb94-5"><a href="#cb94-5"></a>    <span class="kw">return</span>(<span class="ot">NULL</span>)</span>
<span id="cb94-6"><a href="#cb94-6"></a>}</span>
<span id="cb94-7"><a href="#cb94-7"></a></span>
<span id="cb94-8"><a href="#cb94-8"></a><span class="co"># pROC package</span></span>
<span id="cb94-9"><a href="#cb94-9"></a>proc_roc &lt;-<span class="st"> </span><span class="cf">function</span>(x, class) {</span>
<span id="cb94-10"><a href="#cb94-10"></a>    r &lt;-<span class="st"> </span>pROC<span class="op">::</span><span class="kw">roc</span>(class, x, <span class="dt">algorithm =</span> <span class="dv">2</span>, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>), <span class="dt">direction =</span> <span class="st">&quot;&lt;&quot;</span>)</span>
<span id="cb94-11"><a href="#cb94-11"></a>    <span class="kw">return</span>(<span class="ot">NULL</span>)</span>
<span id="cb94-12"><a href="#cb94-12"></a>}</span></code></pre></div>
<p><img src="" style="display: block; margin: auto;" /></p>
<table>
<thead>
<tr class="header">
<th align="right">n</th>
<th align="left">task</th>
<th align="right">OptimalCutpoints</th>
<th align="right">ROCR</th>
<th align="right">ThresholdROC</th>
<th align="right">cutpointr</th>
<th align="right">pROC</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="right">1e+02</td>
<td align="left">Cutpoint Estimation</td>
<td align="right">2.288702</td>
<td align="right">1.812802</td>
<td align="right">1.194301</td>
<td align="right">4.5018015</td>
<td align="right">0.662101</td>
</tr>
<tr class="even">
<td align="right">1e+03</td>
<td align="left">Cutpoint Estimation</td>
<td align="right">45.056801</td>
<td align="right">2.176401</td>
<td align="right">36.239852</td>
<td align="right">4.8394010</td>
<td align="right">0.981001</td>
</tr>
<tr class="odd">
<td align="right">1e+04</td>
<td align="left">Cutpoint Estimation</td>
<td align="right">2538.612001</td>
<td align="right">5.667101</td>
<td align="right">2503.801251</td>
<td align="right">8.5662515</td>
<td align="right">4.031701</td>
</tr>
<tr class="even">
<td align="right">1e+05</td>
<td align="left">Cutpoint Estimation</td>
<td align="right">NA</td>
<td align="right">43.118751</td>
<td align="right">NA</td>
<td align="right">45.3845010</td>
<td align="right">37.150151</td>
</tr>
<tr class="odd">
<td align="right">1e+06</td>
<td align="left">Cutpoint Estimation</td>
<td align="right">NA</td>
<td align="right">607.023851</td>
<td align="right">NA</td>
<td align="right">465.0032010</td>
<td align="right">583.095000</td>
</tr>
<tr class="even">
<td align="right">1e+07</td>
<td align="left">Cutpoint Estimation</td>
<td align="right">NA</td>
<td align="right">7850.258700</td>
<td align="right">NA</td>
<td align="right">5467.3328010</td>
<td align="right">7339.356101</td>
</tr>
<tr class="odd">
<td align="right">1e+02</td>
<td align="left">ROC curve calculation</td>
<td align="right">NA</td>
<td align="right">1.732651</td>
<td align="right">NA</td>
<td align="right">0.7973505</td>
<td align="right">0.447701</td>
</tr>
<tr class="even">
<td align="right">1e+03</td>
<td align="left">ROC curve calculation</td>
<td align="right">NA</td>
<td align="right">2.035852</td>
<td align="right">NA</td>
<td align="right">0.8593010</td>
<td align="right">0.694802</td>
</tr>
<tr class="odd">
<td align="right">1e+04</td>
<td align="left">ROC curve calculation</td>
<td align="right">NA</td>
<td align="right">5.662151</td>
<td align="right">NA</td>
<td align="right">1.8781510</td>
<td align="right">3.658050</td>
</tr>
<tr class="even">
<td align="right">1e+05</td>
<td align="left">ROC curve calculation</td>
<td align="right">NA</td>
<td align="right">42.820852</td>
<td align="right">NA</td>
<td align="right">11.0992510</td>
<td align="right">35.329301</td>
</tr>
<tr class="odd">
<td align="right">1e+06</td>
<td align="left">ROC curve calculation</td>
<td align="right">NA</td>
<td align="right">612.471901</td>
<td align="right">NA</td>
<td align="right">159.8100505</td>
<td align="right">610.433700</td>
</tr>
<tr class="even">
<td align="right">1e+07</td>
<td align="left">ROC curve calculation</td>
<td align="right">NA</td>
<td align="right">7806.385452</td>
<td align="right">NA</td>
<td align="right">2032.6935510</td>
<td align="right">7081.897251</td>
</tr>
</tbody>
</table>
</div>



<!-- code folding -->


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Build a script element for the MathJax URL below and attach it to the
  // document's head when this code runs, rather than relying on a static tag.
  (function () {
    var headNode = document.getElementsByTagName("head")[0];
    var mathjaxTag = document.createElement("script");
    mathjaxTag.type = "text/javascript";
    mathjaxTag.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    // Appending the element to the head is the last step of the loader.
    headNode.appendChild(mathjaxTag);
  }());
</script>

</body>
</html>
back to top