https://github.com/cran/cutpointr
Raw File
Tip revision: 4408233eb8624dea85ecf18e86d50c296165c3f2 authored by Christian Thiele on 13 April 2022, 17:12:29 UTC
version 1.1.2
Tip revision: 4408233
cutpointr_estimation.html
<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />

<meta name="viewport" content="width=device-width, initial-scale=1" />

<meta name="author" content="Christian Thiele" />

<meta name="date" content="2022-04-13" />

<title>Robust cutpoint estimation</title>

<script>// Pandoc 2.9 puts the section attributes on both the wrapping div and its
// heading. Strip them from the heading so the result matches Pandoc < 2.8.
document.addEventListener('DOMContentLoaded', function() {
  var headerPattern = /^h[1-6]$/i;
  var candidates = document.querySelectorAll("div.section[class*='level'] > :first-child");
  for (var idx = 0; idx < candidates.length; idx++) {
    var node = candidates[idx];
    // Only a heading (h1-h6) that is the first child of a section div is touched.
    if (headerPattern.test(node.tagName)) {
      // Keep removing the first attribute until the collection is empty.
      var attrs = node.attributes;
      while (attrs.length > 0) {
        node.removeAttribute(attrs[0].name);
      }
    }
  }
});
</script>
<script>// Hide the empty <a> tags inside highlighted code blocks from screen readers and
// take them out of the keyboard tab order
// (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786).
// v0.0.1
// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020.

document.addEventListener('DOMContentLoaded', function() {
  // Every element carrying the "sourceCode" class is scanned for anchors.
  const codeList = document.getElementsByClassName("sourceCode");
  for (var i = 0; i < codeList.length; i++) {
    var linkList = codeList[i].getElementsByTagName('a');
    for (var j = 0; j < linkList.length; j++) {
      // Only anchors without any content (the per-line link targets) are affected.
      if (linkList[j].innerHTML === "") {
        linkList[j].setAttribute('aria-hidden', 'true');
        // An <a href> is still focusable when aria-hidden is set; without this,
        // keyboard users would tab onto an element that is hidden from
        // assistive technology.
        linkList[j].setAttribute('tabindex', '-1');
      }
    }
  }
});
</script>

<style type="text/css">
  /* Base helper classes; later style blocks in this file may override them. */
  code {
    white-space: pre-wrap;
  }
  span.smallcaps {
    font-variant: small-caps;
  }
  span.underline {
    text-decoration: underline;
  }
  /* Two-column layout helper: each column takes half the width. */
  div.column {
    display: inline-block;
    vertical-align: top;
    width: 50%;
  }
  /* First line flush left, following lines indented by 1.5em. */
  div.hanging-indent {
    margin-left: 1.5em;
    text-indent: -1.5em;
  }
  ul.task-list {
    list-style: none;
  }
</style>


<style type="text/css">
  /* Preserve whitespace in code without wrapping (later rules in this file may override this). */
  code { white-space: pre; }
  /* Do not clip or scroll content of elements with the sourceCode class. */
  .sourceCode { overflow: visible; }
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
/* Empty rule: it sets neither color nor background-color, so the script
   further down in this file that rewrites div.sourceCode rules of this
   stylesheet into pre.sourceCode rules leaves it untouched. */
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */

</style>
<script>
// Move the colour styling that pandoc puts on div.sourceCode over to pre.sourceCode.
(function() {
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    // Only the stylesheet whose element is tagged data-origin="pandoc" is rewritten.
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;

    var ruleList;
    try {
      ruleList = sheet.cssRules;
    } catch (err) {
      // Reading the rules failed for this sheet; skip it.
      continue;
    }

    for (var r = 0; r < ruleList.length; r++) {
      var current = ruleList[r];
      // Look for a plain style rule whose selector is exactly div.sourceCode.
      var isDivSourceCode = current.type === current.STYLE_RULE &&
        current.selectorText === "div.sourceCode";
      if (!isDivSourceCode) continue;

      var declarations = current.style.cssText;
      // Rules that set neither a text colour nor a background colour stay as they are.
      var setsColour = current.style.color !== '' || current.style.backgroundColor !== '';
      if (!setsColour) continue;

      // Swap the rule in place so its position in the sheet is unchanged.
      sheet.deleteRule(r);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', r);
    }
  }
})();
</script>




<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
hr {
/* Clear the default border on all sides, then draw only a thin top line. */
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap; 
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
img {
background-color: #FFFFFF;
padding: 2px;
border-radius: 3px;
/* Thin light-grey frame around every image. */
border: 1px solid #CCCCCC;
margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }

code > span.kw { color: #555; font-weight: bold; } 
code > span.dt { color: #902000; } 
code > span.dv { color: #40a070; } 
code > span.bn { color: #d14; } 
code > span.fl { color: #d14; } 
code > span.ch { color: #d14; } 
code > span.st { color: #d14; } 
code > span.co { color: #888888; font-style: italic; } 
code > span.ot { color: #007020; } 
code > span.al { color: #ff0000; font-weight: bold; } 
code > span.fu { color: #900; font-weight: bold; } 
code > span.er { color: #a61717; background-color: #e3d2d2; } 
</style>




</head>

<body>




<h1 class="title toc-ignore">Robust cutpoint estimation</h1>
<h4 class="author">Christian Thiele</h4>
<h4 class="date">2022-04-13</h4>



<div id="more-robust-cutpoint-estimation-methods" class="section level1">
<h1>More robust cutpoint estimation methods</h1>
<div id="bootstrapped-cutpoints" class="section level2">
<h2>Bootstrapped cutpoints</h2>
<p>It has been shown that bagging can substantially improve the performance of a wide range of types of models in regression as well as in classification tasks. This method is available for cutpoint estimation via the <code>maximize_boot_metric</code> and <code>minimize_boot_metric</code> functions. If one of these functions is used as <code>method</code>, <code>boot_cut</code> bootstrap samples are drawn, the cutpoint optimization is carried out in each one and a summary (e.g. the mean) of the resulting optimal cutpoints on the bootstrap samples is returned as the optimal cutpoint in <code>cutpointr</code>. Note that if bootstrap validation is run, i.e. if <code>boot_runs</code> is larger than zero, an outer bootstrap will be executed. In the bootstrap validation routine <code>boot_runs</code> bootstrap samples are generated and each one is again bootstrapped <code>boot_cut</code> times. This may lead to long run times, so activating the built-in parallelization may be advisable.</p>
<p>The advantages of bootstrapping the optimal cutpoint are that the procedure doesn’t possess parameters that have to be tuned, unlike the LOESS smoothing, that it doesn’t rely on assumptions, unlike the Normal method, and that it is applicable to any metric that can be used with <code>minimize_metric</code> or <code>maximize_metric</code>, unlike the Kernel method. Furthermore, just as Random Forests cannot be overfit by increasing the number of trees, the bootstrapped cutpoints cannot be overfit by running an excessive number of <code>boot_cut</code> repetitions.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1"></a><span class="kw">library</span>(cutpointr)</span>
<span id="cb1-2"><a href="#cb1-2"></a><span class="kw">set.seed</span>(<span class="dv">100</span>)</span>
<span id="cb1-3"><a href="#cb1-3"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, </span>
<span id="cb1-4"><a href="#cb1-4"></a>          <span class="dt">method =</span> maximize_boot_metric,</span>
<span id="cb1-5"><a href="#cb1-5"></a>          <span class="dt">boot_cut =</span> <span class="dv">200</span>, <span class="dt">summary_func =</span> mean,</span>
<span id="cb1-6"><a href="#cb1-6"></a>          <span class="dt">metric =</span> accuracy, <span class="dt">silent =</span> <span class="ot">TRUE</span>)</span></code></pre></div>
<pre><code>## # A tibble: 2 x 18
##   subgroup direction optimal_cutpoint method               accuracy      acc
##   &lt;chr&gt;    &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                   &lt;dbl&gt;    &lt;dbl&gt;
## 1 female   &gt;=                 5.73246 maximize_boot_metric 0.956633 0.956633
## 2 male     &gt;=                 8.41026 maximize_boot_metric 0.95     0.95    
##   sensitivity specificity      AUC pos_class neg_class prevalence outcome
##         &lt;dbl&gt;       &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;  
## 1    0.444444    0.994521 0.944647 yes       no         0.0688776 suicide
## 2    0.222222    1        0.861747 yes       no         0.0642857 suicide
##   predictor grouping data               roc_curve                boot 
##   &lt;chr&gt;     &lt;chr&gt;    &lt;list&gt;             &lt;list&gt;                   &lt;lgl&gt;
## 1 dsi       gender   &lt;tibble [392 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA   
## 2 dsi       gender   &lt;tibble [140 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA</code></pre>
</div>
<div id="loess-smoothing-for-selecting-a-cutpoint" class="section level2">
<h2>LOESS smoothing for selecting a cutpoint</h2>
<p>When using <code>maximize_metric</code> and <code>minimize_metric</code> the optimal cutpoint is selected by searching for the maximum or minimum of the metric function. For example, we may want to minimize the misclassification cost. Since false negatives (a suicide attempt was not anticipated) can be regarded as much more severe than false positives, we can set the cost of a false negative <code>cost_fn</code>, for example, to ten times the cost of a false positive.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">method =</span> minimize_metric,</span>
<span id="cb3-2"><a href="#cb3-2"></a>                     <span class="dt">metric =</span> misclassification_cost, <span class="dt">cost_fp =</span> <span class="dv">1</span>, <span class="dt">cost_fn =</span> <span class="dv">10</span>)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Plot of the misclassification cost by cutpoint value for each gender subgroup" style="display: block; margin: auto;" /></p>
<p>As this “optimal” cutpoint may depend on minor differences between the possible cutoffs, smoothing of the function of metric values by cutpoint value might be desirable, especially in small samples. The <code>minimize_loess_metric</code> and <code>maximize_loess_metric</code> functions can be used to smooth the function so that the optimal cutpoint is selected based on the smoothed metric values. Options to modify the smoothing, which is implemented using <code>loess.as</code> from the <strong>fANCOVA</strong> package, include:</p>
<ul>
<li><code>criterion</code>: the criterion for automatic smoothing parameter selection: “aicc” denotes bias-corrected AIC criterion, “gcv” denotes generalized cross-validation.</li>
<li><code>degree</code>: the degree of the local polynomials to be used. It can be 0, 1 or 2.</li>
<li><code>family</code>: if “gaussian” fitting is by least-squares, and if “symmetric” a re-descending M estimator is used with Tukey’s biweight function.</li>
<li><code>user.span</code>: the user-defined parameter which controls the degree of smoothing.</li>
</ul>
<p>Using parameters for the LOESS smoothing of <code>criterion = &quot;aicc&quot;</code>, <code>degree = 2</code>, <code>family = &quot;symmetric&quot;</code>, and <code>user.span = 0.7</code> we get the following smoothed versions of the above metrics:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, </span>
<span id="cb7-2"><a href="#cb7-2"></a>                     <span class="dt">method =</span> minimize_loess_metric,</span>
<span id="cb7-3"><a href="#cb7-3"></a>                     <span class="dt">criterion =</span> <span class="st">&quot;aicc&quot;</span>, <span class="dt">family =</span> <span class="st">&quot;symmetric&quot;</span>, </span>
<span id="cb7-4"><a href="#cb7-4"></a>                     <span class="dt">degree =</span> <span class="dv">2</span>, <span class="dt">user.span =</span> <span class="fl">0.7</span>,</span>
<span id="cb7-5"><a href="#cb7-5"></a>                     <span class="dt">metric =</span> misclassification_cost, <span class="dt">cost_fp =</span> <span class="dv">1</span>, <span class="dt">cost_fn =</span> <span class="dv">10</span>)</span></code></pre></div>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Plot of the LOESS-smoothed misclassification cost by cutpoint value for each gender subgroup" style="display: block; margin: auto;" /></p>
<p>The optimal cutpoint for the female subgroup changes to 3. Note, though, that there are no reliable rules for selecting the “best” smoothing parameters. Notably, the LOESS smoothing is sensitive to the number of unique cutpoints. A large number of unique cutpoints generally leads to a more volatile curve of metric values by cutpoint value, even after smoothing. Thus, the curve tends to be undersmoothed in that scenario. The unsmoothed metric values are returned in <code>opt_cut$roc_curve</code> in the column <code>m_unsmoothed</code>.</p>
</div>
<div id="smoothing-via-generalized-additive-models-for-selecting-a-cutpoint" class="section level2">
<h2>Smoothing via Generalized Additive Models for selecting a cutpoint</h2>
<p>In a similar fashion, the function of metric values per cutpoint can be smoothed using Generalized Additive Models with smooth terms. Internally, <code>mgcv::gam</code> carries out the smoothing which can be customized via the arguments <code>formula</code> and <code>optimizer</code>, see <code>help(&quot;gam&quot;, package = &quot;mgcv&quot;)</code>. Most importantly, the GAM can be specified by altering the default formula, for example the smoothing function could be configured to apply cubic regression splines (<code>&quot;cr&quot;</code>) as the smooth term. As the <code>suicide</code> data has only very few unique cutpoints, it is not very suitable for showcasing the GAM smoothing, so we will use two classes of the <code>iris</code> data here. In this case, the purely empirical method and the GAM smoothing lead to identical cutpoints, but in practice the GAM smoothing tends to be more robust, especially with larger data. An attractive feature of the GAM smoothing is that the default values tend to work quite well and usually require no tuning, eliminating researcher degrees of freedom.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1"></a><span class="kw">library</span>(ggplot2)</span>
<span id="cb9-2"><a href="#cb9-2"></a>exdat &lt;-<span class="st"> </span>iris</span>
<span id="cb9-3"><a href="#cb9-3"></a>exdat &lt;-<span class="st"> </span>exdat[exdat<span class="op">$</span>Species <span class="op">!=</span><span class="st"> &quot;setosa&quot;</span>, ]</span>
<span id="cb9-4"><a href="#cb9-4"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(exdat, Petal.Length, Species,</span>
<span id="cb9-5"><a href="#cb9-5"></a>                     <span class="dt">method =</span> minimize_gam_metric,</span>
<span id="cb9-6"><a href="#cb9-6"></a>                     <span class="dt">formula =</span> m <span class="op">~</span><span class="st"> </span><span class="kw">s</span>(x.sorted, <span class="dt">bs =</span> <span class="st">&quot;cr&quot;</span>),</span>
<span id="cb9-7"><a href="#cb9-7"></a>                     <span class="dt">metric =</span> abs_d_sens_spec)</span></code></pre></div>
<pre><code>## Assuming the positive class is virginica</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
<p><img src="" alt="Plot of the GAM-smoothed abs_d_sens_spec metric by Petal.Length cutpoint for the iris data" style="display: block; margin: auto;" /></p>
</div>
<div id="spline-smoothing-for-selecting-a-cutpoint" class="section level2">
<h2>Spline smoothing for selecting a cutpoint</h2>
<p>Again in the same fashion the function of metric values per cutpoint can be smoothed using smoothing splines. By default, the number of knots is automatically chosen using the <code>cutpoint_knots</code> function. That function uses <code>stats::.nknots.smspl</code>, which is the default in <code>stats::smooth.spline</code> to pick the number of knots.</p>
<p>Alternatively, the number of knots can be set manually and also the other smoothing parameters of <code>stats::smooth.spline</code> can be set as desired. For details see <code>?maximize_spline_metric</code>.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1"></a>opt_cut &lt;-<span class="st"> </span><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, </span>
<span id="cb13-2"><a href="#cb13-2"></a>                     <span class="dt">method =</span> minimize_spline_metric, <span class="dt">spar =</span> <span class="fl">0.4</span>,</span>
<span id="cb13-3"><a href="#cb13-3"></a>                     <span class="dt">metric =</span> misclassification_cost, <span class="dt">cost_fp =</span> <span class="dv">1</span>, <span class="dt">cost_fn =</span> <span class="dv">10</span>)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## nknots: 10
## nknots: 10</code></pre>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1"></a><span class="kw">plot_metric</span>(opt_cut)</span></code></pre></div>
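<p><img src="" alt="Plot of the spline-smoothed misclassification cost by cutpoint value for each gender subgroup" style="display: block; margin: auto;" /></p>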
<div id="parametric-method-assuming-normality" class="section level3">
<h3>Parametric method assuming normality</h3>
<p>The Normal method in <code>oc_youden_normal</code> is a parametric method for maximizing the Youden-Index or equivalently the sum of <span class="math inline">\(Se\)</span> and <span class="math inline">\(Sp\)</span>. It relies on the assumption that the predictor for both the negative and positive observations is normally distributed. In that case it can be shown that</p>
<p><span class="math display">\[c^* = \frac{(\mu_P \sigma_N^2 - \mu_N \sigma_P^2) - \sigma_N \sigma_P \sqrt{(\mu_N - \mu_P)^2 + (\sigma_N^2 - \sigma_P^2) \log(\sigma_N^2 / \sigma_P^2)}}{\sigma_N^2 - \sigma_P^2}\]</span></p>
<p>provides the optimal cutpoint <span class="math inline">\(c^*\)</span> that maximizes the Youden-Index, where the negative class is distributed as <span class="math inline">\(N(\mu_N, \sigma_N^2)\)</span> and the positive class is independently distributed as <span class="math inline">\(N(\mu_P, \sigma_P^2)\)</span>. If <span class="math inline">\(\sigma_N\)</span> and <span class="math inline">\(\sigma_P\)</span> are equal, the expression can be simplified to <span class="math inline">\(c^* = \frac{\mu_N + \mu_P}{2}\)</span>. However, the <code>oc_youden_normal</code> method in cutpointr always assumes unequal standard deviations. Since this method does not select a cutpoint from the observed predictor values, it is questionable which values for <span class="math inline">\(Se\)</span> and <span class="math inline">\(Sp\)</span> should be reported. Here, the Youden-Index can be calculated as</p>
<p><span class="math display">\[J = \Phi(\frac{c^* - \mu_N}{\sigma_N}) - \Phi(\frac{c^* - \mu_P}{\sigma_P})\]</span></p>
<p>if the assumption of normality holds. However, since there exist several methods that do not select cutpoints from the available observations and to unify the reporting of metrics for these methods, <strong>cutpointr</strong> reports all metrics, e.g. <span class="math inline">\(Se\)</span> and <span class="math inline">\(Sp\)</span>, based on the empirical observations.</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">method =</span> oc_youden_normal)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## # A tibble: 2 x 18
##   subgroup direction optimal_cutpoint method           sum_sens_spec      acc
##   &lt;chr&gt;    &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                    &lt;dbl&gt;    &lt;dbl&gt;
## 1 female   &gt;=                 2.47775 oc_youden_normal       1.71618 0.895408
## 2 male     &gt;=                 3.17226 oc_youden_normal       1.54453 0.864286
##   sensitivity specificity      AUC pos_class neg_class prevalence outcome
##         &lt;dbl&gt;       &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;  
## 1    0.814815    0.901370 0.944647 yes       no         0.0688776 suicide
## 2    0.666667    0.877863 0.861747 yes       no         0.0642857 suicide
##   predictor grouping data               roc_curve                boot 
##   &lt;chr&gt;     &lt;chr&gt;    &lt;list&gt;             &lt;list&gt;                   &lt;lgl&gt;
## 1 dsi       gender   &lt;tibble [392 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA   
## 2 dsi       gender   &lt;tibble [140 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA</code></pre>
</div>
<div id="nonparametric-kernel-method" class="section level3">
<h3>Nonparametric kernel method</h3>
<p>A nonparametric alternative is the Kernel method <span class="citation">(Fluss et al., 2005)</span>. Here, the empirical distribution functions are smoothed using the Gaussian kernel functions <span class="math inline">\(\hat{F}_N(t) = \frac{1}{n} \sum^n_{i=1} \Phi(\frac{t - y_i}{h_y})\)</span> and <span class="math inline">\(\hat{G}_P(t) = \frac{1}{m} \sum^m_{i=1} \Phi(\frac{t - x_i}{h_x})\)</span> for the negative and positive classes, respectively. Following Silverman’s plug-in “rule of thumb”, the bandwidths are selected as <span class="math inline">\(h_y = 0.9 \cdot \min\{s_y, iqr_y/1.34\} \cdot n^{-0.2}\)</span> and <span class="math inline">\(h_x = 0.9 \cdot \min\{s_x, iqr_x/1.34\} \cdot m^{-0.2}\)</span> where <span class="math inline">\(s\)</span> is the sample standard deviation and <span class="math inline">\(iqr\)</span> is the interquartile range. It has been demonstrated that AUC estimation is rather insensitive to the choice of the bandwidth procedure <span class="citation">(Faraggi and Reiser, 2002)</span> and thus the plug-in bandwidth estimator has also been recommended for cutpoint estimation. The <code>oc_youden_kernel</code> function in <strong>cutpointr</strong> uses a Gaussian kernel and the direct plug-in method for selecting the bandwidths. The kernel smoothing is done via the <code>bkde</code> function from the <strong>KernSmooth</strong> package <span class="citation">(Wand, 2013)</span>.</p>
<p>Again, there is a way to calculate the Youden-Index from the results of this method <span class="citation">(Fluss et al., 2005)</span>, which is</p>
<p><span class="math display">\[\hat{J} = \max_c \{\hat{F}_N(c) - \hat{G}_P(c) \}\]</span></p>
<p>but as before we prefer to report all metrics based on applying the cutpoint that was estimated using the Kernel method to the empirical observations.</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1"></a><span class="kw">cutpointr</span>(suicide, dsi, suicide, gender, <span class="dt">method =</span> oc_youden_kernel)</span></code></pre></div>
<pre><code>## Assuming the positive class is yes</code></pre>
<pre><code>## Assuming the positive class has higher x values</code></pre>
<pre><code>## # A tibble: 2 x 18
##   subgroup direction optimal_cutpoint method           sum_sens_spec      acc
##   &lt;chr&gt;    &lt;chr&gt;                &lt;dbl&gt; &lt;chr&gt;                    &lt;dbl&gt;    &lt;dbl&gt;
## 1 female   &gt;=                 1.18128 oc_youden_kernel       1.80812 0.885204
## 2 male     &gt;=                 1.31636 oc_youden_kernel       1.58694 0.807143
##   sensitivity specificity      AUC pos_class neg_class prevalence outcome
##         &lt;dbl&gt;       &lt;dbl&gt;    &lt;dbl&gt; &lt;fct&gt;     &lt;fct&gt;          &lt;dbl&gt; &lt;chr&gt;  
## 1    0.925926    0.882192 0.944647 yes       no         0.0688776 suicide
## 2    0.777778    0.809160 0.861747 yes       no         0.0642857 suicide
##   predictor grouping data               roc_curve                boot 
##   &lt;chr&gt;     &lt;chr&gt;    &lt;list&gt;             &lt;list&gt;                   &lt;lgl&gt;
## 1 dsi       gender   &lt;tibble [392 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA   
## 2 dsi       gender   &lt;tibble [140 x 2]&gt; &lt;roc_cutpointr [11 x 9]&gt; NA</code></pre>
</div>
</div>
</div>



<!-- code folding -->


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Create the MathJax <script> element at runtime and attach it to <head>.
    var mathjaxUrl = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    var headElement = document.getElementsByTagName("head")[0];
    var loader = document.createElement("script");
    loader.type = "text/javascript";
    loader.src  = mathjaxUrl;
    headElement.appendChild(loader);
  })();
</script>

</body>
</html>
back to top