Revision 8ee0117993c6df7789aa0e96b39421d1a4691806 authored by Karel Kubicek on 04 April 2018, 21:23:41 UTC, committed by Karel Kubicek on 04 April 2018, 21:23:41 UTC
1 parent 2a1aa3e
statistics.cc
#include "statistics.h"
#include <algorithm>
#include <cmath>
#include <stdexcept>
/** gamma function
* taken from http://www.crbond.com/math.htm (Gamma function in C/C++ for real
* arguments, ported from Zhang and Jin)
* returns 1e308 if argument is a negative integer or 0 or if argument exceeds
* 171.
* @param x argument
* @return Gamma(argument)
*/
static double gamma0(double x) {
/* gamma function.
Algorithms and coefficient values from "Computation of Special
Functions", Zhang and Jin, John Wiley and Sons, 1996.
(C) 2003, C. Bond. All rights reserved.
taken from http://www.crbond.com/math.htm */
int i, k, m;
double ga, gr, r = 1.0, z;
static double g[] = {1.0,
0.5772156649015329,
-0.6558780715202538,
-0.420026350340952e-1,
0.1665386113822915,
-0.421977345555443e-1,
-0.9621971527877e-2,
0.7218943246663e-2,
-0.11651675918591e-2,
-0.2152416741149e-3,
0.1280502823882e-3,
-0.201348547807e-4,
-0.12504934821e-5,
0.1133027232e-5,
-0.2056338417e-6,
0.6116095e-8,
0.50020075e-8,
-0.11812746e-8,
0.1043427e-9,
0.77823e-11,
-0.36968e-11,
0.51e-12,
-0.206e-13,
-0.54e-14,
0.14e-14};
if (x > 171.0)
return 1e308; // This value is an overflow flag.
if (x == int(x)) {
if (x > 0.0) {
ga = 1.0; // use factorial
for (i = 2; i < x; i++) {
ga *= i;
}
} else
ga = 1e308;
} else {
if (std::fabs(x) > 1.0) {
z = std::fabs(x);
m = int(z);
r = 1.0;
for (k = 1; k <= m; k++) {
r *= (z - k);
}
z -= m;
} else
z = x;
gr = g[24];
for (k = 23; k >= 0; k--) {
gr = gr * z + g[k];
}
ga = 1.0 / (gr * z);
if (std::fabs(x) > 1.0) {
ga *= r;
if (x < 0.0) {
ga = -M_PI / (x * ga * std::sin(M_PI * x));
}
}
}
return ga;
}
/** incomplete gamma function
* taken from http://www.crbond.com/math.htm (Incomplete Gamma function, ported
* from Zhang and Jin)
* @param a
* @param x
* @param gin
* @param gim
* @param gip
* @return
*/
static int incog(double a, double x, double& gin, double& gim, double& gip) {
double xam, r, s, ga, t0;
int k;
if ((a < 0.0) || (x < 0))
return 1;
xam = -x + a * std::log(x);
if ((xam > 700) || (a > 170.0))
return 1;
if (x == 0.0) {
gin = 0.0;
gim = gamma0(a);
gip = 0.0;
return 0;
}
if (x <= 1.0 + a) {
s = 1.0 / a;
r = s;
for (k = 1; k <= 60; k++) {
r *= x / (a + k);
s += r;
if (std::fabs(r / s) < 1e-15)
break;
}
gin = std::exp(xam) * s;
ga = gamma0(a);
gip = gin / ga;
gim = ga - gin;
} else {
t0 = 0.0;
for (k = 60; k >= 1; k--) {
t0 = (k - a) / (1.0 + k / (x + t0));
}
gim = std::exp(xam) / (x + t0);
ga = gamma0(a);
gin = ga - gim;
gip = 1.0 - gim / ga;
}
return 0;
}
/** function converting Chi^2 value to corresponding p-value
* taken from
* http://www.codeproject.com/Articles/432194/How-to-Calculate-the-Chi-Squared-P-Value
* note: do not use full implementation from above website, it is not precise
* enough for small inputs
* @param Dof degrees of freedom
* @param Cv Chi^2 value
* @return p-value
*/
static double chisqr(int Dof, double Cv) {
if (Cv < 0 || Dof < 1) {
return 1;
}
double K = (double(Dof)) * 0.5;
double X = Cv * 0.5;
if (Dof == 2) {
return std::exp(-1.0 * X);
}
double gin, gim, gip;
incog(K, X, gin, gim, gip); // compute incomplete gamma function
double PValue = gim;
PValue /= gamma0(K); // divide by gamma function value
return PValue;
}
namespace core {
namespace statistics {
double two_sample_chisqr(const std::vector<std::uint64_t>& distribution_a,
const std::vector<std::uint64_t>& distribution_b) {
// using two-smaple Chi^2 test
// (http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/chi2samp.htm)
double k1 = 1;
double k2 = 1;
double chisqr_value = 0;
int dof = 0;
for (unsigned i = 0; i != distribution_a.size(); ++i) {
auto sum = distribution_a[i] + distribution_b[i];
if (sum > 5) {
dof++;
chisqr_value += std::pow(k1 * distribution_a[i] - k2 * distribution_b[i], 2) / sum;
}
}
dof--; // last category is fully determined by others
return chisqr(dof, chisqr_value);
}
double ks_critical_value(std::size_t number_of_samples, unsigned significance_level) {
if (number_of_samples <= 35)
throw std::runtime_error("Too few samples for KS critical value (<=35).");
switch (significance_level) {
case 10:
return 1.224 / std::sqrt(double(number_of_samples));
case 5:
return 1.358 / std::sqrt(double(number_of_samples));
case 1:
return 1.628 / sqrt(double(number_of_samples));
default:
throw std::runtime_error("Significance level must be 1, 5, or 10.");
}
}
double ks_uniformity_test(std::vector<double> samples) {
std::sort(samples.begin(), samples.end());
if (samples.front() < 0 || samples.back() > 1)
throw std::out_of_range("Cannot run K-S test, data out of range.");
double pvalue = 0;
double n = samples.size();
for (std::size_t i = 0; i != samples.size(); ++i) {
double temp = std::max(samples[i] - i / n, (i + 1) / n - samples[i]);
pvalue = std::max(pvalue, temp);
}
return pvalue;
}
} // namespace statistics
} // namespace core
Computing file changes ...