https://github.com/ntamas/plfit
plfit.c
/* vim:set ts=4 sw=4 sts=4 et: */
/* plfit.c
*
* Copyright (C) 2010-2011 Tamas Nepusz
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <stdio.h>
#include <float.h>
#include <math.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "plfit_error.h"
#include "gss.h"
#include "lbfgs.h"
#include "platform.h"
#include "plfit.h"
#include "kolmogorov.h"
#include "plfit_sampling.h"
#include "hzeta.h"
/* #define PLFIT_DEBUG */
#define DATA_POINTS_CHECK \
if (n <= 0) { \
PLFIT_ERROR("no data points", PLFIT_EINVAL); \
}
#define XMIN_CHECK_ZERO \
if (xmin <= 0) { \
PLFIT_ERROR("xmin must be greater than zero", PLFIT_EINVAL); \
}
#define XMIN_CHECK_ONE \
if (xmin < 1) { \
PLFIT_ERROR("xmin must be at least 1", PLFIT_EINVAL); \
}
static int plfit_i_resample_continuous(double* xs_head, size_t num_smaller,
size_t n, double alpha, double xmin, size_t num_samples, plfit_mt_rng_t* rng,
double* result);
static int plfit_i_resample_discrete(double* xs_head, size_t num_smaller,
size_t n, double alpha, double xmin, size_t num_samples, plfit_mt_rng_t* rng,
double* result);
static int double_comparator(const void *a, const void *b) {
const double *da = (const double*)a;
const double *db = (const double*)b;
return (*da > *db) - (*da < *db);
}
static int plfit_i_copy_and_sort(double* xs, size_t n, double** result) {
*result = (double*)malloc(sizeof(double) * n);
if (*result == 0) {
PLFIT_ERROR("cannot create sorted copy of input data", PLFIT_ENOMEM);
}
memcpy(*result, xs, sizeof(double) * n);
qsort(*result, n, sizeof(double), double_comparator);
return PLFIT_SUCCESS;
}
/**
* Given an unsorted array of doubles, counts how many elements there are that
* are smaller than a given value.
*
* \param begin pointer to the beginning of the array
* \param end pointer to the first element after the end of the array
* \param xmin the threshold value
*
* \return the nubmer of elements in the array that are smaller than the given
* value.
*/
static size_t count_smaller(double* begin, double* end, double xmin) {
double* p;
size_t counter = 0;
for (p = begin; p < end; p++) {
if (*p < xmin) {
counter++;
}
}
return counter;
}
/**
* Given an unsorted array of doubles, return another array that contains the
* elements that are smaller than a given value
*
* \param begin pointer to the beginning of the array
* \param end pointer to the first element after the end of the array
* \param xmin the threshold value
* \param result_length if not \c NULL, the number of unique elements in the
* given array is returned here
*
* \return pointer to the head of the new array or 0 if there is not enough
* memory
*/
static double* extract_smaller(double* begin, double* end, double xmin,
size_t* result_length) {
size_t counter = count_smaller(begin, end, xmin);
double *p, *result;
result = calloc(counter, sizeof(double));
if (result == 0)
return 0;
for (p = result; begin < end; begin++) {
if (*begin < xmin) {
*p = *begin;
p++;
}
}
if (result_length) {
*result_length = counter;
}
return result;
}
/**
* Given a sorted array of doubles, return another array that contains pointers
* into the array for the start of each block of identical elements.
*
* \param begin pointer to the beginning of the array
* \param end pointer to the first element after the end of the array
* \param result_length if not \c NULL, the number of unique elements in the
* given array is returned here. It is left unchanged if
* the function returns with an error.
*
* \return pointer to the head of the new array or 0 if there is not enough
* memory
*/
static double** unique_element_pointers(double* begin, double* end, size_t* result_length) {
double* ptr = begin;
double** result;
double prev_x;
size_t num_elts = 15;
size_t used_elts = 0;
/* Special case: empty array */
if (begin == end) {
result = calloc(1, sizeof(double*));
if (result != 0) {
result[0] = 0;
if (result_length != 0) {
*result_length = 0;
}
}
return result;
}
/* Allocate initial result array, including the guard element */
result = calloc(num_elts+1, sizeof(double*));
if (result == 0)
return 0;
prev_x = *begin;
result[used_elts++] = begin;
/* Process the input array */
for (ptr = begin+1; ptr < end; ptr++) {
if (*ptr == prev_x)
continue;
/* New block found */
if (used_elts >= num_elts) {
/* Array full; allocate a new chunk */
num_elts = num_elts*2 + 1;
result = realloc(result, sizeof(double*) * (num_elts+1));
if (result == 0)
return 0;
}
/* Store the new element */
result[used_elts++] = ptr;
prev_x = *ptr;
}
/* Calculate the result length */
if (result_length != 0) {
*result_length = used_elts;
}
/* Add the guard entry to the end of the result */
result[used_elts++] = 0;
return result;
}
static void plfit_i_perform_finite_size_correction(plfit_result_t* result, size_t n) {
result->alpha = result->alpha * (n-1) / n + 1.0 / n;
}
/********** Continuous power law distribution fitting **********/
static void plfit_i_logsum_less_than_continuous(double* begin, double* end,
double xmin, double* result, size_t* m) {
double logsum = 0.0;
size_t count = 0;
for (; begin != end; begin++) {
if (*begin >= xmin) {
count++;
logsum += log(*begin / xmin);
}
}
*m = count;
*result = logsum;
}
static double plfit_i_logsum_continuous(double* begin, double* end, double xmin) {
double logsum = 0.0;
for (; begin != end; begin++)
logsum += log(*begin / xmin);
return logsum;
}
static int plfit_i_estimate_alpha_continuous(double* xs, size_t n,
double xmin, double* alpha) {
double result;
size_t m;
XMIN_CHECK_ZERO;
plfit_i_logsum_less_than_continuous(xs, xs+n, xmin, &result, &m);
if (m == 0) {
PLFIT_ERROR("no data point was larger than xmin", PLFIT_EINVAL);
}
*alpha = 1 + m / result;
return PLFIT_SUCCESS;
}
static int plfit_i_estimate_alpha_continuous_sorted(double* xs, size_t n,
double xmin, double* alpha) {
double* end = xs+n;
XMIN_CHECK_ZERO;
for (; xs != end && *xs < xmin; xs++);
if (xs == end) {
PLFIT_ERROR("no data point was larger than xmin", PLFIT_EINVAL);
}
*alpha = 1 + (end-xs) / plfit_i_logsum_continuous(xs, end, xmin);
return PLFIT_SUCCESS;
}
static int plfit_i_ks_test_continuous(double* xs, double* xs_end,
const double alpha, const double xmin, double* D) {
/* Assumption: xs is sorted and cut off at xmin so the first element is
* always larger than or equal to xmin. */
double result = 0, n;
int m = 0;
n = xs_end - xs;
while (xs < xs_end) {
double d = fabs(1-pow(xmin / *xs, alpha-1) - m / n);
if (d > result)
result = d;
xs++; m++;
}
*D = result;
return PLFIT_SUCCESS;
}
static int plfit_i_calculate_p_value_continuous(double* xs, size_t n,
const plfit_continuous_options_t *options, plfit_bool_t xmin_fixed,
plfit_result_t *result) {
long int num_trials;
long int successes = 0;
double *xs_head;
size_t num_smaller;
plfit_continuous_options_t options_no_p_value = *options;
int retval = PLFIT_SUCCESS;
if (options->p_value_method == PLFIT_P_VALUE_SKIP) {
result->p = NAN;
return PLFIT_SUCCESS;
}
if (options->p_value_method == PLFIT_P_VALUE_APPROXIMATE) {
num_smaller = count_smaller(xs, xs + n, result->xmin);
result->p = plfit_ks_test_one_sample_p(result->D, n - num_smaller);
return PLFIT_SUCCESS;
}
options_no_p_value.p_value_method = PLFIT_P_VALUE_SKIP;
num_trials = (long int)(0.25 / options->p_value_precision / options->p_value_precision);
if (num_trials <= 0) {
PLFIT_ERROR("invalid p-value precision", PLFIT_EINVAL);
}
/* Extract the head of xs that contains elements smaller than xmin */
xs_head = extract_smaller(xs, xs+n, result->xmin, &num_smaller);
if (xs_head == 0)
PLFIT_ERROR("cannot calculate exact p-value", PLFIT_ENOMEM);
#ifdef _OPENMP
#pragma omp parallel
#endif
{
/* Parallel section starts here. If we are compiling using OpenMP, each
* thread will use its own RNG that is seeded from the master RNG. If
* we are compiling without OpenMP, there is only one thread and it uses
* the master RNG. This section must be critical to ensure that only one
* thread is using the master RNG at the same time. */
#ifdef _OPENMP
plfit_mt_rng_t private_rng;
#endif
plfit_mt_rng_t *p_rng;
double *ys;
long int i;
plfit_result_t result_synthetic;
#ifdef _OPENMP
#pragma omp critical
{
p_rng = &private_rng;
plfit_mt_init_from_rng(p_rng, options->rng);
}
#else
p_rng = options->rng;
#endif
/* Allocate memory to sample into */
ys = calloc(n, sizeof(double));
if (ys == 0) {
retval = PLFIT_ENOMEM;
} else {
/* The main for loop starts here. */
#ifdef _OPENMP
#pragma omp for reduction(+:successes)
#endif
for (i = 0; i < num_trials; i++) {
plfit_i_resample_continuous(xs_head, num_smaller, n, result->alpha,
result->xmin, n, p_rng, ys);
if (xmin_fixed) {
plfit_estimate_alpha_continuous(ys, n, result->xmin,
&options_no_p_value, &result_synthetic);
} else {
plfit_continuous(ys, n, &options_no_p_value, &result_synthetic);
}
if (result_synthetic.D > result->D)
successes++;
}
free(ys);
}
/* End of parallelized part */
}
free(xs_head);
if (retval == PLFIT_SUCCESS) {
result->p = successes / ((double)num_trials);
} else {
PLFIT_ERROR("cannot calculate exact p-value", retval);
}
return retval;
}
int plfit_log_likelihood_continuous(double* xs, size_t n, double alpha,
double xmin, double* L) {
double logsum, c;
size_t m;
if (alpha <= 1) {
PLFIT_ERROR("alpha must be greater than one", PLFIT_EINVAL);
}
XMIN_CHECK_ZERO;
c = (alpha - 1) / xmin;
plfit_i_logsum_less_than_continuous(xs, xs+n, xmin, &logsum, &m);
*L = -alpha * logsum + log(c) * m;
return PLFIT_SUCCESS;
}
int plfit_estimate_alpha_continuous_sorted(double* xs, size_t n, double xmin,
const plfit_continuous_options_t* options, plfit_result_t *result) {
double *begin, *end;
if (!options)
options = &plfit_continuous_default_options;
begin = xs;
end = xs + n;
while (begin < end && *begin < xmin)
begin++;
PLFIT_CHECK(plfit_i_estimate_alpha_continuous_sorted(begin, end-begin,
xmin, &result->alpha));
PLFIT_CHECK(plfit_i_ks_test_continuous(begin, end, result->alpha,
xmin, &result->D));
if (options->finite_size_correction)
plfit_i_perform_finite_size_correction(result, end-begin);
result->xmin = xmin;
PLFIT_CHECK(plfit_log_likelihood_continuous(begin, end-begin, result->alpha,
result->xmin, &result->L));
PLFIT_CHECK(plfit_i_calculate_p_value_continuous(xs, n, options, 1, result));
return PLFIT_SUCCESS;
}
int plfit_estimate_alpha_continuous(double* xs, size_t n, double xmin,
const plfit_continuous_options_t* options, plfit_result_t *result) {
double *xs_copy;
if (!options)
options = &plfit_continuous_default_options;
PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
PLFIT_CHECK(plfit_estimate_alpha_continuous_sorted(xs_copy, n, xmin,
options, result));
free(xs_copy);
return PLFIT_SUCCESS;
}
typedef struct {
double *begin; /**< Pointer to the beginning of the array holding the data */
double *end; /**< Pointer to after the end of the array holding the data */
double **probes; /**< Pointers to the elements of the array that will be probed */
size_t num_probes; /**< Number of probes */
plfit_result_t last; /**< Result of the last evaluation */
} plfit_continuous_xmin_opt_data_t;
static double plfit_i_continuous_xmin_opt_evaluate(void* instance, double x) {
plfit_continuous_xmin_opt_data_t* data = (plfit_continuous_xmin_opt_data_t*)instance;
double* begin = data->probes[(long int)x];
data->last.xmin = *begin;
#ifdef PLFIT_DEBUG
printf("Trying with probes[%ld] = %.4f\n", (long int)x, *begin);
#endif
plfit_i_estimate_alpha_continuous_sorted(begin, data->end-begin, *begin,
&data->last.alpha);
plfit_i_ks_test_continuous(begin, data->end, data->last.alpha, *begin,
&data->last.D);
return data->last.D;
}
static int plfit_i_continuous_xmin_opt_progress(void* instance, double x, double fx,
double min, double fmin, double left, double right, int k) {
#ifdef PLFIT_DEBUG
printf("Iteration #%d: [%.4f; %.4f), x=%.4f, fx=%.4f, min=%.4f, fmin=%.4f\n",
k, left, right, x, fx, min, fmin);
#endif
/* Continue only if `left' and `right' point to different integers */
return (int)left == (int)right;
}
static int plfit_i_continuous_xmin_opt_linear_scan(
plfit_continuous_xmin_opt_data_t* opt_data, plfit_result_t* best_result,
size_t* best_n) {
/* this must be signed because OpenMP with Windows MSVC needs signed for
* loop index variables. ssize_t will not work because that is a POSIX
* extension */
ptrdiff_t i = 0; /* initialize to work around incorrect warning issued by Clang 9.0 */
plfit_result_t global_best_result;
size_t global_best_n;
/* Prepare some variables */
global_best_n = 0;
global_best_result.D = DBL_MAX;
global_best_result.xmin = 0;
global_best_result.alpha = 0;
/* Due to the OpenMP parallelization, we do things as follows. Each
* OpenMP thread will search for the best D-score on its own and store
* the result in a private local_best_result variable. The end of the
* parallel block contains a critical section that threads will enter
* one by one and compare their private local_best_result with a
* global_best that is shared among the threads.
*/
#ifdef _OPENMP
#pragma omp parallel shared(global_best_result, global_best_n) private(i) firstprivate(opt_data)
#endif
{
/* These variables are private since they are declared within the
* parallel block */
plfit_result_t local_best_result;
plfit_continuous_xmin_opt_data_t local_opt_data = *opt_data;
size_t local_best_n;
/* Initialize the local_best_result and local_best_n variables */
local_best_n = 0;
local_best_result.D = DBL_MAX;
local_best_result.xmin = 0;
local_best_result.alpha = 0;
local_best_result.p = NAN;
local_best_result.L = NAN;
/* The range of the for loop below is divided among the threads.
* nowait means that there will be no implicit barrier at the end
* of the loop so threads that get there earlier can enter the
* critical section without waiting for the others */
#ifdef _OPENMP
#pragma omp for nowait schedule(dynamic,10)
#endif
for (i = 0; i < local_opt_data.num_probes-1; i++) {
plfit_i_continuous_xmin_opt_evaluate(&local_opt_data, i);
if (local_opt_data.last.D < local_best_result.D) {
#ifdef PLFIT_DEBUG
printf("Found new local best at %g with D=%g\n",
local_opt_data.last.xmin, local_opt_data.last.D);
#endif
local_best_result = local_opt_data.last;
local_best_n = local_opt_data.end - local_opt_data.probes[i];
}
}
/* Critical section that finds the global best result from the
* local ones collected by each thread */
#ifdef _OPENMP
#pragma omp critical
#endif
if (local_best_result.D < global_best_result.D) {
global_best_result = local_best_result;
global_best_n = local_best_n;
#ifdef PLFIT_DEBUG
printf("Found new global best at %g with D=%g\n", global_best_result.xmin,
global_best_result.D);
#endif
}
}
*best_result = global_best_result;
*best_n = global_best_n;
#ifdef PLFIT_DEBUG
printf("Returning global best: %g\n", best_result->xmin);
#endif
return PLFIT_SUCCESS;
}
int plfit_continuous(double* xs, size_t n, const plfit_continuous_options_t* options,
plfit_result_t* result) {
gss_parameter_t gss_param;
plfit_continuous_xmin_opt_data_t opt_data;
plfit_result_t best_result = {
/* alpha = */ NAN,
/* xmin = */ NAN,
/* L = */ NAN,
/* D = */ NAN,
/* p = */ NAN
};
int success;
size_t i, best_n, num_uniques = 0;
double x, *px, **uniques, **strata;
int error_code, retval = PLFIT_SUCCESS;
DATA_POINTS_CHECK;
/* Set up pointers that we will allocate */
opt_data.begin = NULL;
uniques = NULL;
strata = NULL;
/* Sane defaults */
best_n = n;
if (!options)
options = &plfit_continuous_default_options;
/* Make a copy of xs and sort it */
PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &opt_data.begin));
opt_data.end = opt_data.begin + n;
/* Create an array containing pointers to the unique elements of the input. From
* each block of unique elements, we add the pointer to the first one. */
uniques = unique_element_pointers(opt_data.begin, opt_data.end, &num_uniques);
if (uniques == 0) {
free(opt_data.begin);
PLFIT_ERROR("cannot fit continuous power-law", PLFIT_ENOMEM);
}
/* We will now determine the best xmin that yields the lowest D-score. The
* 'success' variable will denote whether the search procedure we tried was
* successful. If it is false after having exhausted all options, we fall
* back to a linear search. */
success = 0;
switch (options->xmin_method) {
case PLFIT_GSS_OR_LINEAR:
/* Try golden section search first. */
if (num_uniques > 5) {
opt_data.probes = uniques;
opt_data.num_probes = num_uniques;
gss_parameter_init(&gss_param);
success = (gss(0, opt_data.num_probes-5, &x, 0,
plfit_i_continuous_xmin_opt_evaluate,
plfit_i_continuous_xmin_opt_progress, &opt_data, &gss_param) == 0);
if (success) {
px = opt_data.probes[(int)x];
best_n = opt_data.end-px+1;
best_result = opt_data.last;
}
}
break;
case PLFIT_STRATIFIED_SAMPLING:
if (num_uniques >= 50) {
/* Try stratified sampling to narrow down the interval where the minimum
* is likely to reside. We check 10% of the unique items, distributed
* evenly, find the one with the lowest D-score, and then check the
* area around it more thoroughly. */
const size_t subdivision_length = 10;
size_t num_strata = num_uniques / subdivision_length;
double **strata = calloc(num_strata, sizeof(double*));
for (i = 0; i < num_strata; i++) {
strata[i] = uniques[i * subdivision_length];
}
opt_data.probes = strata;
opt_data.num_probes = num_strata;
error_code = plfit_i_continuous_xmin_opt_linear_scan(&opt_data, &best_result, &best_n);
if (error_code != PLFIT_SUCCESS) {
retval = error_code;
goto cleanup;
}
opt_data.num_probes = 0;
for (i = 0; i < num_strata; i++) {
if (*strata[i] == best_result.xmin) {
/* Okay, scan more thoroughly from strata[i-1] to strata[i+1],
* which is from uniques[(i-1)*subdivision_length] to
* uniques[(i+1)*subdivision_length */
opt_data.probes = uniques + (i > 0 ? (i-1)*subdivision_length : 0);
opt_data.num_probes = 0;
if (i != 0)
opt_data.num_probes += subdivision_length;
if (i != num_strata-1)
opt_data.num_probes += subdivision_length;
break;
}
}
free(strata); strata = NULL;
if (opt_data.num_probes > 0) {
/* Do a strict linear scan in the subrange determined above */
error_code = plfit_i_continuous_xmin_opt_linear_scan(
&opt_data, &best_result, &best_n
);
if (error_code) {
retval = error_code;
goto cleanup;
}
success = 1;
} else {
/* This should not happen, but we handle it anyway */
success = 0;
}
}
break;
default:
/* Just use the linear search */
break;
}
if (!success) {
/* More advanced search methods failed or were skipped; try linear search */
opt_data.probes = uniques;
opt_data.num_probes = num_uniques;
error_code = plfit_i_continuous_xmin_opt_linear_scan(&opt_data, &best_result, &best_n);
if (error_code) {
retval = error_code;
goto cleanup;
}
success = 1;
}
/* Get rid of the uniques array, we don't need it any more */
free(uniques); uniques = NULL;
/* Sort out the result */
*result = best_result;
if (options->finite_size_correction)
plfit_i_perform_finite_size_correction(result, best_n);
error_code = plfit_log_likelihood_continuous(
opt_data.begin + n - best_n, best_n, result->alpha, result->xmin,
&result->L
);
if (error_code) {
retval = error_code;
goto cleanup;
}
error_code = plfit_i_calculate_p_value_continuous(opt_data.begin, n, options, 0, result);
if (error_code) {
retval = error_code;
goto cleanup;
}
cleanup:
if (strata) {
free(strata);
}
if (uniques) {
free(uniques);
}
free(opt_data.begin);
return retval;
}
/********** Discrete power law distribution fitting **********/
typedef struct {
size_t m;
double logsum;
double xmin;
} plfit_i_estimate_alpha_discrete_data_t;
static double plfit_i_logsum_discrete(double* begin, double* end, double xmin) {
double logsum = 0.0;
for (; begin != end; begin++)
logsum += log(*begin);
return logsum;
}
static void plfit_i_logsum_less_than_discrete(double* begin, double* end, double xmin,
double* logsum, size_t* m) {
double result = 0.0;
size_t count = 0;
for (; begin != end; begin++) {
if (*begin < xmin)
continue;
result += log(*begin);
count++;
}
*logsum = result;
*m = count;
}
static lbfgsfloatval_t plfit_i_estimate_alpha_discrete_lbfgs_evaluate(
void* instance, const lbfgsfloatval_t* x,
lbfgsfloatval_t* g, const int n,
const lbfgsfloatval_t step) {
plfit_i_estimate_alpha_discrete_data_t* data;
lbfgsfloatval_t result;
double dx = step;
double huge = 1e10; /* pseudo-infinity; apparently DBL_MAX does not work */
double lnhzeta_x=NAN;
double lnhzeta_deriv_x=NAN;
data = (plfit_i_estimate_alpha_discrete_data_t*)instance;
#ifdef PLFIT_DEBUG
printf("- Evaluating at %.4f (step = %.4f, xmin = %.4f)\n", *x, step, data->xmin);
#endif
if (isnan(*x)) {
g[0] = huge;
return huge;
}
/* Find the delta X value to estimate the gradient */
if (dx > 0.001 || dx == 0)
dx = 0.001;
else if (dx < -0.001)
dx = -0.001;
/* Is x[0] in its valid range? */
if (x[0] <= 1.0) {
/* The Hurwitz zeta function is infinite in this case */
g[0] = (dx > 0) ? -huge : huge;
return huge;
}
if (x[0] + dx <= 1.0) {
g[0] = huge;
result = x[0] * data->logsum + data->m * hsl_sf_lnhzeta(x[0], data->xmin);
} else {
hsl_sf_lnhzeta_deriv_tuple(x[0], data->xmin, &lnhzeta_x, &lnhzeta_deriv_x);
g[0] = data->logsum + data->m * lnhzeta_deriv_x;
result = x[0] * data->logsum + data->m * lnhzeta_x;
}
#ifdef PLFIT_DEBUG
printf(" - Gradient: %.4f\n", g[0]);
printf(" - Result: %.4f\n", result);
#endif
return result;
}
static int plfit_i_estimate_alpha_discrete_lbfgs_progress(void* instance,
const lbfgsfloatval_t* x, const lbfgsfloatval_t* g,
const lbfgsfloatval_t fx, const lbfgsfloatval_t xnorm,
const lbfgsfloatval_t gnorm, const lbfgsfloatval_t step,
int n, int k, int ls) {
return 0;
}
static int plfit_i_estimate_alpha_discrete_linear_scan(double* xs, size_t n,
double xmin, double* alpha, const plfit_discrete_options_t* options,
plfit_bool_t sorted) {
double curr_alpha, best_alpha, L, L_max;
double logsum;
size_t m;
XMIN_CHECK_ONE;
if (options->alpha.min <= 1.0) {
PLFIT_ERROR("alpha.min must be greater than 1.0", PLFIT_EINVAL);
}
if (options->alpha.max < options->alpha.min) {
PLFIT_ERROR("alpha.max must be greater than alpha.min", PLFIT_EINVAL);
}
if (options->alpha.step <= 0) {
PLFIT_ERROR("alpha.step must be positive", PLFIT_EINVAL);
}
if (sorted) {
logsum = plfit_i_logsum_discrete(xs, xs+n, xmin);
m = n;
} else {
plfit_i_logsum_less_than_discrete(xs, xs+n, xmin, &logsum, &m);
}
best_alpha = options->alpha.min; L_max = -DBL_MAX;
for (curr_alpha = options->alpha.min; curr_alpha <= options->alpha.max;
curr_alpha += options->alpha.step) {
L = -curr_alpha * logsum - m * hsl_sf_lnhzeta(curr_alpha, xmin);
if (L > L_max) {
L_max = L;
best_alpha = curr_alpha;
}
}
*alpha = best_alpha;
return PLFIT_SUCCESS;
}
static int plfit_i_estimate_alpha_discrete_lbfgs(double* xs, size_t n, double xmin,
double* alpha, const plfit_discrete_options_t* options, plfit_bool_t sorted) {
lbfgs_parameter_t param;
lbfgsfloatval_t* variables;
plfit_i_estimate_alpha_discrete_data_t data;
int ret;
XMIN_CHECK_ONE;
/* Initialize algorithm parameters */
lbfgs_parameter_init(¶m);
param.max_iterations = 0; /* proceed until infinity */
/* Set up context for optimization */
data.xmin = xmin;
if (sorted) {
data.logsum = plfit_i_logsum_discrete(xs, xs+n, xmin);
data.m = n;
} else {
plfit_i_logsum_less_than_discrete(xs, xs+n, xmin, &data.logsum, &data.m);
}
/* Allocate space for the single alpha variable */
variables = lbfgs_malloc(1);
variables[0] = 3.0; /* initial guess */
/* Optimization */
ret = lbfgs(1, variables, /* ptr_fx = */ 0,
plfit_i_estimate_alpha_discrete_lbfgs_evaluate,
plfit_i_estimate_alpha_discrete_lbfgs_progress,
&data, ¶m);
if (ret < 0 &&
ret != LBFGSERR_ROUNDING_ERROR &&
ret != LBFGSERR_MAXIMUMLINESEARCH &&
ret != LBFGSERR_MINIMUMSTEP &&
ret != LBFGSERR_CANCELED) {
char buf[4096];
snprintf(buf, 4096, "L-BFGS optimization signaled an error (error code = %d)", ret);
lbfgs_free(variables);
PLFIT_ERROR(buf, PLFIT_FAILURE);
}
*alpha = variables[0];
/* Deallocate the variable array */
lbfgs_free(variables);
return PLFIT_SUCCESS;
}
static int plfit_i_estimate_alpha_discrete_fast(double* xs, size_t n, double xmin,
double* alpha, const plfit_discrete_options_t* options, plfit_bool_t sorted) {
plfit_continuous_options_t cont_options;
if (!options)
options = &plfit_discrete_default_options;
plfit_continuous_options_init(&cont_options);
cont_options.finite_size_correction = options->finite_size_correction;
XMIN_CHECK_ONE;
if (sorted) {
return plfit_i_estimate_alpha_continuous_sorted(xs, n, xmin-0.5, alpha);
} else {
return plfit_i_estimate_alpha_continuous(xs, n, xmin-0.5, alpha);
}
}
static int plfit_i_estimate_alpha_discrete(double* xs, size_t n, double xmin,
double* alpha, const plfit_discrete_options_t* options,
plfit_bool_t sorted) {
switch (options->alpha_method) {
case PLFIT_LBFGS:
PLFIT_CHECK(plfit_i_estimate_alpha_discrete_lbfgs(xs, n, xmin, alpha,
options, sorted));
break;
case PLFIT_LINEAR_SCAN:
PLFIT_CHECK(plfit_i_estimate_alpha_discrete_linear_scan(xs, n, xmin,
alpha, options, sorted));
break;
case PLFIT_PRETEND_CONTINUOUS:
PLFIT_CHECK(plfit_i_estimate_alpha_discrete_fast(xs, n, xmin,
alpha, options, sorted));
break;
default:
PLFIT_ERROR("unknown optimization method specified", PLFIT_EINVAL);
}
return PLFIT_SUCCESS;
}
static int plfit_i_ks_test_discrete(double* xs, double* xs_end, const double alpha,
const double xmin, double* D) {
/* Assumption: xs is sorted and cut off at xmin so the first element is
* always larger than or equal to xmin. */
double result = 0, n, lnhzeta, x;
int m = 0;
n = xs_end - xs;
lnhzeta = hsl_sf_lnhzeta(alpha, xmin);
while (xs < xs_end) {
double d;
x = *xs;
/* Re the next line: this used to be the following:
*
* fabs( 1 - hzeta(alpha, x) / hzeta(alpha, xmin) - m / n)
*
* However, using the Hurwitz zeta directly sometimes yields
* underflows (see Github pull request #17 and related issues).
* hzeta(alpha, x) / hzeta(alpha, xmin) can be replaced with
* exp(lnhzeta(alpha, x) - lnhzeta(alpha, xmin)), but then
* we have 1 - exp(something), which is better to calculate
* with a dedicated expm1() function.
*/
d = fabs( expm1( hsl_sf_lnhzeta(alpha, x) - lnhzeta ) + m / n);
if (d > result)
result = d;
do {
xs++; m++;
} while (xs < xs_end && *xs == x);
}
*D = result;
return PLFIT_SUCCESS;
}
static int plfit_i_calculate_p_value_discrete(double* xs, size_t n,
const plfit_discrete_options_t* options, plfit_bool_t xmin_fixed,
plfit_result_t *result) {
long int num_trials;
long int successes = 0;
double *xs_head;
size_t num_smaller;
plfit_discrete_options_t options_no_p_value = *options;
int retval = PLFIT_SUCCESS;
if (options->p_value_method == PLFIT_P_VALUE_SKIP) {
/* skipping p-value calculation */
result->p = NAN;
return PLFIT_SUCCESS;
}
if (options->p_value_method == PLFIT_P_VALUE_APPROXIMATE) {
/* p-value approximation; most likely an upper bound */
num_smaller = count_smaller(xs, xs + n, result->xmin);
result->p = plfit_ks_test_one_sample_p(result->D, n - num_smaller);
return PLFIT_SUCCESS;
}
options_no_p_value.p_value_method = PLFIT_P_VALUE_SKIP;
num_trials = (long int)(0.25 / options->p_value_precision / options->p_value_precision);
if (num_trials <= 0) {
PLFIT_ERROR("invalid p-value precision", PLFIT_EINVAL);
}
/* Extract the head of xs that contains elements smaller than xmin */
xs_head = extract_smaller(xs, xs+n, result->xmin, &num_smaller);
if (xs_head == 0)
PLFIT_ERROR("cannot calculate exact p-value", PLFIT_ENOMEM);
#ifdef _OPENMP
#pragma omp parallel
#endif
{
/* Parallel section starts here. If we are compiling using OpenMP, each
* thread will use its own RNG that is seeded from the master RNG. If
* we are compiling without OpenMP, there is only one thread and it uses
* the master RNG. This section must be critical to ensure that only one
* thread is using the master RNG at the same time. */
#ifdef _OPENMP
plfit_mt_rng_t private_rng;
#endif
plfit_mt_rng_t *p_rng;
double *ys;
long int i;
plfit_result_t result_synthetic;
#ifdef _OPENMP
#pragma omp critical
{
p_rng = &private_rng;
plfit_mt_init_from_rng(p_rng, options->rng);
}
#else
p_rng = options->rng;
#endif
/* Allocate memory to sample into */
ys = calloc(n, sizeof(double));
if (ys == 0) {
retval = PLFIT_ENOMEM;
} else {
/* The main for loop starts here. */
#ifdef _OPENMP
#pragma omp for reduction(+:successes)
#endif
for (i = 0; i < num_trials; i++) {
plfit_i_resample_discrete(xs_head, num_smaller, n, result->alpha,
result->xmin, n, p_rng, ys);
if (xmin_fixed) {
plfit_estimate_alpha_discrete(ys, n, result->xmin,
&options_no_p_value, &result_synthetic);
} else {
plfit_discrete(ys, n, &options_no_p_value, &result_synthetic);
}
if (result_synthetic.D > result->D)
successes++;
}
free(ys);
}
/* End of parallelized part */
}
free(xs_head);
if (retval == PLFIT_SUCCESS) {
result->p = successes / ((double)num_trials);
} else {
PLFIT_ERROR("cannot calculate exact p-value", retval);
}
return retval;
}
int plfit_log_likelihood_discrete(double* xs, size_t n, double alpha, double xmin, double* L) {
double result;
size_t m;
if (alpha <= 1) {
PLFIT_ERROR("alpha must be greater than one", PLFIT_EINVAL);
}
XMIN_CHECK_ONE;
plfit_i_logsum_less_than_discrete(xs, xs+n, xmin, &result, &m);
result = - alpha * result - m * hsl_sf_lnhzeta(alpha, xmin);
*L = result;
return PLFIT_SUCCESS;
}
int plfit_estimate_alpha_discrete(double* xs, size_t n, double xmin,
const plfit_discrete_options_t* options, plfit_result_t *result) {
double *xs_copy, *begin, *end;
if (!options)
options = &plfit_discrete_default_options;
/* Check the validity of the input parameters */
DATA_POINTS_CHECK;
if (options->alpha_method == PLFIT_LINEAR_SCAN) {
if (options->alpha.min <= 1.0) {
PLFIT_ERROR("alpha.min must be greater than 1.0", PLFIT_EINVAL);
}
if (options->alpha.max < options->alpha.min) {
PLFIT_ERROR("alpha.max must be greater than alpha.min", PLFIT_EINVAL);
}
if (options->alpha.step <= 0) {
PLFIT_ERROR("alpha.step must be positive", PLFIT_EINVAL);
}
}
PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
begin = xs_copy; end = xs_copy + n;
while (begin < end && *begin < xmin)
begin++;
PLFIT_CHECK(plfit_i_estimate_alpha_discrete(begin, end-begin, xmin, &result->alpha,
options, /* sorted = */ 1));
PLFIT_CHECK(plfit_i_ks_test_discrete(begin, end, result->alpha, xmin, &result->D));
result->xmin = xmin;
if (options->finite_size_correction)
plfit_i_perform_finite_size_correction(result, end-begin);
PLFIT_CHECK(plfit_log_likelihood_discrete(begin, end-begin, result->alpha,
result->xmin, &result->L));
PLFIT_CHECK(plfit_i_calculate_p_value_discrete(xs, n, options, 1, result));
free(xs_copy);
return PLFIT_SUCCESS;
}
int plfit_discrete(double* xs, size_t n, const plfit_discrete_options_t* options,
plfit_result_t* result) {
double curr_D, curr_alpha;
plfit_result_t best_result;
double *xs_copy, *px, *end, *end_xmin, prev_x;
size_t best_n;
int m;
if (!options)
options = &plfit_discrete_default_options;
/* Check the validity of the input parameters */
DATA_POINTS_CHECK;
if (options->alpha_method == PLFIT_LINEAR_SCAN) {
if (options->alpha.min <= 1.0) {
PLFIT_ERROR("alpha.min must be greater than 1.0", PLFIT_EINVAL);
}
if (options->alpha.max < options->alpha.min) {
PLFIT_ERROR("alpha.max must be greater than alpha.min", PLFIT_EINVAL);
}
if (options->alpha.step <= 0) {
PLFIT_ERROR("alpha.step must be positive", PLFIT_EINVAL);
}
}
PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
best_result.D = DBL_MAX;
best_result.xmin = 1;
best_result.alpha = 1;
best_n = 0;
/* Skip initial values from xs_copy until we get to a positive element or
* until we reach the end of the array */
px = xs_copy; end = px + n; end_xmin = end - 1;
while (px < end && *px < 1) {
px++;
}
/* Make sure there are at least three distinct values if possible */
m = px - xs_copy;
prev_x = *end_xmin;
while (end_xmin > px && *end_xmin == prev_x) {
end_xmin--;
}
prev_x = *end_xmin;
while (end_xmin > px && *end_xmin == prev_x) {
end_xmin--;
}
prev_x = 0;
end_xmin++;
while (px < end_xmin) {
while (px < end_xmin && *px == prev_x) {
px++; m++;
}
PLFIT_CHECK(
plfit_i_estimate_alpha_discrete(
px, n-m, *px, &curr_alpha, options, /* sorted = */ 1
)
);
PLFIT_CHECK(
plfit_i_ks_test_discrete(px, end, curr_alpha, *px, &curr_D)
);
if (curr_D < best_result.D) {
best_result.alpha = curr_alpha;
best_result.xmin = *px;
best_result.D = curr_D;
best_n = n-m;
}
prev_x = *px;
px++; m++;
}
*result = best_result;
if (options->finite_size_correction)
plfit_i_perform_finite_size_correction(result, best_n);
PLFIT_CHECK(plfit_log_likelihood_discrete(xs_copy + n - best_n, best_n,
result->alpha, result->xmin, &result->L));
PLFIT_CHECK(plfit_i_calculate_p_value_discrete(xs_copy, n, options, 0, result));
free(xs_copy);
return PLFIT_SUCCESS;
}
/***** resampling routines to generate synthetic replicates ****/
static int plfit_i_resample_continuous(double* xs_head, size_t num_smaller,
size_t n, double alpha, double xmin, size_t num_samples, plfit_mt_rng_t* rng,
double* result)
{
size_t num_orig_samples, i;
/* Calculate how many samples have to be drawn from xs_head */
num_orig_samples = (size_t) plfit_rbinom(num_samples, num_smaller / (double)n, rng);
/* Draw the samples from xs_head */
for (i = 0; i < num_orig_samples; i++, result++) {
*result = xs_head[(size_t)plfit_runif(0, num_smaller, rng)];
}
/* Draw the remaining samples from the fitted distribution */
PLFIT_CHECK(plfit_rpareto_array(xmin, alpha-1, num_samples-num_orig_samples, rng,
result));
return PLFIT_SUCCESS;
}
int plfit_resample_continuous(double* xs, size_t n, double alpha, double xmin,
size_t num_samples, plfit_mt_rng_t* rng, double* result) {
double *xs_head;
size_t num_smaller = 0;
int retval;
/* Extract the head of xs that contains elements smaller than xmin */
xs_head = extract_smaller(xs, xs+n, xmin, &num_smaller);
if (xs_head == 0)
PLFIT_ERROR("cannot resample continuous dataset", PLFIT_ENOMEM);
retval = plfit_i_resample_continuous(xs_head, num_smaller, n, alpha, xmin,
num_samples, rng, result);
/* Free xs_head; we don't need it any more */
free(xs_head);
return retval;
}
static int plfit_i_resample_discrete(double* xs_head, size_t num_smaller, size_t n,
double alpha, double xmin, size_t num_samples, plfit_mt_rng_t* rng,
double* result)
{
size_t num_orig_samples, i;
/* Calculate how many samples have to be drawn from xs_head */
num_orig_samples = (size_t) plfit_rbinom(num_samples, num_smaller / (double)n, rng);
/* Draw the samples from xs_head */
for (i = 0; i < num_orig_samples; i++, result++) {
*result = xs_head[(size_t)plfit_runif(0, num_smaller, rng)];
}
/* Draw the remaining samples from the fitted distribution */
PLFIT_CHECK(plfit_rzeta_array((long int)xmin, alpha,
num_samples-num_orig_samples, rng, result));
return PLFIT_SUCCESS;
}
int plfit_resample_discrete(double* xs, size_t n, double alpha, double xmin,
size_t num_samples, plfit_mt_rng_t* rng, double* result) {
double *xs_head;
size_t num_smaller = 0;
int retval;
/* Extract the head of xs that contains elements smaller than xmin */
xs_head = extract_smaller(xs, xs+n, xmin, &num_smaller);
if (xs_head == 0)
PLFIT_ERROR("cannot resample discrete dataset", PLFIT_ENOMEM);
retval = plfit_i_resample_discrete(xs_head, num_smaller, n, alpha, xmin,
num_samples, rng, result);
/* Free xs_head; we don't need it any more */
free(xs_head);
return retval;
}
/******** calculating the p-value of a fitted model only *******/
int plfit_calculate_p_value_continuous(double* xs, size_t n,
const plfit_continuous_options_t* options, plfit_bool_t xmin_fixed,
plfit_result_t *result) {
double* xs_copy;
PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
PLFIT_CHECK(plfit_i_calculate_p_value_continuous(xs_copy, n, options,
xmin_fixed, result));
free(xs_copy);
return PLFIT_SUCCESS;
}
int plfit_calculate_p_value_discrete(double* xs, size_t n,
const plfit_discrete_options_t* options, plfit_bool_t xmin_fixed,
plfit_result_t *result) {
double* xs_copy;
PLFIT_CHECK(plfit_i_copy_and_sort(xs, n, &xs_copy));
PLFIT_CHECK(plfit_i_calculate_p_value_discrete(xs_copy, n, options,
xmin_fixed, result));
free(xs_copy);
return PLFIT_SUCCESS;
}