Content - 11044948fc8e9dbcb299953ee35e6e501b46697e - a02892e/src/sampling.c

visit type:
https://github.com/ntamas/plfit

03 April 2024, 17:57:03 UTC
Tip revision: 651adf2bdb8d6b468f16c930898b8a5c2597a253 authored by Tamas Nepusz on 12 March 2024, 13:46:47 UTC
chore: bumped version to 0.9.6
Tip revision: 651adf2
sampling.c
/* sampling.c
 *
 * Copyright (C) 2012 Tamas Nepusz
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <limits.h>
#include <math.h>

#include "plfit_error.h"
#include "plfit_sampling.h"
#include "platform.h"

inline double plfit_runif(double lo, double hi, plfit_mt_rng_t* rng) {
    if (rng == 0) {
        return lo + rand() / ((double)RAND_MAX) * (hi-lo);
    }
    return lo + plfit_mt_uniform_01(rng) * (hi-lo);
}

inline double plfit_runif_01(plfit_mt_rng_t* rng) {
    if (rng == 0) {
        return rand() / ((double)RAND_MAX);
    }
    return plfit_mt_uniform_01(rng);
}

inline double plfit_rpareto(double xmin, double alpha, plfit_mt_rng_t* rng) {
    if (alpha <= 0 || xmin <= 0)
        return NAN;

    /* 1-u is used in the base here because we want to avoid the case of
     * sampling zero */
    return pow(1-plfit_runif_01(rng), -1.0 / alpha) * xmin;
}

int plfit_rpareto_array(double xmin, double alpha, size_t n, plfit_mt_rng_t* rng,
        double* result) {
    double gamma;

    if (alpha <= 0 || xmin <= 0)
        return PLFIT_EINVAL;

    if (result == 0 || n == 0)
        return PLFIT_SUCCESS;

    gamma = -1.0 / alpha;
    while (n > 0) {
        /* 1-u is used in the base here because we want to avoid the case of
         * sampling zero */
        *result = pow(1-plfit_runif_01(rng), gamma) * xmin;
        result++; n--;
    }

    return PLFIT_SUCCESS;
}

inline double plfit_rzeta(long int xmin, double alpha, plfit_mt_rng_t* rng) {
    double u, v, t;
    long int x;
    double alpha_minus_1 = alpha-1;
    double minus_1_over_alpha_minus_1 = -1.0 / (alpha-1);
    double b;
    double one_over_b_minus_1;

    if (alpha <= 0 || xmin < 1)
        return NAN;

    xmin = (long int) round(xmin);

    /* Rejection sampling for the win. We use Y=floor(U^{-1/alpha} * xmin) as the
     * envelope distribution, similarly to Chapter X.6 of Luc Devroye's book
     * (where xmin is assumed to be 1): http://luc.devroye.org/chapter_ten.pdf
     *
     * Some notes that should help me recover what I was doing:
     *
     * p_i = 1/zeta(alpha, xmin) * i^-alpha
     * q_i = (xmin/i)^{alpha-1} - (xmin/(i+1))^{alpha-1}
     *     = (i/xmin)^{1-alpha} - ((i+1)/xmin)^{1-alpha}
     *     = [i^{1-alpha} - (i+1)^{1-alpha}] / xmin^{1-alpha}
     *
     * p_i / q_i attains its maximum at xmin=i, so the rejection constant is:
     *
     * c = p_xmin / q_xmin
     *
     * We have to accept the sample if V <= (p_i / q_i) * (q_xmin / p_xmin) =
     * (i/xmin)^-alpha * [xmin^{1-alpha} - (xmin+1)^{1-alpha}] / [i^{1-alpha} - (i+1)^{1-alpha}] =
     * [xmin - xmin^alpha / (xmin+1)^{alpha-1}] / [i - i^alpha / (i+1)^{alpha-1}] =
     * xmin/i * [1-(xmin/(xmin+1))^{alpha-1}]/[1-(i/(i+1))^{alpha-1}]
     *
     * In other words (and substituting i with X, which is the same),
     *
     * V * (X/xmin) <= [1 - (1+1/xmin)^{1-alpha}] / [1 - (1+1/i)^{1-alpha}]
     *
     * Let b := (1+1/xmin)^{alpha-1} and let T := (1+1/i)^{alpha-1}. Then:
     *
     * V * (X/xmin) <= [(b-1)/b] / [(T-1)/T]
     * V * (X/xmin) * (T-1) / (b-1) <= T / b
     *
     * which is the same as in Devroye's book, except for the X/xmin term, and
     * the definition of b.
     */
    b = pow(1 + 1.0/xmin, alpha_minus_1);
    one_over_b_minus_1 = 1.0/(b-1);
    do {
        do {
            u = plfit_runif_01(rng);
            v = plfit_runif_01(rng);
            /* 1-u is used in the base here because we want to avoid the case of
             * having zero in x */
            x = (long int) floor(pow(1-u, minus_1_over_alpha_minus_1) * xmin);
        } while (x < xmin);
        t = pow((x+1.0)/x, alpha_minus_1);
    } while (v*x*(t-1)*one_over_b_minus_1*b > t*xmin);

    return x;
}

int plfit_rzeta_array(long int xmin, double alpha, size_t n, plfit_mt_rng_t* rng,
        double* result) {
    double u, v, t;
    long int x;
    double alpha_minus_1 = alpha-1;
    double minus_1_over_alpha_minus_1 = -1.0 / (alpha-1);
    double b, one_over_b_minus_1;

    if (alpha <= 0 || xmin < 1)
        return PLFIT_EINVAL;

    if (result == 0 || n == 0)
        return PLFIT_SUCCESS;

    /* See the comments in plfit_rzeta for an explanation of the algorithm
     * below. */
    xmin = (long int) round(xmin);
    b = pow(1 + 1.0/xmin, alpha_minus_1);
    one_over_b_minus_1 = 1.0/(b-1);

    while (n > 0) {
        do {
            do {
                u = plfit_runif_01(rng);
                v = plfit_runif_01(rng);
                /* 1-u is used in the base here because we want to avoid the case of
                 * having zero in x */
                x = (long int) floor(pow(1-u, minus_1_over_alpha_minus_1) * xmin);
            } while (x < xmin);     /* handles overflow as well */
            t = pow((x+1.0)/x, alpha_minus_1);
        } while (v*x*(t-1)*one_over_b_minus_1*b > t*xmin);
        *result = x;
        if (x < 0) return PLFIT_EINVAL;
        result++; n--;
    }

    return PLFIT_SUCCESS;
}

int plfit_walker_alias_sampler_init(plfit_walker_alias_sampler_t* sampler,
        double* ps, size_t n) {
    double *p, *p2, *ps_end;
    double sum;
    long int *short_sticks, *long_sticks;
    long int num_short_sticks, num_long_sticks;
    long int i;

    if (n > LONG_MAX) {
        return PLFIT_EINVAL;
    }

    sampler->num_bins = (long int) n;

    ps_end = ps + n;

    /* Initialize indexes and probs */
    sampler->indexes = (long int*)calloc(n > 0 ? n : 1, sizeof(long int));
    if (sampler->indexes == NULL) {
        return PLFIT_ENOMEM;
    }
    sampler->probs   = (double*)calloc(n > 0 ? n : 1, sizeof(double));
    if (sampler->probs == NULL) {
        free(sampler->indexes);
        return PLFIT_ENOMEM;
    }

    /* Normalize the probability vector; count how many short and long sticks
     * are there initially */
    for (sum = 0.0, p = ps; p != ps_end; p++) {
        sum += *p;
    }
    sum = n / sum;

    num_short_sticks = num_long_sticks = 0;
    for (p = ps, p2 = sampler->probs; p != ps_end; p++, p2++) {
        *p2 = *p * sum;
        if (*p2 < 1) {
            num_short_sticks++;
        } else if (*p2 > 1) {
            num_long_sticks++;
        }
    }

    /* Allocate space for short & long stick indexes */
    long_sticks = (long int*)calloc(num_long_sticks > 0 ? num_long_sticks : 1, sizeof(long int));
    if (long_sticks == NULL) {
        free(sampler->probs);
        free(sampler->indexes);
        return PLFIT_ENOMEM;
    }
    short_sticks = (long int*)calloc(num_short_sticks > 0 ? num_short_sticks : 1, sizeof(long int));
    if (short_sticks == NULL) {
        free(sampler->probs);
        free(sampler->indexes);
        free(long_sticks);
        return PLFIT_ENOMEM;
    }

    /* Initialize short_sticks and long_sticks */
    num_short_sticks = num_long_sticks = 0;
    for (i = 0, p = sampler->probs; i < n; i++, p++) {
        if (*p < 1) {
            short_sticks[num_short_sticks++] = i;
        } else if (*p > 1) {
            long_sticks[num_long_sticks++] = i;
        }
    }

    /* Prepare the index table */
    while (num_short_sticks && num_long_sticks) {
        long int short_index, long_index;
        short_index = short_sticks[--num_short_sticks];
        long_index = long_sticks[num_long_sticks-1];
        sampler->indexes[short_index] = long_index;
        sampler->probs[long_index] =     /* numerical stability */
            (sampler->probs[long_index] + sampler->probs[short_index]) - 1;
        if (sampler->probs[long_index] < 1) {
            short_sticks[num_short_sticks++] = long_index;
            num_long_sticks--;
        }
    }

    /* Fix numerical stability issues */
    while (num_long_sticks) {
        i = long_sticks[--num_long_sticks];
        sampler->probs[i] = 1;
    }
    while (num_short_sticks) {
        i = short_sticks[--num_short_sticks];
        sampler->probs[i] = 1;
    }

    free(short_sticks);
    free(long_sticks);

    return PLFIT_SUCCESS;
}


void plfit_walker_alias_sampler_destroy(plfit_walker_alias_sampler_t* sampler) {
    if (sampler->indexes) {
        free(sampler->indexes);
        sampler->indexes = 0;
    }
    if (sampler->probs) {
        free(sampler->probs);
        sampler->probs = 0;
    }
}


int plfit_walker_alias_sampler_sample(const plfit_walker_alias_sampler_t* sampler,
        long int *xs, size_t n, plfit_mt_rng_t* rng) {
    double u;
    long int j;
    long int *x;

    x = xs;

    if (rng == 0) {
        /* Using built-in RNG */
        while (n > 0) {
            u = rand() / ((double)RAND_MAX);
            j = rand() % sampler->num_bins;
            *x = (u < sampler->probs[j]) ? j : sampler->indexes[j];
            n--; x++;
        }
    } else {
        /* Using Mersenne Twister */
        while (n > 0) {
            u = plfit_mt_uniform_01(rng);
            j = plfit_mt_random(rng) % sampler->num_bins;
            *x = (u < sampler->probs[j]) ? j : sampler->indexes[j];
            n--; x++;
        }
    }

    return PLFIT_SUCCESS;
}