#include "cado.h" // IWYU pragma: keep

#include <cinttypes>              // for PRId64, PRIu32
#include <climits>                // for UCHAR_MAX
#include <cstdlib>                // for free, malloc, abs
#include <cstring>                // for memset, size_t, NULL
#include <algorithm>              // for min, max
#include <cmath>                  // for fabs, log2, sqrt, pow, trunc, ceil
#include <cstdint>                // for int64_t, uint32_t
#include <cstdarg>             // IWYU pragma: keep
#include <iomanip>                // for operator<<, setprecision
#include <list>                   // for _List_const_iterator, list
#include <sstream>                // IWYU pragma: keep
#include <utility>                // for swap, pair
#include <gmp.h> // IWYU pragma: keep // for gmp_vfprintf, mpz_srcptr, ...
#include "cxx_mpz.hpp"
#include "las-norms.hpp"
#include "fb-types.h"             // for sublat_t
#include "las-config.h"           // for LOG_BUCKET_REGION, LOGNORM_GUARD_BITS
#include "las-siever-config.hpp"  // for siever_config::side_config, siever_...
#include "las-todo-entry.hpp"     // for las_todo_entry
#include "rho.h"        // dickman_rho_local
#include "verbose.h"    // verbose_output_print
#include "macros.h"

using namespace std;

/***********************************************************************/
/* {{{ some utility stuff */
/****************************************************************************
 * Tricky arithmetic functions to compute some values around ~log2.
 * Log2(n) is computed by the mantissa of n with IEEE 754 representation.
 ****************************************************************************/

/* lg2 is an approximation to log2, that is reasonably fast to compute.
 *
 * Let L be the function defined as follows. 
 * Let z = 2^a * (1+x) for a \in Z, and 0<=x<1. Let L(z) = a + x.
 *
 * We have 0 <= log2(z) - L(z) < 0.0861, for any real z > 0.
 *
 * lg2_raw(z) below returns (L(z) + 0x3FF) * 2^20
 *
 * lg2(z,a,s) below returns (lg2_raw(z) - a) * s.
 *
 * In particular, for a = 0x3FF00000 - G*2^20/s, we have:
 *
 *      lg2(z, a, s/2^20) = L(z)*s + G
 *
 * Note that 0 <= log2(z)*s - L(z)*s < 0.0861*s
 * (IOW, the larger the scale, the larger the error we make here).
 */
static inline double lg2_raw (double i) {
    /* This is claimed to take 3 cycles only. I'm yet to be convinced...
    */
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
    __asm__ __volatile__ (
            "psrlq $0x20,  %0    \n"
            "cvtdq2pd      %0, %0\n" /* Mandatory in packed double even though
                                        it is not packed! */
            : "+&x" (i));            /* Really need + here! i is not modified in C code! */
    return i;
#else
    /* Same function, but here gcc needs to move the input i from an xmm
     * register to a general-purpose register, and the only way to do that
     * is to go through memory. So this version needs some 6 to 8 cycles
     * more than the assembly one above, which uses ~3 cycles.
     *
     * The type punning is done with memcpy, which is well defined and
     * which compilers typically turn into the very same store/load pair
     * as a pointer-cast trick, without the strict-aliasing trouble.
     */
    uint64_t tg;
    memcpy(&tg, &i, sizeof(tg));
    return (double)(tg >> 0x20);
#endif
}
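
#if 0
/* Hedged reference sketch (not compiled in): a portable model of lg2_raw
 * above, useful as documentation or for unit tests. For z > 0 written as
 * z = 2^a * (1+x), it reconstructs (L(z) + 0x3FF) * 2^20 with L(z) = a + x,
 * which is what lg2_raw returns up to the truncation of x to 20 bits.
 * The name lg2_raw_reference is ours, not part of the code base.
 */
static inline double lg2_raw_reference (double z)
{
    int e;
    double m = std::frexp(z, &e);     /* z = m * 2^e with 0.5 <= m < 1 */
    /* rewrite z = 2^(e-1) * (1 + (2m-1)), so a = e-1 and x = 2m-1 */
    double L = (e - 1) + (2 * m - 1);
    return (L + 0x3FF) * 0x100000;
}
#endif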

/* This is "morally" 2^x -- except that we need to take into account the
 * fact that lg2 is a bit fuzzy.
 */
static double lg2_reciprocal(double x) {
    int x0 = x;
    return exp2(x0) * (1 + x-x0);
}


static inline double lg2 (double i, double offset, double scale) {
    /* see comment above */
    return (lg2_raw(i) - offset) * scale;
}

/* n0 is expected to be a constant */
static inline void memset_with_writeahead(void *s, int c, size_t n, size_t n0)
{
#ifdef __GNUC__
    if (__builtin_constant_p(n) || !__builtin_constant_p(n0)) {
        memset(s, c, n);
        return;
    }
#endif
    if (n <= n0)
        memset(s, c, n0);
    else
        memset(s, c, n);
}


/*}}}*/
/* {{{ some utility stuff which I believe is now covered by double_poly, in
 * fact. Should check that.  */

/* This computes u = scale^deg(f) * f(x/scale)
 * not the same as double_poly_scale, deleted in commit
 * c480fe82174a9de96e1cd35b2317fdf0de3678ab
 *
 * Note that this is a local function only, let's keep it here...
 */
static inline void
double_poly_reverse_scale(double_poly_ptr u, double_poly_srcptr f, const double scale)
{
    double_poly_realloc(u, f->deg + 1);
    if (f->deg < 0) {
        double_poly_set_zero(u);
        return;
    }
    int d = f->deg;
    u->coeff[d] = f->coeff[d];
    for(double h = scale ; d-- ; h *= scale) u->coeff[d] = f->coeff[d] * h;
    double_poly_cleandeg(u, f->deg);
}
/* }}} */


/***********************************************************************/
/* {{{ max absolute norms for a homogeneous polynomial along a line.
 *
 * TODO: We need to do this over a circle as well, because that should be
 * the proper way to do the sieve_info_update_norm_data_Jmax function.
 * Presumably it would suffice to parameterize this by tan(t/2), so that
 * we are led to this function again.
 */
/* {{{ get_maxnorm_aux (for x in (0,s)) */
/* return max |g(x)| for x in (0, s) where s can be negative,
   and g(x) = g[d]*x^d + ... + g[1]*x + g[0] */
static double get_maxnorm_aux (double_poly_srcptr poly, double s)
{
  double_poly derivative;
  const int d = poly->deg;

  ASSERT_ALWAYS(d >= 0);

  if (d == 0)
    return fabs (poly->coeff[0]);

  double *roots = (double*) malloc (poly->deg * sizeof (double));
  FATAL_ERROR_CHECK(roots == NULL, "malloc failed");

  /* Compute the derivative of polynomial */
  double_poly_init (derivative, d - 1);
  double_poly_derivative (derivative, poly);

  /* Look for extrema of the polynomial, i.e., for roots of the derivative */
  const unsigned int nr_roots = double_poly_compute_roots(roots, derivative, s);

  /* now abscissae of all extrema of poly are 0, roots[0], ...,
     roots[nr_roots-1], s */
  double gmax = fabs (poly->coeff[0]);
  for (unsigned int k = 0; k <= nr_roots; k++)
    {
      double x = (k < nr_roots) ? roots[k] : s;
      double va = fabs (double_poly_eval (poly, x));
      if (va > gmax)
        gmax = va;
    }
  free (roots);
  double_poly_clear(derivative);
  return gmax;
}
/* }}} */

/* {{{ get_maxnorm_aux_pm (for x in (-s,s)) */
/* Like get_maxnorm_aux(), but for interval [-s, s] */
static double
get_maxnorm_aux_pm (double_poly_srcptr poly, double s)
{
  double norm1 = get_maxnorm_aux(poly, s);
  double norm2 = get_maxnorm_aux(poly, -s);
  return (norm2 > norm1) ? norm2 : norm1;
}
/* }}} */

/* {{{ get_maxnorm_rectangular */
/* returns the maximal value of |F(x,y)| for -X <= x <= X, 0 <= y <= Y,
 * where F(x,y) is a homogeneous polynomial of degree d.
 * Let F(x,y) = f(x/y)*y^d, and F(x,y) = rev(F)(y,x).
 * Since F is homogeneous, we know M = max |F(x,y)| is attained on the border
 * of the rectangle, i.e.:
 * (a) either on F(X,y) for -Y <= y <= Y (right-hand-side border, and the
 *     mirrored image of the left-hand-side border); we want the maximum of
 *     rev(F)(y,X) = rev(f)(y/X) * X^d for -Y <= y <= Y;
 *     this is rev(f)(t) * X^d for -Y/X <= t <= Y/X.
 * (b) or on F(x,Y) for -X <= x <= X (top border)
 *     = f(x/Y) * Y^d; this is f(t) * Y^d for -X/Y <= t <= X/Y.
 * (c) or on F(x,0) for -X <= x <= X (lower border, on the abscissa), but this
 *     maximum is |f[d]|*X^d, and is attained in (a).
 */
double
get_maxnorm_rectangular (double_poly_srcptr src_poly, const double X,
			 const double Y)
{
  const unsigned int d = src_poly->deg;
  double norm, max_norm;

  /* Make copy of polynomial as we need to revert the coefficients */
  double_poly poly;
  double_poly_init (poly, d);
  double_poly_set (poly, src_poly);

  /* (b) determine the maximum of |f(x)| * Y^d for -X/Y <= x <= X/Y */
  max_norm = get_maxnorm_aux_pm (poly, X/Y) * pow(Y, (double)d);

  /* (a) determine the maximum of |g(y)| for -1 <= y <= 1, with g(y) = F(s,y) */
  double_poly_revert(poly, poly);
  norm = get_maxnorm_aux_pm (poly, Y/X) * pow(X, (double)d);
  if (norm > max_norm)
    max_norm = norm;

  double_poly_clear(poly);

  return max_norm;
}
/* }}} */
/* }}} */
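
#if 0
/* Hedged usage sketch (not compiled in): how get_maxnorm_rectangular above
 * may be called to bound |F(x,y)| = |2x^3 - 7xy^2 + y^3| on the box
 * -1024 <= x <= 1024, 0 <= y <= 512. The function name and the verbose
 * output line are ours, for illustration only.
 */
static void get_maxnorm_rectangular_demo()
{
    cxx_double_poly f(3);
    f->deg = 3;
    /* coefficients of f(t) = F(t,1) = 2t^3 - 7t + 1 */
    f->coeff[3] = 2; f->coeff[2] = 0; f->coeff[1] = -7; f->coeff[0] = 1;
    double M = get_maxnorm_rectangular(f, 1024., 512.);
    verbose_output_print(0, 4, "# max |F| on the box: %e\n", M);
}
#endif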

/***********************************************************************/

/*
 * These are two initializations of the algebraic and rational norms.
 * Both compute, line by line (i.e. for constant j), F(i,j) = f(i), with
 * f(i) = log2(abs(sum[k=0...d] Ak i^k)+1.)*scale+LOGNORM_GUARD_BITS in [LOGNORM_GUARD_BITS...254],
 * where Ak are the coefficients of the polynomial.
 *
 * The classical initialization is slow; the only optimization is the
 * fast computation of the log2. The associated error guarantees the
 * precision of the results to within +/- 1.
 *
 * The "smart" version is faster (more than 10x for the rational side,
 * almost 100x for the algebraic side).
 *
 * Both versions are implemented further down in this file.
 */

/* {{{ On the ::fill function in lognorm_base derived classes. */
/* This function is used to initialize lognorms
 * (= log2(F(i,j)) * scale + LOGNORM_GUARD_BITS)
   for the bucket_region S[] number N.
   It's a wrapper; except for the trivial degree = 0 case, it extracts the
   interesting parameters of the lognorm structure and calls the right function.

   - For degree 0, S[] is initialized by a memset: always exact.
   - A special ultra fast init function is used for degree = 1; it can be
     considered as exact (the maximal error is always +/- 1 on S[]).
   - For smart = 0 and other degrees, the exact values F(i,j) are
     computed with a fast log2. The maximal error is always +/- 1 on S[].
     It's obviously slow.

   - For smart != 0 and degree > 1, we first approximate the polynomial
     by piecewise linear functions, which are correct up to the
     prescribed multiplicative factor.
*/

/* }}} */

lognorm_base::lognorm_base(siever_config const & sc, cxx_cado_poly const & cpoly, int side, qlattice_basis const & Q, int logI, int J)
    : logI(logI), J(J)
    /*{{{*/
{
    int64_t H[4] = { Q.a0, Q.b0, Q.a1, Q.b1 };

    /* Update floating point version of polynomial. They will be used in
     * get_maxnorm_rectangular(). */

    mpz_poly_homography (fij, cpoly->pols[side], H);
    if (Q.doing.side == side) {
        ASSERT_ALWAYS(mpz_poly_divisible_mpz(fij, Q.doing.p));
        mpz_poly_divexact_mpz(fij, fij, Q.doing.p);
    }
    double_poly_set_mpz_poly(fijd, fij);
    // Take sublat into account: multiply all coefs by m^deg.
    // We do it only for the floating point version, that is used to
    // compute a bound on the norms, and in the norm_init phase.
    if (Q.sublat.m > 0)
        double_poly_mul_double(fijd, fijd, pow(Q.sublat.m, fijd->deg));

    int I = 1 << logI;

    maxlog2 = log2(get_maxnorm_rectangular (fijd, (double)(I/2), (double)J));

    /* we know that |F(a,b)| < 2^(maxlog2) or |F(a,b)/q| < 2^(maxlog2)
       depending on the special-q side. */
    /* we used to artificially increase maxlog2, purportedly "to allow larger values of J"; I don't see why it would be useful. */
    // maxlog2 += 2.0;

    /* we want to map 0 <= x < maxlog2 to LOGNORM_GUARD_BITS <= y < UCHAR_MAX,
       thus y = LOGNORM_GUARD_BITS + x * (UCHAR_MAX-LOGNORM_GUARD_BITS)/maxlog2. */
    scale = (UCHAR_MAX - LOGNORM_GUARD_BITS) / maxlog2;

    /* We require that scale is of the form (int) * 0.025, so that only a small
       number of different factor base slicings can occur. */
    /* Note: if we replace 1/40 by a power-of-two fraction of unity, that
     * could enable some tricks with norms initialization */
    scale = (int)(scale * 40) * 0.025;
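    /* Worked example (illustrative, assuming LOGNORM_GUARD_BITS == 1 just
     * for the numbers): with maxlog2 = 101.3 the mapping above gives
     * scale = 254/101.3 ~ 2.507, which the snapping line rounds down to
     * (int)(2.507*40) * 0.025 = 100 * 0.025 = 2.5; a norm close to 2^80
     * then maps to the byte value 1 + 80*2.5 = 201. */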

    verbose_output_start_batch();
    verbose_output_print (0, 2,
            "# Side %d: log2(maxnorm)=%1.2f scale=%1.2f, logbase=%1.6f",
            side, maxlog2, scale, exp2 (1. / scale));

    /* we want to select relations with a cofactor of less than r bits */
    double max_lambda = (maxlog2 - LOGNORM_GUARD_BITS / scale) / sc.sides[side].lpb;
    double lambda = sc.sides[side].lambda;
    double r = maxlog2 - LOGNORM_GUARD_BITS / scale;

    /* when lambda = 0 (automatic), we take mfb/lpb + 0.3, which is
       experimentally close to optimal in terms of seconds per relation
       (+ 0.2 might be even better on the rational side) */
    if (lambda == 0.0)
        lambda = 0.3 + (double) sc.sides[side].mfb /
            (double) sc.sides[side].lpb ;

    r = std::min(r, lambda * sc.sides[side].lpb);

    /* other option is to ditch this +0.3 and define r (and hence the
     * bound) from mfb directly. Maybe a constant offset would be better
     * than a multiplicative one...
     * r = std::min(r, lambda ? lambda * sc.sides[side].lpb : sc.sides[side].mfb);
     */

    bound = (unsigned char) (r * scale + LOGNORM_GUARD_BITS);

    verbose_output_print (0, 2, " bound=%u\n", bound);
    if (lambda > max_lambda)
        verbose_output_print (0, 2, "# Warning, lambda>%.1f on side %d does "
                "not make sense (capped to limit)\n", max_lambda, side);

    verbose_output_end_batch();
}/*}}}*/

void lognorm_base::norm(mpz_ptr x, int i, unsigned int j) const {
    mpz_poly_homogeneous_eval_siui(x, fij, i, j);
}
unsigned char lognorm_base::lognorm(int i, unsigned int j) const {
    cxx_mpz x;
    norm(x, i, j);
    return log2(mpz_get_d(x)) * scale + LOGNORM_GUARD_BITS;
}

    /* common definitions -- for the moment it's a macro, eventually I
     * expect it's gonna be something else, probably simply replicated
     * code, who knows... */
#define LOGNORM_FILL_COMMON_DEFS()				         \
    unsigned int log_lines_per_region = MAX(0, LOG_BUCKET_REGION - logI);\
    unsigned int log_regions_per_line = MAX(0, logI - LOG_BUCKET_REGION);\
    unsigned int regions_per_line = 1 << log_regions_per_line;           \
    unsigned int region_rank_in_line = N & (regions_per_line - 1);       \
    unsigned int j0 = (N >> log_regions_per_line) << log_lines_per_region;    \
    unsigned int j1 = j0 + (1 << log_lines_per_region);                  \
    int I = 1 << logI;						         \
    int i0 = (region_rank_in_line << LOG_BUCKET_REGION) - I/2;           \
    int i1 = i0 + (1 << MIN(LOG_BUCKET_REGION, logI));                   \
    do {} while (0)
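
/* Worked example for LOGNORM_FILL_COMMON_DEFS (illustrative): with
 * logI = 17 and LOG_BUCKET_REGION = 16, an i-line spans two bucket
 * regions, so log_lines_per_region = 0, log_regions_per_line = 1 and
 * regions_per_line = 2. For N = 5 this gives region_rank_in_line = 1,
 * j0 = 2, j1 = 3, I = 131072, i0 = 65536 - 65536 = 0 and i1 = 65536:
 * the right half of the line j = 2. When logI <= LOG_BUCKET_REGION the
 * situation is reversed and each region covers one or more full lines.
 */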

#define LOGNORM_COMMON_HANDLE_ORIGIN() do {				\
    bool has_haxis = !j0;                                               \
    bool has_vaxis = region_rank_in_line == ((regions_per_line-1)/2);   \
    bool has_origin = has_haxis && has_vaxis;                           \
    if (UNLIKELY(has_origin)) {						\
	/* compute only the norm for i = 1. Everybody else is 255. */	\
        memset(S, 255, i1-i0);						\
        if (has_origin) {						\
            double norm = (log2(fabs(fijd->coeff[fijd->deg]))) * scale;	\
            S[1 - i0] = LOGNORM_GUARD_BITS + (unsigned char) (norm);			\
        }								\
        /* And now make sure we start at the next line */		\
        S+=I;								\
	j0++;								\
    }									\
} while (0)

/***********************************************************************/

/* {{{ reference slow code for computing lognorms */
lognorm_reference::lognorm_reference(siever_config const & sc, cxx_cado_poly const & cpoly, int side, qlattice_basis const & Q, int logI, int J) : lognorm_base(sc, cpoly, side, Q, logI, J)/*{{{*/
{
    /* Knowing the norm on the rational side is bounded by 2^(2^k), compute
     * lognorm approximations for k bits of exponent + NORM_BITS-k bits
     * of mantissa.
     * Do the same for the algebraic side (with the corresponding bound for
     * the norms).
     */
    int k = (int) ceil (log2 (maxlog2));
    int K = 1 << k;
    ASSERT_ALWAYS(lognorm_reference::NORM_BITS >= k);
    int l = lognorm_reference::NORM_BITS - k;
    int L = 1 << l;

    /* extract k bits from the exponent, and l bits from the mantissa */
    double h = 1.0 / (double) L;
    double e,m;
    int i,j;
    for (e = 1.0, i = 0; i < K; i++, e *= 2.0)
    {
        /* e = 2^i for 0 <= i < 2^k */
        for (m = 1.0, j = 0; j < L; j++, m += h)
        {
            /* m = 1 + j/2^l for 0 <= j < 2^l */
            double norm = m * e;
            /* Warning: since maxlog2 does not usually correspond to
               a power of two, and we consider full binades here, we have to
               take care that values > maxlog2 do not wrap around to 0 */
            norm = log2 (norm);
            if (norm >= maxlog2)
                lognorm_table[(i << l) + j] = 255;
            else
            {
                norm = norm * scale;
                lognorm_table[(i << l) + j] = LOGNORM_GUARD_BITS + (unsigned char) norm;
            }
        }
    }
}/*}}}*/
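/* Illustrative note on the table just built: index (i << l) + j stores an
 * approximation of log2(2^i * (1 + j/2^l)) * scale + LOGNORM_GUARD_BITS,
 * i.e. k exponent bits followed by l = NORM_BITS - k mantissa bits. The
 * bit manipulation in lognorm_fill_rat_reference below recovers exactly
 * this index from the IEEE-754 encoding of the norm: shift out the sign
 * bit, subtract the 0x3FF exponent bias, and keep the top l mantissa bits.
 */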
/* {{{ void lognorm_fill_rat_reference */
/* Initialize lognorms on the rational side for the bucket_region
 * number N.
 *
 * This is adapted from the reference version which was slaughtered in
 * commit 80df430a. It is slow, but at least readable.
 *
 * Nothing clever is done wrt discarding (a,b) pairs that are
 * not coprime, except for the line j=0.
 */
static void lognorm_fill_rat_reference(
        unsigned char *S,
        uint32_t N,
        int logI,
        double scale,
        cxx_double_poly const & fijd,
        double maxlog2,
        const unsigned char * L) /* L = lognorm_table */
{
    LOGNORM_FILL_COMMON_DEFS();
    LOGNORM_COMMON_HANDLE_ORIGIN();

    double u0 = fijd->coeff[0];
    double u1 = fijd->coeff[1];

    int l = lognorm_reference::NORM_BITS - (int) ceil(log2(maxlog2));

    for(unsigned int j = j0 ; j < j1 ; j++) {
	double z = u0 * j + u1 * i0;
	for (int i = i0; i < i1; i++) {
            uint64_t y;
            /* clang doesn't seem to like this one */
#if defined(HAVE_SSE2) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) && !defined(__clang__)
            /* This does: y = *(uint64_t*)&z */
            asm("":"=x"(y):"0"(z));
#else
            union { uint64_t y; double z; } foo;
            foo.z=z;
            y=foo.y;
#endif
            /* we first get rid of the sign bit, then unshift the
             * exponent.  */
            y = ((y<<1) - (UINT64_C(0x3FF)<<53)) >> (53-l);
            unsigned char n = L[y];
	    ASSERT(n > 0);
	    *S++ = n;
	    z += u1;
	}
    }
}
/* }}} */
/* {{{ void lognorm_fill_alg_reference */
/* Exact initialisation of F(i,j) with degree >= 2 (not mandatory). Slow.
   Internal function, only with simple types, for unit/integration testing. */
static void lognorm_fill_alg_reference (unsigned char *S, uint32_t N, int logI, double scale, cxx_double_poly const & fijd)
{
    LOGNORM_FILL_COMMON_DEFS();
    LOGNORM_COMMON_HANDLE_ORIGIN();

    double modscale = scale/0x100000;
    const double offset = 0x3FF00000 - LOGNORM_GUARD_BITS / modscale;

    cxx_double_poly u;
    for (unsigned int j = j0 ; j < j1 ; j++) {
        double_poly_reverse_scale(u, fijd, j);
        for(int i = i0; i < i0 + I; i++) {
            *S++ = lg2(fabs(double_poly_eval (u, i)), offset, modscale);
        }
    }
}
/* }}} */
void lognorm_reference::fill(unsigned char * S, int N) const/*{{{*/
{
    if (fijd->deg == 1)
        lognorm_fill_rat_reference(S, N, logI, scale, fijd, maxlog2, lognorm_table);
    else
        lognorm_fill_alg_reference(S, N, logI, scale, fijd);
}
/*}}}*/
/* }}} */

/***********************************************************************/

/* {{{ faster code */
lognorm_smart::lognorm_smart(siever_config const & sc, cxx_cado_poly const & cpoly, int side, qlattice_basis const & Q, int logI, int J) : lognorm_base(sc, cpoly, side, Q, logI, J)/*{{{*/
{
    /* See lognorm_fill_rat_smart_inner for the explanation of how this
     * table is used */
    for (int i = 0; i < 257 ; i++)
        cexp2[i] = lg2_reciprocal((i-LOGNORM_GUARD_BITS)/scale);

    if (fijd->deg > 1) {
        piecewise_linear_approximator A(fijd, log(2)/scale);
        int I = 1 << logI;
        G = A.logapprox(-(I/2), I/2);
    }
}/*}}}*/

static inline double compute_y(double G, double offset, double modscale) {
    double res = lg2 ((G) + 1., offset, modscale);
    return res;
}

/* {{{ void lognorm_fill_rat_smart. */
/* Initialize lognorms of the bucket region S[] number N, for F(i,j) with
 * degree = 1.
 */
static void lognorm_fill_rat_smart_inner (unsigned char *S, int i0, int i1, unsigned int j0, unsigned int j1, double scale, cxx_double_poly const & fijd, const double *cexp2)
{
    double modscale = scale / 0x100000;
    double offset = 0x3FF00000 - LOGNORM_GUARD_BITS / modscale;
    /* The lg2 function wants a positive value.
     * Here, in the classical rational initialization, the degree of the
     * polynomial F(i,j) that is used is hardcoded to 1. So it's possible
     * to have F(i,j) = 0, even if i, j and the coefficients of F(i,j) are
     * integers and != 0. So, 1.0 is added to all G values. It's not useful
     * to do a fabs(G) here because the code always uses COMPUTE_Y(G) with
     * G >= 0.
     */
// #define COMPUTE_Y(G) lg2 ((G) + 1., offset, modscale)
#define COMPUTE_Y(G) compute_y(G, offset, modscale)
    /* COMPUTE_Y(z) returns LOGNORM_GUARD_BITS + L(z) * scale. Recall that
     * we have 0 <= log2(z) - L(z) < 0.0861, for any real z > 0.
     */

    double u0 = fijd->coeff[0];
    double u1 = fijd->coeff[1];

    if (UNLIKELY(u1 == 0)) {
        /* constant approximating functions do occur, see bug 21684. This
         * is triggered in particular by quadratic polynomials, which we
         * do get with nonlinear polynomial selection.
         */
        for (unsigned int j = j0; j < j1; j++) {
            double g = fabs(u0 * j);
            uint8_t y = COMPUTE_Y(g);
            size_t di = i1 - i0;
            memset_with_writeahead(S, y, di, MEMSET_MIN);
            S += di;
        }
        return;
    }

    using std::signbit;
    double u1_abs = fabs(u1);
    double inv_u1_abs = 1/u1_abs;

    for (unsigned int j = j0; j < j1; j++) {
	int i = i0;
	double g = fabs(u0 * j + u1 * i0);
	uint8_t y;
	double root = -u0 * j / u1;

	bool root_ahead = false;

	/*
	 * Caveat: the sign of g is not significant when g is almost zero
	 * -- then we'd like to use the sign of u1 as indicating the sign
	 *  (after all, g+u1 is the value for the next i).
	 *
	 * To check for "g is almost zero", see bug #16388 and commit
	 * 5f682c6. We use this test, which is admittedly a bit surprising.
	 */
	if (LIKELY(g * (1ULL << 51) >= fabs(u0 * j))) {
	    y = COMPUTE_Y(g);
	    root_ahead = root >= i0;
	} else {
	    y = LOGNORM_GUARD_BITS;
	}

	if (root_ahead) {
            /* One of two possible cases here:
             * g > 0, real root ahead, so decreasing logs, and u1 < 0
             * g < 0, real root ahead, so decreasing logs, and u1 > 0
             */

	    /* Handle sequence of decreasing logs, compute stop points
	     * using incremental updates on the ix values */
            /* Each time we've rounded (down) the log value, since we
             * assume that logs are decreasing, we expect that the exact
             * spot where the log actually takes the very rounded value
             * we've computed is some steps away. Compute that, and
             * advance there.
             *
             * To do that, cexp2[y] incorporates a reciprocal to lg2abs.
             */
            for (; i < i1; y--) {
                double ix = root - cexp2[y] * inv_u1_abs;
                /* conversion rounding is towards zero, while we want it
                 * to be towards +infinity. */
                ix += (ix >= 0);
                if (UNLIKELY(ix >= (double) i1))
		    ix = i1;
		size_t di = (int) ix - i;	/* The cast matters ! */
		i = ix;
		if (!di)
                    break;
		memset_with_writeahead(S, y, di, MEMSET_MIN);
		S += di;
	    }
	    if (i >= i1)
		continue;

	    /* Now compute until y really changes signs. We don't expect
	     * that this will be needed more than very few times.  */
	    g = u0 * j + u1 * i;
            for( ; i < i1 && signbit(g) != signbit(u1) ; i++, g+=u1)
                *S++ = COMPUTE_Y(fabs(g));
	    if (i >= i1)
		continue;

	    /* change sign */
	    g = fabs(g);
	    y = COMPUTE_Y(g);
	}

        /* Invariant: g = fabs(u0 * j + u1 * i), and y = COMPUTE_Y(g) */

	/* One of two possible cases here:
	 * g < 0, real root behind, so increasing logs, and u1 < 0
	 * g > 0, real root behind, so increasing logs, and u1 > 0
         */

        /* If we are in the steep region where the log increases very
         * fast, we can't go for the general loop just now. We have to
         * advance a bit, at least until the wild jumps have calmed down
         * a bit.
         */
        for (; i < i1;) {
            *S++ = y;
            i++;
            g += u1_abs;
            uint8_t oy = y;
            y = COMPUTE_Y(g);
            if (y == oy)
                break;
        }
        if (i >= i1)
            continue;

        /* Now that logs are increasing, we want ix to be the position
         * where the logs reaches the value y+1, of course.
         */
	for (; i < i1; y++) {
	    double ix = root + cexp2[(unsigned int) y + 1] * inv_u1_abs;
            /* conversion rounding is towards zero, while we want it
             * to be towards +infinity. */
            ix += (ix >= 0);
            // ASSERT ((int) ix >= i); // see bug 21518
            if (LIKELY(ix >= (double) i)) {
                /* It is most likely that we'll only see this branch. Yet
                 * bug 21518 seems to trigger a nasty corner case */
                if (UNLIKELY(ix >= i1))
                    ix = i1;
                size_t di = (int) ix - i;	/* The cast matters ! */
                i = ix;
                memset_with_writeahead(S, y, di, MEMSET_MIN);
                S += di;
            }
	}
    }
#undef COMPUTE_Y
}
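
#if 0
/* Hedged sanity-check sketch (not compiled in): the smart filling above is
 * expected to stay very close to a direct evaluation (see the discussion
 * at the top of the file about the lg2 error). A brute-force comparison on
 * a single line could look as follows; the function name is ours, and
 * error handling is kept minimal on purpose.
 */
static int max_deviation_on_one_line(cxx_double_poly const & fijd, double scale,
        int i0, int i1, unsigned int j, const double * cexp2)
{
    unsigned char * S = (unsigned char *) malloc(i1 - i0);
    FATAL_ERROR_CHECK(S == NULL, "malloc failed");
    lognorm_fill_rat_smart_inner(S, i0, i1, j, j + 1, scale, fijd, cexp2);
    int dev = 0;
    for (int i = i0; i < i1; i++) {
        double g = fabs(fijd->coeff[0] * j + fijd->coeff[1] * i);
        int ref = LOGNORM_GUARD_BITS + (int) (log2(g + 1.) * scale);
        dev = std::max(dev, std::abs((int) S[i - i0] - ref));
    }
    free(S);
    return dev;
}
#endif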

static void lognorm_fill_rat_smart (unsigned char *S, uint32_t N, int logI, double scale, cxx_double_poly const & fijd, const double *cexp2)
{
    LOGNORM_FILL_COMMON_DEFS();
    LOGNORM_COMMON_HANDLE_ORIGIN();
    lognorm_fill_rat_smart_inner(S, i0, i1, j0, j1, scale, fijd, cexp2);
}
/* }}} */

static void lognorm_fill_alg_smart (unsigned char *S, uint32_t N, int logI, double scale, cxx_double_poly const & fijd, piecewise_linear_function const & G, const double *cexp2) /* {{{ */
{
    LOGNORM_FILL_COMMON_DEFS();
    LOGNORM_COMMON_HANDLE_ORIGIN();
    for(unsigned int j = j0 ; j < j1 ; j++) {
        /* G approximates F(x,1).
         *
         * We have F(i,j) = j^deg(F) * F(i/j,1). So, given a set of linear
         * functions {g} which match F(x,1) on the intervals {[r0,r1[}, we
         * select those whose interval meets the range of i/j on this line,
         * and evaluate the functions g(x/j)*j^d on the intervals [j*r0,j*r1[.
         *
         * Note that when g=u+vx,
         *      g(x/j)*j^d = u*j^d+v*j^(d-1)x
         *                 = u*j^(d-1) * j + v*j^(d-1) * x
         */
        auto it = G.endpoints.begin();
        double r0 = *it++;
        cxx_double_poly uv_pol(1);
        uv_pol->deg = 1;
        // int i = i0;
        double mj1 = j;
        ASSERT(fijd->deg > 1);
        for(int d = fijd->deg ; --d > 1 ; mj1 *= j);
        for(std::pair<double, double> uv : G.equations) {
            double r1 = *it++;
            int Gi0 = j*r0 + (j*r0 >= 0);
            int Gi1 = j*r1 + (j*r1 >= 0);
            Gi0 = std::max(Gi0, i0);
            Gi1 = std::min(Gi1, i1);
            // ASSERT(Gi0 >= i);
            if (Gi1 > Gi0) {
                // ASSERT(Gi0 == i);
                uv_pol->coeff[0] = uv.first * mj1;
                uv_pol->coeff[1] = uv.second * mj1;
                lognorm_fill_rat_smart_inner(S, Gi0, Gi1, j, j+1, scale, uv_pol, cexp2);
                S += Gi1 - Gi0;
                // i = Gi1;
            }
            r0 = r1;
        }
    }
}
/* }}} */
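/* Worked instance of the transform used in lognorm_fill_alg_smart above
 * (illustrative): for d = 3, on line j = 5, a piece g(x) = 2 - 0.1*x valid
 * on [r0,r1[ becomes g(x/j)*j^d = 2*125 - 0.1*25*x = 250 - 2.5*x on
 * [5*r0, 5*r1[. With mj1 = j^(d-1) = 25 the code sets uv_pol to
 * (u*mj1, v*mj1) = (50, -2.5), and the rational filler then evaluates
 * 50*j + (-2.5)*i = 250 - 2.5*i, as expected. */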

void lognorm_smart::fill(unsigned char * S, int N) const/*{{{*/
{
    if (fijd->deg == 1)
        lognorm_fill_rat_smart(S, N, logI, scale, fijd, cexp2);
    else
        lognorm_fill_alg_smart(S, N, logI, scale, fijd, G, cexp2);
}
/*}}}*/

/* }}} */


/* TODO: some of the sieve_range_adjust stuff can quite probably be
 * refactored on top of lognorm_base */

/* returns the maximal value of |F(X*cos(t),Y*sin(t))|,
   where F(x,y) is a homogeneous polynomial of degree d, 0 <= t <= pi.
   Let cos(t) = u/sqrt(1+u^2) and sin(t) = 1/sqrt(1+u^2) for -Inf < u < Inf,
   we have F^2(X*cos(t),Y*sin(t)) = F^2(X*u,Y)/(1+u^2)^d where d = deg(F),
   thus its derivative with respect to u is (up to a factor):
   X * f'(u*X/Y) * (1+u^2) - d * Y * u * f(u*X/Y).
*/
static double
get_maxnorm_circular (double_poly_srcptr src_poly, const double X,
		      const double Y)
{
  const unsigned int d = src_poly->deg;
  double_poly poly;
  double *roots, x, y, v, max_norm, t;
  unsigned int nr, i;

  double_poly_init (poly, d + 1);

  /* first compute X * f'(u*X/Y) * (1+u^2) */
  poly->coeff[0] = 0.0;
  poly->coeff[1] = 0.0;
  for (i = 1; i <= d; i++)
    {
      t = pow (X / Y, (double) i - 1.0);
      /* the following will set coefficients of degree 2, 3, ..., d+1 */
      poly->coeff[i + 1] = X * (double) i * src_poly->coeff[i] * t;
      /* the following will add to coefficients of degree 0, 1, ..., d-1 */
      poly->coeff[i - 1] += X * (double) i * src_poly->coeff[i] * t;
    }

  /* now subtract d * Y * u * f(u*X/Y) */
  for (i = 0; i <= d; i++)
    {
      t = src_poly->coeff[i] * pow (X / Y, (double) i);
      poly->coeff[i + 1] -= (double) d * Y * t;
    }
  double_poly_cleandeg(poly, d + 1);

  roots = (double*) malloc ((d + 2) * sizeof (double));
  nr = double_poly_compute_all_roots (roots, poly);

  /* evaluate at y=0 */
  max_norm = fabs (src_poly->coeff[d] * pow (X, (double) d));
  for (i = 0; i < nr; i++)
    {
      double u = roots[i];
      x = X * u / sqrt (1.0 + u * u);
      y = Y / sqrt (1.0 + u * u);
      v = double_poly_eval (src_poly, x / y) * pow (y, (double) d);
      v = fabs (v);
      if (v > max_norm)
	max_norm = v;
    }

  free (roots);
  double_poly_clear(poly);

  return max_norm;
}
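
/* Worked check of the construction above (illustrative): take d = 1 and
 * f(t) = c1*t + c0, i.e. F(x,y) = c1*x + c0*y. The first loop adds X*c1
 * to the coefficients of degree 0 and 2; the second loop subtracts Y*c0
 * from degree 1 and Y*c1*(X/Y) = X*c1 from degree 2. The polynomial in u
 * collapses to X*c1 - Y*c0*u, whose root u = X*c1/(Y*c0) is exactly where
 * d/du [ F(X*u,Y)^2 / (1+u^2) ] vanishes (apart from the root of F itself,
 * which is a minimum), as expected.
 */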

/* {{{ various strategies to adjust the sieve area */

void sieve_range_adjust::prepare_fijd()/*{{{*/
{
    int64_t H[4] = { Q.a0, Q.b0, Q.a1, Q.b1 };
    /* We need to get the floating point polynomials. Yes, it will be
     * done several times in the computation, but that's a trivial
     * computation anyway.
     */
    for (int side = 0; side < 2; side++) {
        cxx_mpz_poly fz;
        mpz_poly_homography (fz, cpoly->pols[side], H);
        if (Q.doing.side == side) {
            ASSERT_ALWAYS(mpz_poly_divisible_mpz(fz, Q.doing.p));
            mpz_poly_divexact_mpz(fz, fz, Q.doing.p);
        }
        double_poly_set_mpz_poly(fijd[side], fz);
    }
}/*}}}*/

/* sieve_range_adjust::sieve_info_update_norm_data_Jmax {{{
 *
 * choose the largest possible J by simply bounding the Fij and Gij
 * polynomials
 * 
 * The image in the a,b-plane of the sieve region might be slanted at an
 * angle against the abscissa, thus even though it has the correct
 * skewness, it might include larger a,b-coordinates than a non-slanted
 * image would.
 * 
 * We compute the maximum norm that would occur if we had a perfect
 * lattice basis in the sense that its image forms a rectangle -A/2 <= a <
 * A/2, 0 <= b < B, with A/2/B = skew, and A*B = I*J*q (assuming J=I/2
 * here).  Thus we have B = A/2/skew, A*A/2/skew = I*I/2*q, A =
 * I*sqrt(q*skew).  The optimal maximum norm is then the maximum of
 * |F(a,b)| in this rectangle, divided by q on the special-q side.
 * 
 * Then we reduce J so that the maximum norm in the image of the actual
 * sieve region is no larger than this optimal maximum, times some
 * constant fudge factor.
 */
int
sieve_range_adjust::sieve_info_update_norm_data_Jmax (bool keep_logI)
{
  // The following parameter controls the scaling on the norm.
  // Relevant values are between 1.0 and 3.0. A higher value means we
  // select higher values of J, and therefore we find more relations, but
  // this increases the time per relation.
  // The value 2.0 seems to be a good compromise. Setting 1.5 reduces the
  // time per relation and number of relations by about 1.5% on a typical
  // RSA704 benchmark.
  const double fudge_factor = 2.0;
  if (!keep_logI)
      logI = (logA+1)/2;
  const double I = (double) (1 << logI);
  const double q = mpz_get_d(Q.doing.p);
  const double skew = cpoly->skew;
  const double A = (1 << logI)*sqrt(q*skew);
  const double B = (1 << (logA - logI))*sqrt(q/skew);
  double Jmax = (1 << (logA - logI));

  prepare_fijd();

  for (int side = 0; side < 2; side++)
    {
      /* Compute the best possible maximum norm, i.e., assuming a nice
         circular sieve region in the a,b-plane */
      double_poly dpoly;
      double_poly_init (dpoly, cpoly->pols[side]->deg);
      double_poly_set_mpz_poly (dpoly, cpoly->pols[side]);
      if (Q.sublat.m > 0)
          double_poly_mul_double(dpoly, dpoly, pow(Q.sublat.m, cpoly->pols[side]->deg));
      double maxnorm = get_maxnorm_circular (dpoly, fudge_factor*A/2.,
              fudge_factor*B);
      double_poly_clear (dpoly);
      if (side == Q.doing.side)
        maxnorm /= q;

      /* in the (i,j)-plane, the sieving region is rectangular */
      double v = get_maxnorm_rectangular (fijd[side], I/2, Jmax);

      if (v > maxnorm)
      { /* use dichotomy to determine largest Jmax */
          double a, b, c;
          a = 0.0;
          b = Jmax;
          while (trunc (a) != trunc (b))
          {
              c = (a + b) * 0.5;
              v = get_maxnorm_rectangular (fijd[side], I/2, c);
              if (v < maxnorm)
                  a = c;
              else
                  b = c;
          }
          Jmax = trunc (a) + 1; /* +1 since we don't sieve for j = Jmax */
      }
    }

  J = (unsigned int) Jmax;

  return round_to_full_bucket_regions(__func__);
}//}}}

/* {{{ a few helpers (otherwise I'll manage to get things wrong
 * eventually)
 */
sieve_range_adjust::vec<double> operator*(sieve_range_adjust::vec<double> const& a, sieve_range_adjust::mat<int> const& m) {
    return sieve_range_adjust::vec<double>(a[0]*m(0,0)+a[1]*m(1,0),a[0]*m(0,1)+a[1]*m(1,1));
}

qlattice_basis operator*(sieve_range_adjust::mat<int> const& m, qlattice_basis const& Q) {
    qlattice_basis R = Q;       // copy all non-matrix fields.
    R.a0 = m(0,0) * Q.a0 + m(0,1) * Q.a1;
    R.a1 = m(1,0) * Q.a0 + m(1,1) * Q.a1;
    R.b0 = m(0,0) * Q.b0 + m(0,1) * Q.b1;
    R.b1 = m(1,0) * Q.b0 + m(1,1) * Q.b1;
    return R;
}
//}}}

/* estimate_yield_in_sieve_area {{{ 
 *
 * Approximate the integral of
 * dickman_rho(log2(F(x,y))/lpb)*dickman_rho(log2(G(x,y))/lpb), which
 * supposedly gives an idea of the expected yield over this sieve area
 *
 * The integral is taken over a range of size 2^A by integrating over
 * 2^(2N-1) points.
 *
 * The range is taken as one half of the 0-centered range whose width and
 * height are 2^(ceil(A/2)-squeeze) times 2^(floor(A/2)+squeeze+1) --
 * this larger range has size 2^(A+1).
 * 
 * Integration points are chosen as centers of *rectangles* (not
 * squares) which are proportional to the sieve area.
 *
 * The shuffle[] argument can be used to specify an alternate basis
 */
double sieve_range_adjust::estimate_yield_in_sieve_area(mat<int> const& shuffle, int squeeze, int N)
{
    int lpbs[2] = { conf.sides[0].lpb, conf.sides[1].lpb };
    int nx = 1 << (N - squeeze);
    int ny = 1 << (N + squeeze);

    /* We'll integrate over [-X/2,X/2] times [0,Y/2], which has to have
     * size 2^A */
    double X = 1UL << ((logA-logA/2) - squeeze);
    double Y = 1UL << (logA/2     + squeeze + 1);
    /* In other words, X is I, and Jmax is Y/2. We can see that X*Y/2 =
     * 2^A as desired */

    /* Beware, we're really using (nx+1)*(ny/2+1) points, but weighted. */

    double weightsum = 0;

    double sum = 0;
    for(int i = -nx/2 ; i <= nx/2 ; i++) {
        double x = X/nx * i;
        /* We're doing half of the computation on the y axis, since
         * it's symmetric anyway */
        for(int j = 0 ; j <= ny/2 ; j++) {
            double y = Y/ny * j;
            vec<double> xys = vec<double>(x,y) * shuffle;

            double weight = 1;
            if (i == -nx/2 || i == nx/2) weight /= 2;
            if (j == 0 || j == ny/2) weight /= 2;
            verbose_output_print(0, 4, "# %d %d (%.2f) %.1f %.1f", i, j, weight, xys[0], xys[1]);

            double prod = 1;
            for(int side = 0 ; side < 2 ; side++) {
                double z = double_poly_eval_homogeneous(fijd[side], xys[0], xys[1]);
                double a = log2(fabs(z));
                double d = dickman_rho_local(a/lpbs[side], fabs(z));
                verbose_output_print(0, 4, " %d %e %e", side, z, d);
                prod *= d;
            }
            verbose_output_print(0, 4, " %e\n", prod);

            weightsum += weight;
            sum += weight*prod;
        }
    }
    sum /= weightsum;
    sum *= 1UL << logA;
    return sum;
}//}}}
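/* Note on the weights above (illustrative): the 1/2 factors on the border
 * points implement the 2-D composite trapezoid rule on the box
 * [-X/2,X/2] x [0,Y/2] (corners weigh 1/4, edges 1/2, interior points 1).
 * Thus sum/weightsum is the average of the integrand over the box, and
 * multiplying by 2^A -- the area of the box, since X*Y/2 = 2^A -- turns
 * that average into the approximate integral. */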

int sieve_range_adjust::adjust_with_estimated_yield()/*{{{*/
{
    prepare_fijd(); // side-effect of the above

    /* List a few candidate matrices which can be used for distorting the
     * sieving range.
     *
     * Because we are including the "swapped" matrices here, we will not
     * investigate negative values of the squeeze parameter.
     *
     * It is important, though, that matrices come here in swapped pairs,
     * since we use that to avoid part of the computation (for squeeze==0,
     * the range is square when A is even, so swapping makes no sense).
     */
    mat<int> shuffle_matrices[] = {
        mat<int>( 1, 0, 0, 1 ),
        mat<int>( 0, 1, 1, 0 ),

        mat<int>( 1, 1, 1, 0 ),
        mat<int>( 1, 0, 1, 1 ),

        mat<int>( 1, 1, 0, -1 ),
        mat<int>( 0, -1, 1, 1 ),

        mat<int>( 1, -1, 0, 1 ),
        mat<int>( 0, 1, 1, -1 ),

        mat<int>( 1, -1, 1, 0 ),
        mat<int>( 1, 0, 1, -1 ),

        /* We're also adding matrices with twos, although we're not really
         * convinced it's worth it. It depends on the polynomial, anyway.
         *
         * On the hsnfs dlp-1024, only 15% of the special-q's among a test
         * of 1000 find a better estimated yield with the matrices below than
         * without.
         */
        mat<int>( 1, 2, 0, 1 ),
        mat<int>( 0, 1, 1, 2 ),

        mat<int>( 1, -2, -1, 1 ),
        mat<int>( -1, 1, 1, -2 ),

        mat<int>( 2, 1, 1, 1 ),
        mat<int>( 1, 1, 2, 1 ),

        mat<int>( -2, 1, 1, 0 ),
        mat<int>( 1, 0, -2, 1 ),

        mat<int>( 1, 0, 2, 1 ),
        mat<int>( 2, 1, 1, 0 ),

        mat<int>( 1, 2, 1, 1 ),
        mat<int>( 1, 1, 1, 2 ),

        mat<int>( 2, -1, 1, -1 ),
        mat<int>( 1, -1, 2, -1 ),

        mat<int>( -1, 2, 0, 1 ),
        mat<int>( 0, 1, -1, 2 ),
    };
    const int nmatrices = sizeof(shuffle_matrices)/sizeof(shuffle_matrices[0]);
#if 0
    /*
     * The following magma code can be used to generate the list of matrices
     * above. I'm only slightly editing the result so that the identity
     * matrix comes first.
MM:=[Matrix(2,2,[a,b,c,d]):a,b,c,d in [-2..2]];
MM:=[M:M in MM|IsUnit(Determinant(M))];
d:=DiagonalMatrix([1,-1]);
s:=Matrix(2,2,[0,1,1,0]);
npos:=func<v|#[a:a in Eltseq(v)|a gt 0]>;
bestrep:=func<s|x[i] where _,i is Max([npos(v):v in x]) where x is Setseq(s)>;
prepr:=func<m|[Eltseq(m),Eltseq(s*m)]>;
B:=[bestrep(a):a in {{a*b*c*x:a in {1,-1},b in {1,d},c in {1,s}}:x in MM}];
  &cat [prepr(x):x in B];
     */
#endif

    double best_sum = 0;
    int best_r = -1;
    int best_squeeze = -1;

    /* We integrate on 2^(2*N-1) points (well, morally 2^(2N), but we halve
     * that by homogeneity) */
    int N = 5;

    double reference = estimate_yield_in_sieve_area(shuffle_matrices[0], 0, N);
    for(int squeeze = ADJUST_STRATEGY2_MIN_SQUEEZE ; squeeze <= ADJUST_STRATEGY2_MAX_SQUEEZE ; squeeze++) {
        for(int r = 0 ; r < nmatrices ; r++) {
            if (squeeze == 0 && (r & 1)) continue;
            mat<int> const & Sr(shuffle_matrices[r]);
            double sum = estimate_yield_in_sieve_area(Sr, squeeze, N);
            if (sum > best_sum) {
                best_r = r;
                best_squeeze = squeeze;
                best_sum = sum;
            }
            verbose_output_print(0, 4, "# estimated yield for rectangle #%d,%d: %e\n", r, squeeze, sum);
        }
    }

    mat<int> const& shuffle (shuffle_matrices[best_r]);

    logI = ((logA-logA/2) - best_squeeze);
    Q = shuffle * Q;
    J = 1 << (logA/2    + best_squeeze);

    std::ostringstream adjust_message;
    adjust_message << "adjusting by ";
    shuffle.print_me(adjust_message);
    adjust_message << " (+" << std::fixed << std::setprecision(2) <<
            100.0*(best_sum/reference-1) << "%)";

    return round_to_full_bucket_regions(__func__, adjust_message.str());
}/*}}}*/

/* return 0 if we should discard that special-q because the rounded
 * region in the (a,b)-plane is flat to the point of having height 0.
 * For diagnostic, we set this->J to the unrounded value (rounding would
 * give 0) and then we return "false".
 *
 * The current check for discarding is whether we do fill one bucket
 * region or not. If we don't even achieve that, we should of course
 * discard.
 *
 * Now for efficiency reasons, the ``minimum reasonable'' number of
 * bucket regions should be more than that.
 */
int sieve_range_adjust::sieve_info_adjust_IJ()/*{{{*/
{
    using namespace std;
    /* compare skewed max-norms: let u0 = [a0, b0] and u1 = [a1, b1],
     * and u'0 = [a0/sqrt(s), sqrt(s)*b0],
     * u'1 = [a1/sqrt(s), sqrt(s)*b1] where s is the skewness.
     * Assume |u'0| <= |u'1|, so that |u'0|^2 <= |u'0|.|u'1|.

     * We know from Gauss reduction that u'0 and u'1 form an angle of at
     * least pi/3, thus since their determinant is q, we have
     * q = |u'0|*|u'1|*sin(angle(u'0,u'1))>=|u'0|*|u'1|*sqrt(3)/2

     * So that
     *  |u'0|^2 <= |u'0|.|u'1| <= 2*q/sqrt(3)

     * Define B := sqrt(q)*sqrt(2/sqrt(3)), then we have:
     *      |a0|/sqrt(s) <= B
     *  and |b0|*sqrt(s) <= B

     * If we take J <= I/2*min(sqrt(s)*B/|a1|, B/sqrt(s)/|b1|), then for
     * any j <= J we have
     *     |a1|/sqrt(s)*J <= I/2*B
     * and |b1|*sqrt(s)*J <= I/2*B,

     * thus for any i,j with |i|<=I/2 and 0<=j<J, we have:
     *     |a|/sqrt(s) = |a0*i+a1*j|/sqrt(s) <= sqrt(q)*I*sqrt(2/sqrt(3))
     * and |b|*sqrt(s) = |b0*i+b1*j|*sqrt(s) <= sqrt(q)*I*sqrt(2/sqrt(3)).
     */

    /*
     * So the strategy above is cado-nfs's legacy strategy for choosing
     * J, based on I.
     *
     * We now design another strategy. Let A be the target sieve area
     * size. We would like to reach the conditions:
     *
     *     for any i,j with |i|<=I/2 and 0<=j<J:
     *     |a|/sqrt(s) = |a0*i+a1*j|/sqrt(s) <= sqrt(q)*sqrt(A)*sqrt(2/sqrt(3))
     *     |b|*sqrt(s) = |b0*i+b1*j|*sqrt(s) <= sqrt(q)*sqrt(A)*sqrt(2/sqrt(3))
     *     I*J = A
     *
     * The geometrical reasoning given above would be the same. However,
     * to achieve I*J=A, we will have to bump I and J slightly.
     *
     * So we're going to start by the previous approach, and adjust it in
     * a second step.
     */
    const double skew = cpoly->skew;
    const double rt_skew = sqrt(skew);
    verbose_output_vfprint(0, 3, gmp_vfprintf,
            "# Called sieve_info_adjust_IJ((a0=%" PRId64 "; b0=%" PRId64
            "; a1=%" PRId64 "; b1=%" PRId64 "), p=%Zd, skew=%f)\n",
            Q.a0, Q.b0, Q.a1, Q.b1,
            (mpz_srcptr) Q.doing.p, skew);
    if (Q.skewed_norm0(skew) > Q.skewed_norm1(skew)) {
        /* exchange u0 and u1, thus I and J */
        swap(Q.a0, Q.a1);
        swap(Q.b0, Q.b1);
    }
    double maxab1 = MAX(abs(Q.a1) / rt_skew, abs(Q.b1) * rt_skew);
    double B = sqrt (2.0 * mpz_get_d(Q.doing.p) / sqrt (3.0));

    uint32_t I = 1UL << ((logA+1)/2);
    J = 1UL << ((logA-1)/2);
    J = (uint32_t) (B / maxab1 * (double) J);

    /* make sure J does not exceed I/2 */
    J = MIN((uint32_t) J, I >> 1);
    logI = (logA + 1) / 2;

    return round_to_full_bucket_regions(__func__);
}/*}}}*/

int sieve_range_adjust::round_to_full_bucket_regions(const char * origin, std::string const & message)/*{{{*/
{
    /* Compute number of i-lines per bucket region, must be integer */
    uint32_t i = 1U << MAX(LOG_BUCKET_REGION - logI, 0);

    // we no longer need to do that.
    // i *= nb_threads;  /* ensures nb of bucket regions divisible by nb_threads */

    /* Bug 15617: if we round up, we are not true to our promises */
    uint32_t nJ = (J / i) * i; /* Round down to multiple of i */

    if (message.empty()) {
        verbose_output_print(0, 3, "# %s(): logI=%d J=%" PRIu32 "\n", origin, logI, nJ);
    } else {
        verbose_output_print(0, 3, "# %s(): logI=%d J=%" PRIu32 " [%s]\n", origin, logI, nJ, message.c_str());
    }
    /* XXX No rounding if we intend to abort */
    if (nJ > 0) J = nJ;
    return nJ > 0;
}/*}}}*/
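/* Example (illustrative): with LOG_BUCKET_REGION = 16 and logI = 14, a
 * bucket region spans i = 4 complete i-lines, so J = 1023 is rounded down
 * to nJ = 1020. A tiny J = 3 would give nJ = 0 -- not even one full bucket
 * region -- and the caller is told to discard this special-q. */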

/*}}}*/

 
uint32_t sieve_range_adjust::get_minimum_J()
{
    /* no longer dependent on the number of threads */
    return 1 << MAX(1, (LOG_BUCKET_REGION - logI));
}

void sieve_range_adjust::set_minimum_J_anyway()
{
    J = get_minimum_J();
}