Revision 132f63c1a343af4280df47cf9cc0453060d506b7 authored by Jérémie Detrey on 10 March 2014, 10:29:14 UTC, committed by Jérémie Detrey on 10 March 2014, 10:31:12 UTC
1 parent b2cdbc1
Raw File
latsieve.c
#include <stdlib.h>

#include "latsieve.h"
#include "types.h"
#include "ijvec.h"
#include "sublat.h"
#include "gray.h"


/*
 * Given j is a multiple of p. 
 * Compute the next multiple of p, in lex order. 
 * The output is of degree less than J. If failure because this bound is
 * reached, return 0, otherwise return 1.
 *
 * For that, make us of the given basis of multiples of p, as precomputed
 * in the normalized_echelon_multiples() function.
 *
 * NB: in the case of sublat, the given j is not really a multiple of p.
 * The computation is still valid.
 */
static MAYBE_UNUSED
int next_projective_j(ij_t rj, ij_t j, ij_t *basis, int degp, int J)
{
    // First use monic_set_next() on the high part of j.
    ij_t jhi, njhi;
    ij_div_ti(jhi, j, degp);
    int rc = ij_monic_set_next(njhi, jhi, J-degp);
    if (!rc)
        return 0;
    // The degree of the difference between in and out of set_next gives
    // the basis-element to add. (magic!)
    ij_diff(njhi, njhi, jhi);
    int d = ij_deg(njhi);
    ij_add(rj, j, basis[d]);
    // There is an adjustment to do in the case where the degree of rj is
    // larger than the degree of j, due to the fact that we deal with
    // monic polynomials.
    if (d >= ij_deg(jhi)) {
        ASSERT(d > ij_deg(jhi)); // monic case
        // Have to kill the bit d-1, which is currently 2, without
        // touching lower bits.
        for (int k = 2; k < FP_CHAR; ++k) {
            if (d > 0)
                ij_add(rj, rj, basis[d-1]);
            if (d > 1)
                ij_add(rj, rj, basis[d-2]);
        }
    }
    return 1;
}


static inline
void sieve_hit(uint8_t *S, uint8_t scaledegp, ijpos_t pos,
        MAYBE_UNUSED fbprime_srcptr p,
        MAYBE_UNUSED fbprime_srcptr r,
        MAYBE_UNUSED ijpos_t pos0)
{
#ifdef TRACE_POS
  if (pos0+pos == TRACE_POS) {
    fprintf(stderr, "TRACE_POS(%lu): ", pos0+pos);
    fbprime_out(stderr, p); fprintf(stderr, " ");
    fbprime_out(stderr, r); fprintf(stderr, "\n");
    fprintf(stderr, "TRACE_POS(%lu): degnorm is now %d\n",
            pos0+pos, S[pos]-scaledegp);
  }
#endif
#ifndef NDEBUG
  if (S[pos] < scaledegp)
    fprintf(stderr, "faulty pos is %lu\n", pos0+pos);
  ASSERT(S[pos] >= scaledegp);
#endif
  S[pos] -= scaledegp;
}


void sieveSFB(uint8_t *S, unsigned int *thr,
    small_factor_base_ptr FB, unsigned I, unsigned J,
    ij_t j0, ijpos_t pos0, ijpos_t size, sublat_ptr sublat)
{
    *thr = 0;
    for (unsigned int ii = 0; ii < FB->n; ++ii) {
        small_fbideal_ptr gothp = FB->elts[ii];
        int L = gothp->degq;
        int degp = gothp->degp;
        int scaledegp = degp >> SCALE;

        // Larger primes are left to the bucket sieve.
        ASSERT((unsigned)L < I);

        // In case of sublat, the primes of degree 1 gives a uniform
        // contribution and it is better to handle them globally using
        // thresholds.
        // TODO: we recompute this for each bucket region, whereas it
        // could be computed once and for all.
        if (use_sublat(sublat) && L == 1) {
          if (!gothp->proj) {
            fppol16_t qq, rr;
            fppol16_set_fbprime(qq, gothp->q);
            fppol16_set_fbprime(rr, gothp->lambda);
            fppol16_mul(rr, rr, sublat->lat[sublat->n][1]);
            fppol16_add(rr, rr, sublat->lat[sublat->n][0]);
            fppol16_rem(rr, rr, qq);
            if (fppol16_is_zero(rr))
              *thr += gothp->degp;
          } else {
            fppol16_t qq;
            fppol16_set_fbprime(qq, gothp->q);
            fppol16_rem(qq, sublat->lat[sublat->n][1], qq);
            if (fppol16_is_zero(qq))
              *thr += gothp->degp;
          }
          continue;
        }

        // projective roots are handled differently
        if (gothp->proj) {
          // large projective roots are just skipped.
          if (UNLIKELY((unsigned)L > J))
            continue;
          // First time round?
          if (UNLIKELY(!pos0)) {
            // Find the first line to fill. If no sublat, this is zero.
            // Otherwise, there is a bit of computation.
            ij_set_zero(gothp->current);
            if (use_sublat(sublat)) {
              ij_t tmp0, tmp1, ijmod;
              ij_set_16(ijmod, sublat->modulus);
              ij_set_16(tmp0, sublat->lat[sublat->n][1]);
              ij_mulmod(tmp0, gothp->tildep, tmp0, ijmod);
              ij_set_fbprime(tmp1, gothp->q);
              ij_mul(tmp1, tmp1, tmp0);
              ij_div(gothp->current, tmp1, ijmod);
            }
          }
          ij_t j;
          int rcj = 1;
          ij_set(j, gothp->current);
          while (rcj) {
            ijpos_t start = ijvec_get_start_pos(j, I, J) - pos0;
            if (start >= size)
              break;

            // Sieve the whole line
#ifndef USE_F2
            // TODO: this should be a big memsub().
            ij_t i;
            int rci = 1;
            for (ij_set_zero(i); rci; rci = ij_set_next(i, i, I)) {
              ijpos_t pos = start + ijvec_get_offset(i, I);
              sieve_hit(S, scaledegp, pos, gothp->q, gothp->r, pos0);
            }
#else
            // For GF(2), this becomes so simple (and Gcc does it well).
            for(int i=0; i < 1<<I; ++i)
              S[start+i] -= scaledegp;
#endif

            rcj = next_projective_j(j, j, gothp->projective_basis, L, J);
          }
          ij_set(gothp->current, j); // remember the next line to sieve.
          continue;
        }

        // Only the first time round.
        if (UNLIKELY(!pos0)) {
          if (use_sublat(sublat)) {
            // In the case of sublattices, compute the starting point for the
            // sieve by gothp for the current sublattice.
            // TODO: Way too expensive!
            // xi and yi have degree 2
            ij_t i0, xi, xip, yip;
            ij_set_16(xi, sublat->lat[sublat->n][0]);
            fbprime_t tmp0;
            ij_t tmp1;
            fbprime_set_16(tmp0, sublat->lat[sublat->n][1]);
            fbprime_mulmod(tmp0, gothp->lambda, tmp0, gothp->q);
            ij_set_fbprime(yip, tmp0);
            ij_add(xip, xi, yip);
            ij_set_16(tmp1, sublat->modulus);
            ij_mulmod(xip, xip, gothp->tildep, tmp1);
            ij_set_fbprime(tmp1, gothp->q);
            ij_mul(i0, xip, tmp1);
            ij_add(i0, i0, yip);

            ij_t ijmod;
            ij_set_16(ijmod, sublat->modulus);
            ij_sub(i0, i0, xi);
            ij_div(gothp->current, i0, ijmod);
          } else {
            ij_set_zero(gothp->current);
          }
        }

        ij_t i, j, jj;
        int rcj = 1, degj, degjj;
        for (ij_set(j, j0), degj = MAX(ij_deg(j), 0); rcj; ) {
          ijpos_t start = ijvec_get_start_pos(j, I, J) - pos0;
          if (start >= size)
            break;
          ij_set(i, gothp->current);
          ijpos_t pos = start + ijvec_get_offset(i, I);
          if (LIKELY(pos0 || pos))
            sieve_hit(S, scaledegp, pos, gothp->q, gothp->r, pos0);

          // Size of Gray codes to use for the inner loop.
#         if   defined(USE_F2)
#           define ENUM_LATTICE_UNROLL 5
#         elif defined(USE_F3)
#           define ENUM_LATTICE_UNROLL 5
#         endif

          // Unrolled p-ary Gray code of size ENUM_LATTICE_UNROLL.
          static const uint8_t gray[] = { GRAY(ENUM_LATTICE_UNROLL) };
          unsigned             ngray  = GRAY_LENGTH(ENUM_LATTICE_UNROLL);

          // Just in case the dimension of the vector space is lower than
          // ENUM_LATTICE_UNROLL.
          unsigned gray_dim = MIN(I-L, ENUM_LATTICE_UNROLL);
          unsigned k0       = ngray - GRAY_LENGTH(gray_dim);

          ij_t s, t;
          ij_set_zero(t);
          int rc = I-L > ENUM_LATTICE_UNROLL;
          do {
            // Inner-level Gray code enumeration: just go through the Gray code
            // array, each time adding the indicated basis vector.
            // This uses undocumented feature of GCC to access the lower
            // 32 bits of a register with the %k prefix.
            // See gcc-4.7.1/gcc/config/i386/i386.md
#if defined(USE_F2) 
            if (k0 == 0 && sizeof(ij_t) == 4) {
#             define DOGRAY(n)                      \
                "xorl     " #n "(%[B]), %k[i]\n\t"  \
                "subb     %[degp], (%[S],%[i])\n\t"
#             define DOALLGRAY2 DOGRAY(0)  DOGRAY(4)  DOGRAY(0)
#             define DOALLGRAY3 DOALLGRAY2 DOGRAY(8)  DOALLGRAY2
#             define DOALLGRAY4 DOALLGRAY3 DOGRAY(12) DOALLGRAY3
#             define DOALLGRAY5 DOALLGRAY4 DOGRAY(16) DOALLGRAY4
#             define DOALLGRAY6 DOALLGRAY5 DOGRAY(20) DOALLGRAY5
#             define DOALLGRAY7 DOALLGRAY6 DOGRAY(24) DOALLGRAY6
#             define DOALLGRAY8 DOALLGRAY7 DOGRAY(28) DOALLGRAY7
#             define DOALLGRAY  CAT(DOALLGRAY, ENUM_LATTICE_UNROLL)
              uint64_t ii = i[0];
              uint8_t  dd = scaledegp;
              __asm volatile( DOALLGRAY
                            : [i]   "+r" (ii)
                            : [S]    "r" (S+start),
                              [B]    "r" (gothp->basis),
                              [degp] "r" (dd)
                            : "memory");
              ASSERT((ii >> 32) == 0);
              i[0] = ii;
            } else 
#endif
            {
                for (unsigned k = k0; k < ngray; ++k) {
                    ij_add(i, i, gothp->basis[gray[k]]);
                    pos = start + ijvec_get_offset(i, I);
                    sieve_hit(S, scaledegp, pos, gothp->q, gothp->r, pos0);
                }
            }
            k0 = 0;


            // Outer-level Gray code enumeration: using ij_set_next, the degree
            // of the difference with the previous one indicates which basis
            // vector should be added to the current lattice point.
            // rc is set to 0 when all vectors have been enumerated.
            ij_set(s, t);
            rc = rc && ij_set_next(t, t, I-L-ENUM_LATTICE_UNROLL);
            if (rc) {
              ij_diff(s, s, t);
              ij_add(i, i, gothp->basis[ij_deg(s)+ENUM_LATTICE_UNROLL]);
              pos = start + ijvec_get_offset(i, I);
              sieve_hit(S, scaledegp, pos, gothp->q, gothp->r, pos0);
            }
          } while (rc);

          ij_set(jj, j);
          rcj = ij_monic_set_next(j, j, J);
          if (rcj) {
            ij_diff(jj, jj, j);
            degjj = ij_deg(jj);
            ij_add(gothp->current, gothp->current,
                   gothp->basis[I-L+degjj]);
            if (degjj > degj) {
              ij_sub(gothp->current, gothp->current,
                     gothp->adjustment_basis[degjj-1]);
              degj = degjj;
            }
          }
        }
    }
}
back to top