Revision d12d603036b334b53d6e886cfa985a082e981860 authored by Emmanuel Thomé on 29 January 2021, 06:20:31 UTC, committed by Emmanuel Thomé on 29 January 2021, 21:39:17 UTC
1 parent 590bfe4
Raw File
las-unsieve.cpp
#include "cado.h" // IWYU pragma: keep

/* This compilation units reacts to TRACK_CODE_PATH and uses macros
 * such as WHERE_AM_I_UPDATE.
 * This compilation unit _must_ produce different object files depending
 * on the value of TRACK_CODE_PATH.
 * The WHERE_AM_I_UPDATE macro itself is defined in las-where-am-i.hpp
 */

#include <algorithm>        // for max
#include <climits>          // for UINT_MAX
#include <cstdint>          // for uint32_t
#include <cstdlib>          // for abs, abort, size_t, NULL
#include <cstring>          // for memcpy, memset
#include <vector>           // for vector
#include "fb-types.h"       // for sublat_t
#include "gcd.h"       // for bin_gcd_int64_safe
#include "gpf.h"        // gpf_init
#include "las-unsieve.hpp"  // for unsieve_data, unsieve_data::pattern_t
#include "macros.h"         // for ASSERT_ALWAYS, no_break, MAYBE_UNUSED
#include "ularith.h"        // for ularith_invmod
#ifdef TRACE_K
#include "las-where-am-i.hpp"             // for where_am_I, WHERE_AM_I_UPDATE
#include "las-output.hpp"   // IWYU pragma: keep
#include "verbose.h"    // verbose_output_print
#endif

static const int verify_gcd = 0; /* Enable slow but thorough test */

/* Set every stride-th byte, starting at index 0, to 255 in an array of
   stride unsieve_data::pattern_t's, and set all other bytes to 0. */
static void
minisieve(unsieve_data::pattern_t * const array, const size_t stride)
{
    memset (array, 0, stride * sizeof(unsieve_data::pattern_t));
    for (size_t i = 0; i < stride * sizeof(unsieve_data::pattern_t); i += stride)
      ((unsigned char *) array)[i] = 255;
}

unsieve_data::unsieve_data()
{
  entries = NULL;
  Jmax = 0;
}

unsieve_data::unsieve_data(int logI, int logA)
{
  ASSERT_ALWAYS(logI >= 0);
  ASSERT_ALWAYS(logA >= logI);
  Jmax = 1UL << (logA - logI);
  /* Store largest prime factor of k in us.lpf[k], 0 for k=0, 1 for k=1 */
  entries = new entry[Jmax];
  entries[0] = entry(0,0);
  entries[1] = entry(1,1);
  gpf_init(Jmax - 1);
  for (unsigned int k = 2U; k < Jmax; k++)
    {
      unsigned int p, c;
      p = gpf_get(k);
      c = k; do {c /= p;} while (c % p == 0);
      entries[k] = entry(p, c);
    }

    minisieve(pattern3, 3);
    minisieve(pattern5, 5);
    minisieve(pattern7, 7);
}

unsieve_data::unsieve_data(unsieve_data const & o) : Jmax(o.Jmax)
{
    entries = NULL;
    if (Jmax == 0) return;
    entries = new entry[Jmax];
    memcpy(entries, o.entries, Jmax * sizeof(entry));
    memcpy(pattern3, o.pattern3, sizeof(pattern3));
    memcpy(pattern5, o.pattern5, sizeof(pattern5));
    memcpy(pattern7, o.pattern7, sizeof(pattern7));
}

unsieve_data & unsieve_data::operator=(unsieve_data const & o)
{
    if (Jmax) delete[] entries;
    Jmax = o.Jmax;
    entries = new entry[Jmax];
    memcpy(entries, o.entries, Jmax * sizeof(entry));
    memcpy(pattern3, o.pattern3, sizeof(pattern3));
    memcpy(pattern5, o.pattern5, sizeof(pattern5));
    memcpy(pattern7, o.pattern7, sizeof(pattern7));
    return *this;
}

unsieve_data::~unsieve_data()
{
    if (Jmax == 0) return;
    delete[] entries;
}

static inline void
unsieve_one_prime (unsigned char *line_start, const unsigned int p, 
                   const unsigned int j, const unsigned int start_idx,
                   const unsigned int I)
{
  unsigned int x, np = p; /* if 2|j, np=2p, else np=p */

  x = start_idx;
  if (j % 2U == 0U)
    {
      np += p;
      if (x % 2U == 0U)
        x += p;
    }
  for ( ; x < I; x += np)
    line_start[x] = 255;
}


static inline void
unsieve_3(unsigned char *line_start, const unsigned int start_idx,
          const unsigned int I, unsieve_data const & us)
{
  const unsigned int I_upt  = I / sizeof (unsieve_data::pattern_t);
  unsigned int i, pattern_idx;
  unsieve_data::pattern_t p0, p1, p2;
  unsieve_data::pattern_t * ul_line_start = (unsieve_data::pattern_t *) line_start;

  if (sizeof(unsieve_data::pattern_t) == 4) {
    /* -4^(-1) == 2 (mod 3) */
    pattern_idx = (2 * start_idx) % 3;
  } else if (sizeof(unsieve_data::pattern_t) == 8) {
    /* -8^(-1) == 1 (mod 3) */
    pattern_idx = start_idx;
  } else if (sizeof(unsieve_data::pattern_t) == 16) {
    /* -16^(-1) == 2 (mod 3) */
    pattern_idx = (2 * start_idx) % 3;
  } else
    abort();
  
  p0 = us.pattern3[pattern_idx];
  p1 = us.pattern3[(pattern_idx + 1) % 3];
  p2 = us.pattern3[(pattern_idx + 2) % 3];

  ASSERT_ALWAYS(((unsigned char *)&p0)[start_idx] == 255);
  
  /* Apply pattern to array */
  for (i = 0U; i < I_upt - 2U; i += 3U)
    {
      UNSIEVE_OR(ul_line_start[i], p0);
      UNSIEVE_OR(ul_line_start[i + 1], p1);
      UNSIEVE_OR(ul_line_start[i + 2], p2);
    }
  if (i < I_upt)
    UNSIEVE_OR(ul_line_start[i], p0);
  if (i + 1 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 1], p1);
}


static inline void
unsieve_5(unsigned char *line_start, const unsigned int start_idx,
          const unsigned int I, unsieve_data const & us)
{
  const unsigned int I_upt  = I / sizeof (unsieve_data::pattern_t);
  unsigned int i;
  unsieve_data::pattern_t p0, p1, p2, p3, p4;
  unsieve_data::pattern_t * ul_line_start = (unsieve_data::pattern_t *) line_start;
  size_t pattern_idx;

  if (sizeof(unsieve_data::pattern_t) == 4) {
    /* -4^(-1) == 1 (mod 5) */
    pattern_idx = start_idx;
  } else if (sizeof(unsieve_data::pattern_t) == 8) {
    /* -8^(-1) == 3 (mod 5) */
    pattern_idx = (3 * start_idx) % 5;
  } else if (sizeof(unsieve_data::pattern_t) == 16) {
    /* -16^(-1) == -1 (mod 5) */
    pattern_idx = (5 - start_idx) % 5;
  } else
    abort();
  
  p0 = us.pattern5[pattern_idx];
  p1 = us.pattern5[(pattern_idx + 1) % 5];
  p2 = us.pattern5[(pattern_idx + 2) % 5];
  p3 = us.pattern5[(pattern_idx + 3) % 5];
  p4 = us.pattern5[(pattern_idx + 4) % 5];

  if (start_idx < sizeof(p0)) {
      ASSERT_ALWAYS(((unsigned char *)&p0)[start_idx] == 255);
  } else {
      ASSERT_ALWAYS(start_idx < 2 * sizeof(p0));
      ASSERT_ALWAYS(((unsigned char *)&p1)[start_idx - sizeof(p0)] == 255);
  }

  /* Apply pattern to array */
  for (i = 0U; i < I_upt - 4U; i += 5U)
    {
      UNSIEVE_OR(ul_line_start[i], p0);
      UNSIEVE_OR(ul_line_start[i + 1], p1);
      UNSIEVE_OR(ul_line_start[i + 2], p2);
      UNSIEVE_OR(ul_line_start[i + 3], p3);
      UNSIEVE_OR(ul_line_start[i + 4], p4);
    }
  if (i < I_upt)
    UNSIEVE_OR(ul_line_start[i], p0);
  if (i + 1 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 1], p1);
  if (i + 2 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 2], p2);
  if (i + 3 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 3], p3);
}

static inline void
unsieve_7(unsigned char *line_start, const unsigned int start_idx,
          const unsigned int I, unsieve_data const & us)
{
  const unsigned int I_upt  = I / sizeof (unsieve_data::pattern_t);
  unsigned int i;
  unsieve_data::pattern_t p0, p1, p2, p3, p4, p5, p6;
  unsieve_data::pattern_t * ul_line_start = (unsieve_data::pattern_t *) line_start;
  size_t pattern_idx;

  if (sizeof(unsieve_data::pattern_t) == 4) {
    /* -4^(-1) == 5 (mod 7) */
    pattern_idx = (5 * start_idx) % 7;
  } else if (sizeof(unsieve_data::pattern_t) == 8) {
    /* -8^(-1) == -1 (mod 7) */
    pattern_idx = (7 - start_idx) % 7;
  } else if (sizeof(unsieve_data::pattern_t) == 16) {
    /* -16^(-1) == 3 (mod 7) */
    pattern_idx = (3 * start_idx) % 7;
  } else
    abort();
  
  p0 = us.pattern7[pattern_idx];
  p1 = us.pattern7[(pattern_idx + 1) % 7];
  p2 = us.pattern7[(pattern_idx + 2) % 7];
  p3 = us.pattern7[(pattern_idx + 3) % 7];
  p4 = us.pattern7[(pattern_idx + 4) % 7];
  p5 = us.pattern7[(pattern_idx + 5) % 7];
  p6 = us.pattern7[(pattern_idx + 6) % 7];

  if (start_idx < sizeof(p0)) {
      ASSERT_ALWAYS(((unsigned char *)&p0)[start_idx] == 255);
  } else {
      ASSERT_ALWAYS(start_idx < 2 * sizeof(p0));
      ASSERT_ALWAYS(((unsigned char *)&p1)[start_idx - sizeof(p0)] == 255);
  }

  /* Apply pattern to array */
  for (i = 0U; i < I_upt - 6U; i += 7U)
    {
      UNSIEVE_OR(ul_line_start[i], p0);
      UNSIEVE_OR(ul_line_start[i + 1], p1);
      UNSIEVE_OR(ul_line_start[i + 2], p2);
      UNSIEVE_OR(ul_line_start[i + 3], p3);
      UNSIEVE_OR(ul_line_start[i + 4], p4);
      UNSIEVE_OR(ul_line_start[i + 5], p5);
      UNSIEVE_OR(ul_line_start[i + 6], p6);
    }
  if (i < I_upt)
    UNSIEVE_OR(ul_line_start[i], p0);
  if (i + 1 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 1], p1);
  if (i + 2 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 2], p2);
  if (i + 3 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 3], p3);
  if (i + 4 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 4], p4);
  if (i + 5 < I_upt)
    UNSIEVE_OR(ul_line_start[i + 5], p5);
}


static void
unsieve_not_coprime_line(unsigned char * line_start,
                         unsigned int j,
                         int i0, int i1,
                         unsigned int min_p,
                         unsieve_data const & us)
{
  unsigned int p, c=j;
  int start_idx;

  if (j == 0)
    return;

  while (c % 2U == 0U) 
    c >>= 1;

  while (1)
    {
      p = us.entries[c].lpf; /* set p to largest prime factor of c */
      if (p < min_p)
        return;
      start_idx = (-i0) % p; if (start_idx < 0) start_idx += p;
      c = us.entries[c].cof;
      if (p <= 7)
        break;
      unsieve_one_prime (line_start, p, j, start_idx, i1 - i0);
    }
  
  if (p == 7U)
    {
      unsieve_7(line_start, start_idx, i1 - i0, us);
      p = us.entries[c].lpf;
      start_idx = (-i0) % p; if (start_idx < 0) start_idx += p;
      c = us.entries[c].cof;
    }

  if (p < min_p)
    return;
  
  if (p == 5U)
    {
      unsieve_5(line_start, start_idx, i1 - i0, us);
      p = us.entries[c].lpf;
      start_idx = (-i0) % p; if (start_idx < 0) start_idx += p;
      c = us.entries[c].cof;
    }

  if (p < min_p)
    return;
  
  if (p == 3U)
      unsieve_3(line_start, start_idx, i1 - i0, us);

  ASSERT_ALWAYS(c <= 1);
}

j_divisibility_helper::j_divisibility_helper(uint32_t J)
{
    /* Store largest prime factor of k in j_div[k].p, for 1 < k < J,
       and store 0 for k=0, 1 for k=1 */
    ASSERT_ALWAYS(J >= 2);
    ASSERT_ALWAYS(!(J & (J-1)));        /* J must be a power of two */
    entries.reserve(J);
    entries.push_back({ 0U, 0U, 0U, 0U });
    entries.push_back({ 1U, 1U, 1U, UINT_MAX });
    for (unsigned int k = 2; k < J; k++) {
        /* Find largest prime factor of k */
        unsigned int p, c = k;
        for (p = 2U; p * p <= c; p += 1U + p % 2U)
        {
            while (c % p == 0U)
                c /= p;
            if (c == 1U)
                break;
        }
        p = (c == 1U) ? p : c;
        c = k; do {c /= p;} while (c % p == 0);
        unsigned int inv = p == 2 ? 0U : (unsigned int)ularith_invmod(p);
        unsigned int bound = UINT_MAX / p;
        entries.push_back({ p, c, inv, bound});
    }
}

static inline int
sieve_info_test_lognorm (const unsigned char C1, const unsigned char C2,
                         const unsigned char S1, const unsigned char S2)
{
  return S1 <= C1 && S2 <= C2;
}

/* In SS[2][x_start] ... SS[2][x_start * 2^logI - 1], look for survivors.
   We test divisibility of the resulting i value by the trial-divided primes.
   Return the number of survivors found. This function works for all j */
MAYBE_UNUSED static void
search_survivors_in_line1(unsigned char * const SS[2],
        const unsigned char bound[2],
        unsigned int j, 
        int i0, int i1,
        int N MAYBE_UNUSED, j_divisibility_helper const & j_div,
        unsigned int td_max, std::vector<uint32_t> &survivors)
{
    unsigned int div[6][2], nr_div;

    nr_div = extract_j_div(div, j, j_div, 3, td_max);
    ASSERT_ALWAYS(nr_div <= 6);

    for (int x = 0; x < (i1 - i0); x++) {
        if (!sieve_info_test_lognorm(bound[0], bound[1], SS[0][x], SS[1][x]))
        {
            SS[0][x] = 255;
            continue;
        }

        /* The very small prime used in the bound pattern, and unsieving larger
           primes have not identified this as gcd(i,j) > 1. It remains to check
           the trial-divided primes. */
        const unsigned int i = abs (i0 + x);
        int divides = 0;
        switch (nr_div) {
            case 6: divides |= (i * div[5][0] <= div[5][1]);no_break();
            case 5: divides |= (i * div[4][0] <= div[4][1]);no_break();
            case 4: divides |= (i * div[3][0] <= div[3][1]);no_break();
            case 3: divides |= (i * div[2][0] <= div[2][1]);no_break();
            case 2: divides |= (i * div[1][0] <= div[1][1]);no_break();
            case 1: divides |= (i * div[0][0] <= div[0][1]);no_break();
            case 0: break;
        }

        if (divides) {
            if (verify_gcd)
                ASSERT_ALWAYS(bin_gcd_int64_safe (i, j) != 1);
  #ifdef TRACE_K
            if (trace_on_spot_Nx(N, x)) {
                verbose_output_print(TRACE_CHANNEL, 0, "# Slot [%u] in bucket %u has non coprime (i,j)=(%d,%u)\n",
                        x, N, i, j);
            }
  #endif
            SS[0][x] = 255;
        } else {
            survivors.push_back(x);
            if (verify_gcd)
                ASSERT_ALWAYS(bin_gcd_int64_safe (i, j) == 1);
  #ifdef TRACE_K
            if (trace_on_spot_Nx(N, x)) {
                verbose_output_print(TRACE_CHANNEL, 0, "# Slot [%u] in bucket %u is survivor with coprime (i,j)\n",
                        x, N);
            }
  #endif
        }
      }
}


/* same, but with one side only.
 *
 *
 * XXX I think the overall structure would need a cleanup anyway, but at
 * least here we can refactor somewhat.
 */

MAYBE_UNUSED static void
search_survivors_in_line1_oneside(unsigned char * Sf,
        const unsigned char bound,
        unsigned int j, 
        int i0, int i1,
        int N MAYBE_UNUSED, j_divisibility_helper const & j_div,
        unsigned int td_max, std::vector<uint32_t> &survivors)
{
    unsigned int div[6][2], nr_div;

    nr_div = extract_j_div(div, j, j_div, 3, td_max);
    ASSERT_ALWAYS(nr_div <= 6);

    for (int x = 0; x < (i1 - i0); x++) {
        if (Sf[x] > bound) {
            Sf[x] = 255;
            continue;
        }

        /* The very small prime used in the bound pattern, and unsieving larger
           primes have not identified this as gcd(i,j) > 1. It remains to check
           the trial-divided primes. */
        const unsigned int i = abs (i0 + x);
        int divides = 0;
        switch (nr_div) {
            case 6: divides |= (i * div[5][0] <= div[5][1]);no_break();
            case 5: divides |= (i * div[4][0] <= div[4][1]);no_break();
            case 4: divides |= (i * div[3][0] <= div[3][1]);no_break();
            case 3: divides |= (i * div[2][0] <= div[2][1]);no_break();
            case 2: divides |= (i * div[1][0] <= div[1][1]);no_break();
            case 1: divides |= (i * div[0][0] <= div[0][1]);no_break();
            case 0: break;
        }

        if (divides) {
            if (verify_gcd)
                ASSERT_ALWAYS(bin_gcd_int64_safe (i, j) != 1);
  #ifdef TRACE_K
            if (trace_on_spot_Nx(N, x)) {
                verbose_output_print(TRACE_CHANNEL, 0, "# Slot [%u] in bucket %u has non coprime (i,j)=(%d,%u)\n",
                        x, N, i, j);
            }
  #endif
            Sf[x] = 255;
        } else {
            survivors.push_back(x);
            if (verify_gcd)
                ASSERT_ALWAYS(bin_gcd_int64_safe (i, j) == 1);
  #ifdef TRACE_K
            if (trace_on_spot_Nx(N, x)) {
                verbose_output_print(TRACE_CHANNEL, 0, "# Slot [%u] in bucket %u is survivor with coprime (i,j)\n",
                        x, N);
            }
  #endif
        }
      }
}


/* linefragment is used when logI > LOG_BUCKET_REGION ; it is a positive
 * integer multiple of 2^LOG_BUCKET_REGION ; so the sub-line is actually
 * for:
 * -I/2 + linefragment <= i < -I/2 + MIN(2^LOG_BUCKET_REGION, I)
 */
void
search_survivors_in_line(unsigned char * const SS[2], 
        const unsigned char bound[2],
        unsigned int j,
        int i0, int i1,
        int N, j_divisibility_helper const & j_div,
        unsigned int td_max, unsieve_data const & us,
        std::vector<uint32_t> &survivors, sublat_t sublat)
{
    ASSERT_ALWAYS(SS[0] || SS[1]);
    unsigned char * Sf = SS[0];

    if (SS[0] && SS[1]) {

        /* In line j = 0, only the coordinate (i, j) = (-1, 0) may survive */
        // FIXME: in sublat mode, this is broken!
        if (j == 0 && (!sublat.m)) {
            if (i0 <= 0 && i1 > 0) {
                unsigned char s0 = SS[0][1-i0];
                unsigned char s1 = SS[1][1-i0];
                memset(SS[0], 255, i1 - i0);
                if (s0 <= bound[0] && s1 <= bound[1]) {
                    SS[0][1 - i0] = s0;
                    SS[1][1 - i0] = s1;
                    survivors.push_back(1 - i0);
                }
            } else {
                memset(SS[0], 255, i1 - i0);
            }
            return;
        }

        // Naive version when we have sublattices, because unsieving is
        // harder. TODO: implement a fast version
        if (sublat.m) {
            for (int x = 0; x < (i1 - i0); x++) {
                if (!sieve_info_test_lognorm(bound[0], bound[1], SS[0][x], SS[1][x])) {
                    SS[0][x] = 255;
                    continue;
                }
                const unsigned int i = abs(int(sublat.m)*(i0 + x)+int(sublat.i0));
                const unsigned int jj = sublat.m*j+sublat.j0;
                if ((((jj % 2) == 0) && ((i % 2) == 0)) ||
                        (bin_gcd_int64_safe (i, jj) != 1)) {
                    SS[0][x] = 255;
                } else {
                    survivors.push_back(x);
                }
            }
            return;
        }

        unsieve_not_coprime_line(Sf, j, i0, i1, td_max + 1, us);

#if defined(HAVE_SSE2)
        search_survivors_in_line_sse2(SS, bound, j, i0, i1, N, j_div, td_max,
                survivors);
#else
        search_survivors_in_line1(SS, bound, j, i0, i1, N, j_div, td_max,
                survivors);
#endif
    } else {
        unsigned char b = bound[0];
        if (!Sf) {
            Sf = SS[1];
            b = bound[1];
        }
        /* ok, here only Sf is non-null. We want the values below b. */

        /* In line j = 0, only the coordinate (i, j) = (-1, 0) may survive */
        // FIXME: in sublat mode, this is broken!
        if (j == 0 && (!sublat.m)) {
            if (i0 <= 0 && i1 > 0) {
                unsigned char s = Sf[1-i0];
                memset(Sf, 255, i1 - i0);
                if (s <= b) {
                    Sf[1 - i0] = s;
                    survivors.push_back(1 - i0);
                }
            } else {
                memset(Sf, 255, i1 - i0);
            }
            return;
        }

        // Naive version when we have sublattices, because unsieving is
        // harder. TODO: implement a fast version
        if (sublat.m) {
            for (int x = 0; x < (i1 - i0); x++) {
                if (Sf[x] > b) {
                    Sf[x] = 255;
                    continue;
                }
                const unsigned int i = abs(int(sublat.m)*(i0 + x))+int(sublat.i0);
                const unsigned int jj = sublat.m*j+sublat.j0;
                if ((((jj % 2) == 0) && ((i % 2) == 0)) ||
                        (bin_gcd_int64_safe (i, jj) != 1)) {
                    Sf[x] = 255;
                } else {
                    survivors.push_back(x);
                }
            }
            return;
        }

        unsieve_not_coprime_line(Sf, j, i0, i1, td_max + 1, us);

#if defined(HAVE_SSE2)
        search_survivors_in_line_sse2_oneside(Sf, b, j, i0, i1, N, j_div, td_max,
                survivors);
#else
        search_survivors_in_line1_oneside(Sf, b, j, i0, i1, N, j_div, td_max,
                survivors);
#endif
    }
}
back to top