Revision d12d603036b334b53d6e886cfa985a082e981860 authored by Emmanuel Thomé on 29 January 2021, 06:20:31 UTC, committed by Emmanuel Thomé on 29 January 2021, 21:39:17 UTC
1 parent 590bfe4
las-unsieve-sse2.cpp
#include "cado.h" // IWYU pragma: keep
/* This compilation units reacts to TRACK_CODE_PATH and uses macros
* such as WHERE_AM_I_UPDATE.
* This compilation unit _must_ produce different object files depending
* on the value of TRACK_CODE_PATH.
* The WHERE_AM_I_UPDATE macro itself is defined in las-where-am-i.hpp
*/
#ifdef HAVE_SSE2
#include <algorithm> // for max
#include <emmintrin.h> // for __m128i, _mm_xor_si128, _mm_set1_epi8
#include <cstdint> // for uint32_t
#include <cstdlib> // for size_t, abs
#include <vector> // for vector
#ifdef TRACE_K
#include "las-where-am-i.hpp" // for where_am_I, WHERE_AM_I_UPDATE
#include "las-output.hpp" // IWYU pragma: keep
#include "verbose.h" // verbose_output_print
#endif
#include "las-unsieve.hpp" // for extract_j_div, j_divisibility_helper, sea...
#include "macros.h" // for ASSERT_ALWAYS, MAYBE_UNUSED, no_break
#include "ularith.h" // for ularith_ctz
#include "gcd.h" // for bin_gcd_int64_safe
static const int verify_gcd = 0; /* Enable slow but thorough test */
static const __m128i sign_conversion = _mm_set1_epi8(-128);
static const __m128i even_masks[2] = {
_mm_set_epi8(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0),
_mm_set1_epi8(0xff)};
static const __m128i ff = _mm_set1_epi8(0xff);
static inline unsigned int
sieve_info_test_lognorm_sse2_mask(__m128i * S0, const __m128i pattern0,
const __m128i *S1, const __m128i pattern1)
{
__m128i a = *S0;
__m128i r = *S1;
__m128i m1, m2;
/* _mm_cmpgt_epi8() performs signed comparison, but we have unsigned
bytes. We can switch to signed in a way that preserves ordering by
flipping the MSB, e.g., 0xFF (255 unsigned) becomes 0x7F (+127 signed),
and 0x00 (0 unsigned) becomes 0x80 (-128 signed).
If a byte in the first operand is greater than the corresponding byte in
the second operand, the corresponding byte in the result is set to all 1s
(i.e., to 0xFF); otherwise, it is set to all 0s.
Normally, S[x] <= bound means a sieve survivor. However, for skipping over
locations where 3 | gcd(i,j), we set the bound to 0 in the pattern at
those locations. We then need a comparison that never produces a survivor
in those locations, even when S[x] is in fact 0. Thus we initialise the
pattern to bound + 1, then set the bound to 0 where 3 | gcd(i,j), and
change the comparison to S[x] < bound, which is guaranteed not to let any
survivors through where the pattern byte is 0. */
m1 = _mm_cmpgt_epi8 (pattern0, _mm_xor_si128(a, sign_conversion));
m2 = _mm_cmpgt_epi8 (pattern1, _mm_xor_si128(r, sign_conversion));
/* m1 is 0xFF where pattern[x] > S0[x], i.e., where it survived.
Same for S1. */
/* Logically AND the two masks: survivor only where both sides survived */
m1 = _mm_and_si128(m1, m2);
/* m1 is 0xFF in those locations where the sieve entry survived on
both sides */
/* For the OR mask we need the bit complement, via m1 XOR 0xFF */
m2 = _mm_xor_si128(m1, ff);
*S0 = _mm_or_si128(a, m2);
/* Do we want to update this one? */
// *S1 = _mm_or_si128(r, m2);
/* Compute mask of non-zero bytes */
return (unsigned int) _mm_movemask_epi8(m1);
}
static inline unsigned int
sieve_info_test_lognorm_sse2_mask_oneside(__m128i * S0, const __m128i pattern0)
{
__m128i a = *S0;
__m128i m1 = _mm_cmpgt_epi8 (pattern0, _mm_xor_si128(a, sign_conversion));
*S0 = _mm_or_si128(a, _mm_xor_si128(m1, ff));
return (unsigned int) _mm_movemask_epi8(m1);
}
/* Look for survivors as indicated by a bit mask.
We still have to test divisibility of the resulting i value by the
trial-divided primes. Return the number of survivors found. */
static inline void
search_single_survivors_mask(unsigned char * const SS,
unsigned int j,
int i0,
int i1 MAYBE_UNUSED,
int N MAYBE_UNUSED,
int x_start,
unsigned int nr_div,
unsigned int (*div)[2],
unsigned int bitmask,
std::vector<uint32_t> &survivors)
{
for (int x = x_start; UNLIKELY(bitmask != 0); x++) {
const int tz = ularith_ctz(bitmask);
x += tz;
bitmask >>= tz + 1;
/* The very small prime used in the bound pattern, and unsieving larger
primes have not identified this as gcd(i,j) > 1. It remains to check
the trial-divided primes. */
const unsigned int i = abs (i0 + x);
int divides = 0;
switch (nr_div) {
case 6: divides |= (i * div[5][0] <= div[5][1]); no_break();
case 5: divides |= (i * div[4][0] <= div[4][1]); no_break();
case 4: divides |= (i * div[3][0] <= div[3][1]); no_break();
case 3: divides |= (i * div[2][0] <= div[2][1]); no_break();
case 2: divides |= (i * div[1][0] <= div[1][1]); no_break();
case 1: divides |= (i * div[0][0] <= div[0][1]); no_break();
case 0: while(0){};
}
if (divides)
{
if (verify_gcd)
ASSERT_ALWAYS(bin_gcd_int64_safe (i, j) != 1);
#ifdef TRACE_K
if (trace_on_spot_Nx(N, x)) {
verbose_output_print(TRACE_CHANNEL, 0, "# Slot [%u] in bucket %u has non coprime (i,j)=(%d,%u)\n",
x, N, i, j);
}
#endif
SS[x] = 255;
} else {
survivors.push_back(x);
if (verify_gcd)
ASSERT_ALWAYS(bin_gcd_int64_safe (i, j) == 1);
#ifdef TRACE_K
if (trace_on_spot_Nx(N, x)) {
verbose_output_print(TRACE_CHANNEL, 0, "# Slot [%u] in bucket %u is survivor with coprime (i,j)\n",
x, N);
}
#endif
}
}
}
/* This function works for all j. Uses SSE2. */
static void
search_survivors_in_line1_sse2(unsigned char * const SS[2],
const unsigned char bound[2],
unsigned int j,
int i0, int i1,
int N MAYBE_UNUSED,
j_divisibility_helper const & j_div,
unsigned int td_max,
std::vector<uint32_t> &survivors)
{
unsigned int div[6][2], nr_div;
nr_div = extract_j_div(div, j, j_div, 3, td_max);
ASSERT_ALWAYS(nr_div <= 6);
/* If j is even, set all the even entries in the bound pattern to
unsigned 0 */
const __m128i even_mask = even_masks[j % 2];
/* The reason for the bound+1 here is documented in
sieve_info_test_lognorm_sse2_mask() */
__m128i patterns[2] = {
_mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound[0] + 1), even_mask), sign_conversion),
_mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound[1] + 1), even_mask), sign_conversion)
};
const int x_step = sizeof(__m128i);
for (int x_start = 0; x_start < (i1 - i0); x_start += x_step)
{
/* Do bounds check using SSE pattern, set non-survivors in SS[0] array
to 255 */
const unsigned int mask = sieve_info_test_lognorm_sse2_mask(
(__m128i*) (SS[0] + x_start), patterns[0],
(__m128i*) (SS[1] + x_start), patterns[1]);
search_single_survivors_mask(SS[0], j, i0, i1, N, x_start,
nr_div, div, mask, survivors);
}
}
static void
search_survivors_in_line1_sse2_oneside(unsigned char * SS,
const unsigned char bound,
unsigned int j,
int i0, int i1,
int N MAYBE_UNUSED,
j_divisibility_helper const & j_div,
unsigned int td_max,
std::vector<uint32_t> &survivors)
{
unsigned int div[6][2], nr_div;
nr_div = extract_j_div(div, j, j_div, 3, td_max);
ASSERT_ALWAYS(nr_div <= 6);
/* If j is even, set all the even entries in the bound pattern to
unsigned 0 */
const __m128i even_mask = even_masks[j % 2];
/* The reason for the bound+1 here is documented in
sieve_info_test_lognorm_sse2_mask() */
__m128i pattern =
_mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound + 1), even_mask), sign_conversion);
const int x_step = sizeof(__m128i);
for (int x_start = 0; x_start < (i1 - i0); x_start += x_step)
{
/* Do bounds check using SSE pattern, set non-survivors in SS[0] array
to 255 */
const unsigned int mask = sieve_info_test_lognorm_sse2_mask_oneside(
(__m128i*) (SS + x_start), pattern);
search_single_survivors_mask(SS, j, i0, i1, N, x_start,
nr_div, div, mask, survivors);
}
}
/* This function assumes j % 3 == 0. It uses an SSE bound pattern where
i-coordinates with i % 3 == 0 are set to a bound of 0. */
static void
search_survivors_in_line3_sse2(unsigned char * const SS[2],
const unsigned char bound[2],
unsigned int j,
int i0, int i1,
int N MAYBE_UNUSED,
j_divisibility_helper const & j_div,
unsigned int td_max,
std::vector<uint32_t> &survivors)
{
__m128i patterns[2][3];
const int x_step = sizeof(__m128i);
const int pmin = 5;
int next_pattern = 0;
/* We know that 3 does not divide j in the code of this function, and we
don't store 2. Allowing 5 distinct odd prime factors >3 thus handles
all j < 1616615, which is the smallest integer with 6 such factors */
unsigned int div[5][2];
unsigned int nr_div;
nr_div = extract_j_div(div, j, j_div, pmin, td_max);
ASSERT_ALWAYS(nr_div <= 5);
/* If j is even, set all the even entries in the bound pattern to
unsigned 0 */
const __m128i even_mask = even_masks[j % 2];
patterns[0][0] = patterns[0][1] = patterns[0][2] =
_mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound[0] + 1), even_mask), sign_conversion);
patterns[1][0] = patterns[1][1] = patterns[1][2] =
_mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound[1] + 1), even_mask), sign_conversion);
/* Those locations in patterns[0] that correspond to i being a multiple
* of 3 are set to 0. Byte 0 of patterns[0][0] corresponds to i = i0.
* We want d s.t. i0 + d == 0 (mod 3), or d == -i0 (mod 3).
*/
int d = (-i0) % 3;
if (d < 0) d += 3;
/*
* Special hack for i0=-(I/2):
* I = 2^logI and 2 == -1 (mod 3), we have d == -1^(logI-1) (mod 3),
* or d = 2 if logI is even and d = 1 if logI is odd.
size_t d = 2 - logI % 2;
*/
/* We use the sign conversion trick (i.e., XOR 0x80), so to get an
* effective bound of unsigned 0, we need to set the byte to 0x80.
*/
for (size_t i = 0; i < sizeof(__m128i); i++)
((unsigned char *)&patterns[0][0])[3*i + d] = 0x80;
for (int x_start = 0; x_start < (i1 - i0); x_start += x_step)
{
const unsigned int mask =
sieve_info_test_lognorm_sse2_mask(
(__m128i*) (SS[0] + x_start), patterns[0][next_pattern],
(__m128i*) (SS[1] + x_start), patterns[1][next_pattern]);
if (++next_pattern == 3)
next_pattern = 0;
search_single_survivors_mask(SS[0], j, i0, i1, N, x_start,
nr_div, div, mask, survivors);
}
}
/* This function assumes j % 3 == 0. It uses an SSE bound pattern where
i-coordinates with i % 3 == 0 are set to a bound of 0. */
static void
search_survivors_in_line3_sse2_oneside(unsigned char * const SS,
const unsigned char bound,
unsigned int j,
int i0, int i1,
int N MAYBE_UNUSED,
j_divisibility_helper const & j_div,
unsigned int td_max,
std::vector<uint32_t> &survivors)
{
__m128i patterns[3];
const int x_step = sizeof(__m128i);
const int pmin = 5;
int next_pattern = 0;
/* We know that 3 does not divide j in the code of this function, and we
don't store 2. Allowing 5 distinct odd prime factors >3 thus handles
all j < 1616615, which is the smallest integer with 6 such factors */
unsigned int div[5][2];
unsigned int nr_div;
nr_div = extract_j_div(div, j, j_div, pmin, td_max);
ASSERT_ALWAYS(nr_div <= 5);
/* If j is even, set all the even entries in the bound pattern to
unsigned 0 */
const __m128i even_mask = even_masks[j % 2];
patterns[0] = patterns[1] = patterns[2] =
_mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound + 1), even_mask), sign_conversion);
/* Those locations in patterns[0] that correspond to i being a multiple
* of 3 are set to 0. Byte 0 of patterns[0][0] corresponds to i = i0.
* We want d s.t. i0 + d == 0 (mod 3), or d == -i0 (mod 3).
*/
int d = (-i0) % 3;
if (d < 0) d += 3;
/*
* Special hack for i0=-(I/2):
* I = 2^logI and 2 == -1 (mod 3), we have d == -1^(logI-1) (mod 3),
* or d = 2 if logI is even and d = 1 if logI is odd.
size_t d = 2 - logI % 2;
*/
/* We use the sign conversion trick (i.e., XOR 0x80), so to get an
* effective bound of unsigned 0, we need to set the byte to 0x80.
*/
for (size_t i = 0; i < sizeof(__m128i); i++)
((unsigned char *)&patterns[0])[3*i + d] = 0x80;
for (int x_start = 0; x_start < (i1 - i0); x_start += x_step)
{
const unsigned int mask =
sieve_info_test_lognorm_sse2_mask_oneside(
(__m128i*) (SS + x_start), patterns[next_pattern]);
if (++next_pattern == 3)
next_pattern = 0;
search_single_survivors_mask(SS, j, i0, i1, N, x_start,
nr_div, div, mask, survivors);
}
}
/* This function assumes j % 3 != 0 and j % 5 == 0. It uses an SSE bound
pattern where i-coordinates with i % 5 == 0 are set to a bound of 0,
and trial divides only by primes > 5. */
static void
search_survivors_in_line5_sse2(unsigned char * const SS[2],
const unsigned char bound[2],
unsigned int j,
int i0, int i1,
int N MAYBE_UNUSED,
j_divisibility_helper const & j_div,
unsigned int td_max,
std::vector<uint32_t> &survivors)
{
const int nr_patterns = 5;
__m128i patterns[2][nr_patterns];
const int x_step = sizeof(__m128i);
const int pmin = 7;
int next_pattern = 0;
/* We know that 3 and 5 do not divide j in the code of this function, and
we don't store 2. Allowing 5 distinct odd prime factors >5 thus handles
all j < 7436429 */
unsigned int div[5][2];
unsigned int nr_div;
nr_div = extract_j_div(div, j, j_div, pmin, td_max);
ASSERT_ALWAYS(nr_div <= 5);
/* If j is even, set all the even entries in the bound pattern to
unsigned 0 */
const __m128i even_mask = even_masks[j % 2];
for (int i = 0; i < nr_patterns; i++) {
patterns[0][i] = _mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound[0] + 1), even_mask), sign_conversion);
patterns[1][i] = _mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound[1] + 1), even_mask), sign_conversion);
}
if (j % 2 == 0)
for (size_t i = 0; i < nr_patterns * sizeof(__m128i); i += 2)
((unsigned char *)&patterns[0][0])[i] = 0x80;
/* Those locations in patterns[0] that correspond to i being a multiple
of 5 are set to 0. Byte 0 of patterns[0][0] corresponds to i = i0.
We want d s.t. i0 + d == 0 (mod 5), or d == i0 (mod 5).
*/
int d = (-i0) % 5;
if (d < 0) d += 5;
/* Special trick for i0 = -(I/2) ; With
I = 2^logI and ord_5(2) == 4 (mod 5), we have d == 2^((logI-1)%4)
(mod 5), so we want a function: 0->3, 1->1, 2->2, 3->4.
static const unsigned char d_lut[] = {3,1,2,4};
size_t d = d_lut[logI % 4];
*/
/* We use the sign conversion trick (i.e., XOR 0x80), so to get an
effective bound of unsigned 0, we need to set the byte to 0x80. */
for (size_t i = 0; i < sizeof(__m128i); i++)
((unsigned char *)&patterns[0][0])[nr_patterns*i + d] = 0x80;
for (int x_start = 0; x_start < (i1 - i0); x_start += x_step)
{
const unsigned int mask = sieve_info_test_lognorm_sse2_mask(
(__m128i*) (SS[0] + x_start), patterns[0][next_pattern],
(__m128i*) (SS[1] + x_start), patterns[1][next_pattern]);
if (++next_pattern == nr_patterns)
next_pattern = 0;
search_single_survivors_mask(SS[0], j, i0, i1, N, x_start,
nr_div, div, mask, survivors);
}
}
/* This function assumes j % 3 != 0 and j % 5 == 0. It uses an SSE bound
pattern where i-coordinates with i % 5 == 0 are set to a bound of 0,
and trial divides only by primes > 5. */
static void
search_survivors_in_line5_sse2_oneside(unsigned char * const SS,
const unsigned char bound,
unsigned int j,
int i0, int i1,
int N MAYBE_UNUSED,
j_divisibility_helper const & j_div,
unsigned int td_max,
std::vector<uint32_t> &survivors)
{
const int nr_patterns = 5;
__m128i patterns[nr_patterns];
const int x_step = sizeof(__m128i);
const int pmin = 7;
int next_pattern = 0;
/* We know that 3 and 5 do not divide j in the code of this function, and
we don't store 2. Allowing 5 distinct odd prime factors >5 thus handles
all j < 7436429 */
unsigned int div[5][2];
unsigned int nr_div;
nr_div = extract_j_div(div, j, j_div, pmin, td_max);
ASSERT_ALWAYS(nr_div <= 5);
/* If j is even, set all the even entries in the bound pattern to
unsigned 0 */
const __m128i even_mask = even_masks[j % 2];
for (int i = 0; i < nr_patterns; i++) {
patterns[i] = _mm_xor_si128(_mm_and_si128(_mm_set1_epi8(bound + 1), even_mask), sign_conversion);
}
if (j % 2 == 0)
for (size_t i = 0; i < nr_patterns * sizeof(__m128i); i += 2)
((unsigned char *)&patterns[0])[i] = 0x80;
/* Those locations in patterns[0] that correspond to i being a multiple
of 5 are set to 0. Byte 0 of patterns[0][0] corresponds to i = i0.
We want d s.t. i0 + d == 0 (mod 5), or d == i0 (mod 5).
*/
int d = (-i0) % 5;
if (d < 0) d += 5;
/* Special trick for i0 = -(I/2) ; With
I = 2^logI and ord_5(2) == 4 (mod 5), we have d == 2^((logI-1)%4)
(mod 5), so we want a function: 0->3, 1->1, 2->2, 3->4.
static const unsigned char d_lut[] = {3,1,2,4};
size_t d = d_lut[logI % 4];
*/
/* We use the sign conversion trick (i.e., XOR 0x80), so to get an
effective bound of unsigned 0, we need to set the byte to 0x80. */
for (size_t i = 0; i < sizeof(__m128i); i++)
((unsigned char *)&patterns[0])[nr_patterns*i + d] = 0x80;
for (int x_start = 0; x_start < (i1 - i0); x_start += x_step)
{
const unsigned int mask = sieve_info_test_lognorm_sse2_mask_oneside(
(__m128i*) (SS + x_start), patterns[next_pattern]);
if (++next_pattern == nr_patterns)
next_pattern = 0;
search_single_survivors_mask(SS, j, i0, i1, N, x_start,
nr_div, div, mask, survivors);
}
}
#define USE_PATTERN_3 1
#define USE_PATTERN_5 1
#if USE_PATTERN_5 && ! USE_PATTERN_3
#error USE_PATTERN_5 requires USE_PATTERN_3
#endif
void
search_survivors_in_line_sse2(unsigned char * const SS[2],
const unsigned char bound[2],
unsigned int j,
int i0, int i1,
int N,
j_divisibility_helper const & j_div,
const unsigned int td_max, std::vector<uint32_t> &survivors)
{
#if USE_PATTERN_3
if (j % 3 == 0)
search_survivors_in_line3_sse2(SS, bound, j, i0, i1, N, j_div,
td_max, survivors);
#if USE_PATTERN_5
else if (j % 5 == 0)
search_survivors_in_line5_sse2(SS, bound, j, i0, i1, N, j_div,
td_max, survivors);
#endif
else
#endif
search_survivors_in_line1_sse2(SS, bound, j, i0, i1, N, j_div,
td_max, survivors);
}
void
search_survivors_in_line_sse2_oneside(unsigned char * const SS,
const unsigned char bound,
unsigned int j,
int i0, int i1,
int N,
j_divisibility_helper const & j_div,
const unsigned int td_max, std::vector<uint32_t> &survivors)
{
#if USE_PATTERN_3
if (j % 3 == 0)
search_survivors_in_line3_sse2_oneside(SS, bound, j, i0, i1, N, j_div,
td_max, survivors);
#if USE_PATTERN_5
else if (j % 5 == 0)
search_survivors_in_line5_sse2_oneside(SS, bound, j, i0, i1, N, j_div,
td_max, survivors);
#endif
else
#endif
search_survivors_in_line1_sse2_oneside(SS, bound, j, i0, i1, N, j_div,
td_max, survivors);
}
#endif /* HAVE_SSE2 */
Computing file changes ...