#include #include #include #include "macros.h" #include "types.h" #include "fppol.h" #include "ffspol.h" #include "norm.h" #include "qlat.h" #include "ijvec.h" #include "sublat.h" #ifdef WANT_NORM_STATS static uint64_t norm_stats_n[2] = {0, 0}; static int norm_stats_min[2] = {0, 0}; static uint64_t norm_stats_sum[2] = {0, 0}; static int norm_stats_max[2] = {0, 0}; void norm_stats_print() { printf("# Statistics on the sizes of the norms:\n"); for (int i = 0; i < 2; ++i) { printf("# Side %d: min = %u ; avg = %1.1f ; max = %u\n", i, norm_stats_min[i], (double)norm_stats_sum[i] / (double)norm_stats_n[i], norm_stats_max[i]); } } #endif #define MAX_PREC_N 32 /* Take the N less significant bits of p and the N less significant bits of q, multiply them, and put the N most significant bits of the result in the low part of r. NB: in fact, it is a good practice to give p and q of degree at most N-1, so that all the bits of the input are indeed used. Then this function puts zeroes in the high part of r. For the moment, we put asserts to check that the input are well formed. */ static MAYBE_UNUSED void fppol64_mul_high(fppol64_ptr r, fppol64_srcptr p, fppol64_srcptr q, unsigned int N) { ASSERT(N <= MAX_PREC_N); ASSERT(N > 0); ASSERT(fppol64_deg(p) < (int)N); ASSERT(fppol64_deg(q) < (int)N); if (N <= 8) { // N in [0,8] fppol8_t pp, qq; fppol16_t rr; fppol8_set_64(pp, p); fppol8_set_64(qq, q); fppol16_mul_8x8(rr, pp, qq); fppol16_div_ti(rr, rr, N-1); fppol64_set_16(r, rr); } else if (N <= 16) { // N in ]8,16] fppol16_t pp, qq; fppol32_t rr; fppol16_set_64(pp, p); fppol16_set_64(qq, q); fppol32_mul_16x16(rr, pp, qq); fppol32_div_ti(rr, rr, N-1); fppol64_set_32(r, rr); } else { // N in ]16,32] ASSERT(MAX_PREC_N <= 32); // otherwise, implement another case. fppol32_t pp, qq; fppol64_t rr; fppol32_set_64(pp, p); fppol32_set_64(qq, q); fppol64_mul_32x32(rr, pp, qq); fppol64_div_ti(r, rr, N-1); } ASSERT(fppol64_deg(r) < (int)N); } /* Function computing the norm of ffspol at (a,b) norm = b^d * ffspol(a/b), d = deg(ffspol) */ void ffspol_norm(fppol_t norm, ffspol_srcptr ffspol, fppol_t a, fppol_t b) { fppol_t *pow_b; fppol_t pow_a; fppol_t pol_norm_i; fppol_t tmp_norm; fppol_init(pow_a); fppol_init(pol_norm_i); fppol_init(tmp_norm); fppol_set_zero(pol_norm_i); fppol_set_zero(tmp_norm); fppol_set_one(pow_a); /* pow_b contains b^d, b^{d-1}, ... , b^2, b, 1 */ pow_b = (fppol_t *)malloc((ffspol->deg + 1) * sizeof(fppol_t)); fppol_init(pow_b[ffspol->deg]); fppol_set_one(pow_b[ffspol->deg]); for (int i = ffspol->deg - 1; i > -1; i--) { fppol_init(pow_b[i]); fppol_mul(pow_b[i], pow_b[i+1], b); } for (int i = 0; i < ffspol->deg + 1; i++) { fppol_mul(pol_norm_i, ffspol->coeffs[i], pow_b[i]); fppol_mul(pol_norm_i, pol_norm_i, pow_a); fppol_add(tmp_norm, tmp_norm, pol_norm_i); fppol_mul(pow_a, pow_a, a); } fppol_set(norm, tmp_norm); fppol_clear(pow_a); fppol_clear(pol_norm_i); fppol_clear(tmp_norm); for (int i = 0; i <= ffspol->deg; i++) fppol_clear(pow_b[i]); free(pow_b); } /* Function computing ffspol_ij, a polynomial such that norm(ffspol,a,b) = norm(ffspol_ij, i, j), i.e. it is possible to apply the function norm directly on (i,j) with the transformed polynomial ffspol_ij, without having to compute a and b with ij2ab(). */ /* ffspol_ab = [f_0, ..., f_{d-1}, f_d] ffspol_ij = [h_0, ..., h_{d-1}, h_d] powb_ij = [g_0, ... , g_{d-1}, g_d] We consider the expression f(a, b) = f_d a^d + f_{d-1} a^{d-1} b + ... + f_0 b^d We can write it as f(a, b) = ff(a,b) a + f_0 b^d. with a = a0 i + a1 j and b = b0 i + b1 j, we have: f(a0 i + a1 j, b0 i + b1 j) = ff(a0 i + a1 j, b0 i + b1 j) (a0 i + a1 j) + f_0 (b0 i + b1 j)^d We use the formula recursively, i.e. iteration number k uses: h(i, j) = hh(i, j) (a0 i + a1 j) + f_{d-k} (b0 i + b1 j)^k We use a table powb_ij to store the coefficients of (b0 i + b1 j)^k = [g_k, g_{k-1}, ..., g_0]. We have the following formula to compute from step (k-1) to step k: For hh(i,j) * (a0 i + a1 j): h_{d-k} = a1 * h_{d-k+1} ... h_{d-l} = a0 * h_{d-l} + a1 * h_{d-l+1} , 0 < l < k ... h_{d-1} = a0 * h_{d-1} + a1 * h_d h_d = a0 * h_d For (b0 i + b1 j)^k: g_k = b0 * g_{k-1} ... g_{k-l} = b0 * g_{k-l-1} + b1 * g_{k-l}, 0 < l < k ... g_0 = b1 * g_0 We then multiply (b0 i + b1 j)^k by f_{d-k} and add it to hh(i,j) (a0 i + a1 j) we have computed. We do it for k = 0 to k = d. Warning: l and k for the iterations in the function definition are taken in reverse order compared to this comment. */ static void ffspol_2ij(ffspol_ptr ffspol_ij, ffspol_srcptr ffspol_ab, qlat_t qlat) { int d = ffspol_ab->deg; fppol_t *powb_ij; fppol_t tmp1, tmp2; fppol_init(tmp1); fppol_init(tmp2); /* Init of powb_ij which contains the coefficients of (b0 i + b1 j)^k */ powb_ij = (fppol_t *)malloc((d+1) * sizeof(fppol_t)); for (int k = 0; k <= d; ++k) fppol_init(powb_ij[k]); /* Step 0 */ fppol_set(ffspol_ij->coeffs[d], ffspol_ab->coeffs[d]); ffspol_ij->deg = d; fppol_set_one(powb_ij[0]); for (int k = d - 1; k >= 0; --k) { /* For hh(i,j) * (a0 i + a1 j) */ if (!qlat->want_longq) fppol_mul_ai(ffspol_ij->coeffs[k], ffspol_ij->coeffs[k + 1], qlat->a1); else fppol_mul(ffspol_ij->coeffs[k], ffspol_ij->coeffs[k + 1], qlat->longa1); for (int l = k + 1; l < d; ++l) { if (!qlat->want_longq) { fppol_mul_ai(tmp1, ffspol_ij->coeffs[l], qlat->a0); fppol_mul_ai(tmp2, ffspol_ij->coeffs[l + 1], qlat->a1); } else { fppol_mul(tmp1, ffspol_ij->coeffs[l], qlat->longa0); fppol_mul(tmp2, ffspol_ij->coeffs[l + 1], qlat->longa1); } fppol_add(ffspol_ij->coeffs[l], tmp1, tmp2); } if (!qlat->want_longq) fppol_mul_ai(ffspol_ij->coeffs[d], ffspol_ij->coeffs[d], qlat->a0); else fppol_mul(ffspol_ij->coeffs[d], ffspol_ij->coeffs[d], qlat->longa0); /* For (b0 i + b1 j)^{d-k} */ if (!qlat->want_longq) fppol_mul_ai(powb_ij[d - k], powb_ij[d - k - 1], qlat->b0); else fppol_mul(powb_ij[d - k], powb_ij[d - k - 1], qlat->longb0); for (int l = d - k - 1; l > 0; --l) { if (!qlat->want_longq) { fppol_mul_ai(tmp1, powb_ij[l - 1], qlat->b0); fppol_mul_ai(tmp2, powb_ij[l], qlat->b1); } else { fppol_mul(tmp1, powb_ij[l - 1], qlat->longb0); fppol_mul(tmp2, powb_ij[l], qlat->longb1); } fppol_add(powb_ij[l], tmp1, tmp2); } if (!qlat->want_longq) fppol_mul_ai(powb_ij[0], powb_ij[0], qlat->b1); else fppol_mul(powb_ij[0], powb_ij[0], qlat->longb1); /* Multiply (b0 i + b1 j)^{d-k} by f_k and add it to hh(i,j) (a0 i + a1 j) we have computed */ for (int l = k; l <= d; ++l) { fppol_mul(tmp1, powb_ij[l - k], ffspol_ab->coeffs[k]); fppol_add(ffspol_ij->coeffs[l], ffspol_ij->coeffs[l], tmp1); } } for (int k = 0; k <= d; ++k) fppol_clear(powb_ij[k]); free(powb_ij); fppol_clear(tmp1); fppol_clear(tmp2); } /* max_special(prev_max, j, &repeated) returns the maximum of prev_max and j The value repeated should be initialized to 1 If there is only one maximum, then repeated is one If prev_max == j, the variable (*repeated) is *incremented* This function is completely ad hoc to be used in a for loop like max = max_special(max, tab[i], &repeated) so the result will be different from max = max_special(tab[i], max, &repeated) */ static int max_special(int prev_max, int j, int *repeated) { if (prev_max == j) { (*repeated)++; return prev_max; } else if (prev_max > j) { return prev_max; } else { *repeated = 1; return j; } } static int deg_norm_prec_0(ffspol_t ffspol_ij, int degi, int degj, int *gap) { int degree, max_deg = -1; int repeated = 1; for (int k = 0; k < ffspol_ij->deg + 1; ++k) { degree = fppol_deg(ffspol_ij->coeffs[k]) + k * degi + (ffspol_ij->deg - k) * degj; max_deg = max_special(max_deg, degree, &repeated); } /* If there is only one monomial of maximal degree */ if (repeated == 1) *gap = -1; #if FP_SIZE == 2 /* In characteristic 2, we can use the parity of the number of monomials of maximal degree to find cancellations */ else if (repeated != 1 && (repeated & 1u)) *gap = 0; #endif else *gap = max_deg + 1; return max_deg; } /* function which takes as input a fppol and returns a fppol64 with N bits of precision containing the N monomials of higher degree, with N <= MAX_PREC_N */ static void to_prec_N(fppol64_ptr r, fppol_t p, unsigned int N) { ASSERT(N <= MAX_PREC_N); ASSERT(N > 0); int shift = fppol_deg(p) - N + 1; if (shift == 0) { fppol64_set_mp(r, p); } else { fppol_t tmp; fppol_init2(tmp, N); if (shift > 0) { fppol_div_ti(tmp, p, (unsigned int) shift); fppol64_set_mp(r, tmp); } else { /* we need to align the most significant bit on the left */ fppol_mul_ti(tmp, p, (unsigned int) -shift); fppol64_set_mp(r,tmp); } fppol_clear(tmp); } } static int deg_norm_prec_N(ffspol_t ffspol_ij, int degi, fppol_t *pow_i, int degj, fppol_t *pow_j, int *gap, int max_deg) { /* monomials contains the fppol64_t monomials of the norm in precision 0 < N <= 32. The monomials and their degrees are sorted by increasing order of their degrees */ int deg_norm = -1; unsigned int N = 0; const int d = ffspol_ij->deg; int tab_size; int degree; int *degrees; degrees = (int *)malloc((d+1) * sizeof(int)); fppol64_t monomial; fppol64_t *monomials; monomials = (fppol64_t *)malloc((d+1) * sizeof(fppol64_t)); fppol64_t coeff_prec_N; fppol64_t pow_i_prec_N; fppol64_t pow_j_prec_N; #define PREC_GAP 8 do{ N = N + PREC_GAP; /* Computing and sorting the degrees and the monomials in precision N. Also align the truncated monomials to be able to add them without further shifts. */ tab_size = 0; for (int k = 0; k < d+1; ++k) { degree = fppol_deg(ffspol_ij->coeffs[k]) + k * degi + (d - k) * degj; if (degree <= max_deg - (int)N) continue; to_prec_N(coeff_prec_N, ffspol_ij->coeffs[k], N); to_prec_N(pow_i_prec_N, pow_i[k], N); to_prec_N(pow_j_prec_N, pow_j[k], N); fppol64_mul_high(monomial, pow_i_prec_N, pow_j_prec_N, N); fppol64_mul_high(monomial, monomial, coeff_prec_N, N); // insertion sort int l = tab_size; while (l > 0 && degrees[l - 1] > degree) { degrees[l] = degrees[l - 1]; fppol64_set(monomials[l], monomials[l - 1]); --l; } degrees[l] = degree; fppol64_div_ti(monomials[l], monomial, (max_deg - degree)); tab_size++; } ASSERT(tab_size >= 2); // otherwise, we shouldn't be here! /* We set tab_size to the index of the last value in degrees[] and monomials[]. This will always be the case until the end of this loop. */ --tab_size; /* Computing the sum of all monomials of maximal degree until we find only one monomial (term) of maximal degree */ fppol64_t sum; int deg_sum_prec; do { fppol64_set(sum, monomials[tab_size]); // if (max_deg_prec != (int) (N - 1)) // fprintf(stderr, "N = %u, max_deg_prec = %d \t", N, max_deg_prec); /* we make the sum of all monomials of maximal degree */ while (tab_size > 0 && degrees[tab_size] == degrees[tab_size - 1]) { --tab_size; fppol64_add(sum, sum, monomials[tab_size]); } deg_sum_prec = fppol64_deg(sum); /* If we still have information, we put the sum in monomials[] and degree in degrees[] and keep the tables sorted */ if (deg_sum_prec >= 0) { degree = deg_sum_prec + 1 + max_deg - N; // exact degree of the sum. int l = tab_size; while (l > 0 && degrees[l - 1] > degree) { degrees[l] = degrees[l - 1]; fppol64_set(monomials[l], monomials[l - 1]); --l; } degrees[l] = degree; fppol64_set(monomials[l], sum); } else { /* If the drop in degree is greater than the precision, we throw away the new monomial sum */ --tab_size; } /* We do this until there is only one monomial of maximal degree */ } while (tab_size > 0 && degrees[tab_size] == degrees[tab_size - 1]); /* Finished or have to increase the precision ? */ if ((tab_size > 0 && degrees[tab_size] != degrees[tab_size - 1]) || tab_size == 0) { deg_norm = degrees[tab_size]; *gap = max_deg - deg_norm; free(degrees); free(monomials); return deg_norm; } } while (tab_size < 0 && N + PREC_GAP <= MAX_PREC_N); // Failed, even at maximum allowed precision. Mark it in gap. *gap = max_deg + 1; free(degrees); free(monomials); return deg_norm; } #undef PREC_GAP #undef MAX_PREC_N static int deg_norm_full(ffspol_t ffspol_ij, fppol_t *pow_i, fppol_t *pow_j, int *gap, int max_deg) { fppol_t pol_norm_k; fppol_t norm; int deg_norm = -1; fppol_init(pol_norm_k); fppol_init(norm); fppol_set_zero(pol_norm_k); fppol_set_zero(norm); for (int k = 0; k < ffspol_ij->deg + 1; ++k) { fppol_mul(pol_norm_k, ffspol_ij->coeffs[k], pow_j[k]); fppol_mul(pol_norm_k, pol_norm_k, pow_i[k]); fppol_add(norm, norm, pol_norm_k); } deg_norm = fppol_deg(norm); *gap = max_deg - deg_norm; fppol_clear(pol_norm_k); fppol_clear(norm); return deg_norm; } /* Function deg_norm_ij which computes the degree of the norm using some kind of floating point representation of polynomials fppol. We also want the function to update an integer value called gap which is defined as the difference between the maximum degree of the monomials in the norm and the degree of the norm. Hence a strictly positive gap means there are cancellations of high degrees during the norm computation. We also define the gap to be -1 if there is only one monomial of maximal degree. The gap value zero means there are several monomials of higher degree but no cancellation */ int deg_norm_ij(ffspol_ptr ffspol_ij, ij_t i, ij_t j, int *gap) { int degi = ij_deg(i); int degj = ij_deg(j); int max_deg; int deg_norm = -1; if (degi == -1) return (ffspol_ij->deg)*degj + fppol_deg(ffspol_ij->coeffs[0]); if (degj == -1) return (ffspol_ij->deg)*degi + fppol_deg(ffspol_ij->coeffs[ffspol_ij->deg]); /* Computation of the degree of the norm for precision 0 i.e. the computation works only if deg_norm is the maximal degree of the monomials */ max_deg = deg_norm_prec_0(ffspol_ij, degi, degj, gap); if (*gap != max_deg + 1) return max_deg; else { fppol_t *pow_i; fppol_t *pow_j; fppol_t ii, jj; fppol_init(ii); fppol_init(jj); fppol_set_ij(ii, i); fppol_set_ij(jj, j); /* pow_j contains the powers of j in DECREASING order */ pow_j = (fppol_t *)malloc((ffspol_ij->deg + 1) * sizeof(fppol_t)); fppol_init(pow_j[ffspol_ij->deg]); fppol_set_one(pow_j[ffspol_ij->deg]); for (int k = ffspol_ij->deg - 1; k > -1; --k) { fppol_init(pow_j[k]); fppol_mul(pow_j[k], pow_j[k + 1], jj); } /* pow_i contains the powers of i in INCREASING order */ pow_i = (fppol_t *)malloc((ffspol_ij->deg + 1) * sizeof(fppol_t)); fppol_init(pow_i[0]); fppol_set_one(pow_i[0]); for (int k = 1; k < ffspol_ij->deg + 1; ++k) { fppol_init(pow_i[k]); fppol_mul(pow_i[k], pow_i[k - 1], ii); } deg_norm = deg_norm_prec_N(ffspol_ij, degi, pow_i, degj, pow_j, gap, max_deg); if (*gap == max_deg + 1) { deg_norm = deg_norm_full(ffspol_ij, pow_i, pow_j, gap, max_deg); } else { #if 0 // this is a very expensive check #ifndef NDEBUG int deg_norm2 = deg_norm_full(ffspol_ij, pow_i, pow_j, gap, max_deg); ASSERT(deg_norm2 == deg_norm); #endif #endif } fppol_clear(ii); fppol_clear(jj); for (int k = 0; k deg + 1; ++k) { fppol_clear(pow_i[k]); fppol_clear(pow_j[k]); } free(pow_i); free(pow_j); return deg_norm; } } /* Function init_norms Compute the degree of the norm at each valid position in the given j-range. The sqside parameter is a boolean that tells whether we are on the side of the special q. If so, then the degree of q must be subtracted from the norm. */ void init_norms(uint8_t * S, ffspol_srcptr ffspol, unsigned I, unsigned J, ij_t j0, ijpos_t pos0, ijpos_t size, qlat_t qlat, int sqside, sublat_ptr sublat, MAYBE_UNUSED int side) { ffspol_t ffspol_ij; ffspol_init2(ffspol_ij, ffspol->alloc); // TODO: this could be precomputed once for all and stored in qlat ffspol_2ij(ffspol_ij, ffspol, qlat); int degq = 0; if (sqside) { if (!qlat->want_longq) degq = sq_deg(qlat->q); else degq = fppol_deg(qlat->longq); } ij_t i, j, hati, hatj; int gap; int rci, rcj = 1; for (ij_set(j, j0); rcj; rcj = ij_monic_set_next(j, j, J)) { ijpos_t start = ijvec_get_start_pos(j, I, J) - pos0; if (start >= size) break; rci = 1; for (ij_set_zero(i); rci; ) { ijpos_t pos = start + ijvec_get_offset(i, I); if (S[pos] == 255) { rci = ij_set_next(i, i, I); continue; } // Sublat conversion ij_convert_sublat(hati, hatj, i, j, sublat); // Compute the degree of the norm, and the gap information. int deg = deg_norm_ij(ffspol_ij, hati, hatj, &gap); // Deduce the next i for which we have to compute the norm. ij_t i_next; { int degi = ij_deg(i); if (gap == -1) { ij_set_ti(i_next, degi+1); } else { int s = MAX(degi - gap, 0); ij_div_ti(i_next, i, s); ij_set_next(i_next, i_next, I+1); // we don't care for overflow, here ij_mul_ti(i_next, i_next, s); } } // Fast loop with constant degree of norm. deg -= degq; deg >>= SCALE; if (deg > 254) { fprintf(stderr, "Error: the scaling of norms is not enough. Please a version compiled with a larger SCALE value.\n"); exit(EXIT_FAILURE); } if (deg == 0) deg = 255; #if defined(USE_F2) && !defined(TRACE_POS) && !defined(WANT_NORM_STATS) // The generic loop below takes a dozen of asm instructions in // charac 2. // With specific, simplified code for charac 2, gcc does a good // job, unroll the loop, use simd instructions, and so on. { uint8_t * Sptr = S + pos; uint32_t ii = i[0]; uint32_t iinext = i_next[0]; do { *Sptr++ |= deg; ii++; } while (ii!=iinext); i[0] = ii; i_next[0] = iinext; rci = iinext < (1U< norm_stats_max[side]) norm_stats_max[side] = deg+degq; } #endif rci = ij_set_next(i, i, I); pos = start + ijvec_get_offset(i, I); } while (rci && !ij_eq(i, i_next)); #endif } } ffspol_clear(ffspol_ij); }