Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

swh:1:snp:418f8417068b61dc00572c13ca3d8ff0c2f214db
  • Code
  • Branches (10)
  • Releases (0)
    • Branches
    • Releases
    • HEAD
    • refs/heads/efm32gg11b
    • refs/heads/github-actions-test
    • refs/heads/master
    • refs/heads/qemu-gcc11-fix
    • refs/heads/setupscript
    • refs/heads/sikefix-rpls
    • refs/remotes/amin/kyberintt
    • refs/tags/Round1
    • refs/tags/Round2
    • refs/tags/Round3
    No releases to show
  • f8fa7f7
  • /
  • crypto_sign
  • /
  • falcon-512
  • /
  • m4-ct
  • /
  • fft.c
Raw File Download

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • content
  • directory
  • revision
  • snapshot
content badge
swh:1:cnt:b1904b24cf1d9d955e7ba77fe0498bcc0460a1f8
directory badge
swh:1:dir:17ab525370ec7e27e54c1c6e13a7c135e32d0318
revision badge
swh:1:rev:26f810d332b829a2c16220294db7a882b2072f4d
snapshot badge
swh:1:snp:418f8417068b61dc00572c13ca3d8ff0c2f214db

This interface enables to generate software citations, provided that the root directory of browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • content
  • directory
  • revision
  • snapshot
(requires biblatex-software package)
Generating citation ...
(requires biblatex-software package)
Generating citation ...
(requires biblatex-software package)
Generating citation ...
(requires biblatex-software package)
Generating citation ...
Tip revision: 26f810d332b829a2c16220294db7a882b2072f4d authored by rugo on 07 June 2022, 08:39:12 UTC
Fix alignment issues in Kyber (#236)
Tip revision: 26f810d
fft.c
/*
 * FFT code.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2017-2019  Falcon Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */

#include "inner.h"

/*
 * Rules for complex number macros:
 * --------------------------------
 *
 * Operand order is: destination, source1, source2...
 *
 * Each operand is a real and an imaginary part.
 *
 * All overlaps are allowed.
 */

/*
 * Addition of two complex numbers (d = a + b).
 */
#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
		fpr fpct_re, fpct_im; \
		fpct_re = fpr_add(a_re, b_re); \
		fpct_im = fpr_add(a_im, b_im); \
		(d_re) = fpct_re; \
		(d_im) = fpct_im; \
	} while (0)

/*
 * Subtraction of two complex numbers (d = a - b).
 */
#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
		fpr fpct_re, fpct_im; \
		fpct_re = fpr_sub(a_re, b_re); \
		fpct_im = fpr_sub(a_im, b_im); \
		(d_re) = fpct_re; \
		(d_im) = fpct_im; \
	} while (0)

/*
 * Multplication of two complex numbers (d = a * b).
 */
#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
		fpr fpct_a_re, fpct_a_im; \
		fpr fpct_b_re, fpct_b_im; \
		fpr fpct_d_re, fpct_d_im; \
		fpct_a_re = (a_re); \
		fpct_a_im = (a_im); \
		fpct_b_re = (b_re); \
		fpct_b_im = (b_im); \
		fpct_d_re = fpr_sub( \
			fpr_mul(fpct_a_re, fpct_b_re), \
			fpr_mul(fpct_a_im, fpct_b_im)); \
		fpct_d_im = fpr_add( \
			fpr_mul(fpct_a_re, fpct_b_im), \
			fpr_mul(fpct_a_im, fpct_b_re)); \
		(d_re) = fpct_d_re; \
		(d_im) = fpct_d_im; \
	} while (0)

/*
 * Squaring of a complex number (d = a * a).
 */
#define FPC_SQR(d_re, d_im, a_re, a_im)   do { \
		fpr fpct_a_re, fpct_a_im; \
		fpr fpct_d_re, fpct_d_im; \
		fpct_a_re = (a_re); \
		fpct_a_im = (a_im); \
		fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
		fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
		(d_re) = fpct_d_re; \
		(d_im) = fpct_d_im; \
	} while (0)

/*
 * Inversion of a complex number (d = 1 / a).
 */
#define FPC_INV(d_re, d_im, a_re, a_im)   do { \
		fpr fpct_a_re, fpct_a_im; \
		fpr fpct_d_re, fpct_d_im; \
		fpr fpct_m; \
		fpct_a_re = (a_re); \
		fpct_a_im = (a_im); \
		fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
		fpct_m = fpr_inv(fpct_m); \
		fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
		fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
		(d_re) = fpct_d_re; \
		(d_im) = fpct_d_im; \
	} while (0)

/*
 * Division of complex numbers (d = a / b).
 */
#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
		fpr fpct_a_re, fpct_a_im; \
		fpr fpct_b_re, fpct_b_im; \
		fpr fpct_d_re, fpct_d_im; \
		fpr fpct_m; \
		fpct_a_re = (a_re); \
		fpct_a_im = (a_im); \
		fpct_b_re = (b_re); \
		fpct_b_im = (b_im); \
		fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
		fpct_m = fpr_inv(fpct_m); \
		fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
		fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
		fpct_d_re = fpr_sub( \
			fpr_mul(fpct_a_re, fpct_b_re), \
			fpr_mul(fpct_a_im, fpct_b_im)); \
		fpct_d_im = fpr_add( \
			fpr_mul(fpct_a_re, fpct_b_im), \
			fpr_mul(fpct_a_im, fpct_b_re)); \
		(d_re) = fpct_d_re; \
		(d_im) = fpct_d_im; \
	} while (0)

/*
 * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
 * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
 * of X^N+1 in the field of complex numbers. A crucial property is that
 * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
 *
 * FFT representation of a polynomial f (taken modulo X^N+1) is the
 * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
 * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
 * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
 * needed. A consequence is that FFT representation has the same size
 * as normal representation: N/2 complex numbers use N real numbers (each
 * complex number is the combination of a real and an imaginary part).
 *
 * We use a specific ordering which makes computations easier. Let rev()
 * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
 * store the real and imaginary parts of f(w_j) in slots:
 *
 *    Re(f(w_j)) -> slot rev(j)/2
 *    Im(f(w_j)) -> slot rev(j)/2+N/2
 *
 * (Note that rev(j) is even for j < N/2.)
 */

/* see inner.h */
TARGET_AVX2
void
Zf(FFT)(fpr *f, unsigned logn)
{
	/*
	 * FFT algorithm in bit-reversal order uses the following
	 * iterative algorithm:
	 *
	 *   t = N
	 *   for m = 1; m < N; m *= 2:
	 *       ht = t/2
	 *       for i1 = 0; i1 < m; i1 ++:
	 *           j1 = i1 * t
	 *           s = GM[m + i1]
	 *           for j = j1; j < (j1 + ht); j ++:
	 *               x = f[j]
	 *               y = s * f[j + ht]
	 *               f[j] = x + y
	 *               f[j + ht] = x - y
	 *       t = ht
	 *
	 * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
	 *
	 * In the description above, f[] is supposed to contain complex
	 * numbers. In our in-memory representation, the real and
	 * imaginary parts of f[k] are in array slots k and k+N/2.
	 *
	 * We only keep the first half of the complex numbers. We can
	 * see that after the first iteration, the first and second halves
	 * of the array of complex numbers have separate lives, so we
	 * simply ignore the second part.
	 */

	unsigned u;
	size_t t, n, hn, m;

	/*
	 * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
	 * (because GM[1] = w^rev(1) = w^(N/2) = i).
	 * In our chosen representation, this is a no-op: everything is
	 * already where it should be.
	 */

	/*
	 * Subsequent iterations are truncated to use only the first
	 * half of values.
	 */
	n = (size_t)1 << logn;
	hn = n >> 1;
	t = hn;
	for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
		size_t ht, hm, i1, j1;

		ht = t >> 1;
		hm = m >> 1;
		for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
			size_t j, j2;

			j2 = j1 + ht;
#if FALCON_AVX2 // yyyAVX2+1
			if (ht >= 4) {
				__m256d s_re, s_im;

				s_re = _mm256_set1_pd(
					fpr_gm_tab[((m + i1) << 1) + 0].v);
				s_im = _mm256_set1_pd(
					fpr_gm_tab[((m + i1) << 1) + 1].v);
				for (j = j1; j < j2; j += 4) {
					__m256d x_re, x_im, y_re, y_im;
					__m256d z_re, z_im;

					x_re = _mm256_loadu_pd(&f[j].v);
					x_im = _mm256_loadu_pd(&f[j + hn].v);
					z_re = _mm256_loadu_pd(&f[j+ht].v);
					z_im = _mm256_loadu_pd(&f[j+ht + hn].v);
					y_re = FMSUB(z_re, s_re,
						_mm256_mul_pd(z_im, s_im));
					y_im = FMADD(z_re, s_im,
						_mm256_mul_pd(z_im, s_re));
					_mm256_storeu_pd(&f[j].v,
						_mm256_add_pd(x_re, y_re));
					_mm256_storeu_pd(&f[j + hn].v,
						_mm256_add_pd(x_im, y_im));
					_mm256_storeu_pd(&f[j + ht].v,
						_mm256_sub_pd(x_re, y_re));
					_mm256_storeu_pd(&f[j + ht + hn].v,
						_mm256_sub_pd(x_im, y_im));
				}
			} else {
				fpr s_re, s_im;

				s_re = fpr_gm_tab[((m + i1) << 1) + 0];
				s_im = fpr_gm_tab[((m + i1) << 1) + 1];
				for (j = j1; j < j2; j ++) {
					fpr x_re, x_im, y_re, y_im;

					x_re = f[j];
					x_im = f[j + hn];
					y_re = f[j + ht];
					y_im = f[j + ht + hn];
					FPC_MUL(y_re, y_im,
						y_re, y_im, s_re, s_im);
					FPC_ADD(f[j], f[j + hn],
						x_re, x_im, y_re, y_im);
					FPC_SUB(f[j + ht], f[j + ht + hn],
						x_re, x_im, y_re, y_im);
				}
			}
#else // yyyAVX2+0
			fpr s_re, s_im;

			s_re = fpr_gm_tab[((m + i1) << 1) + 0];
			s_im = fpr_gm_tab[((m + i1) << 1) + 1];
			for (j = j1; j < j2; j ++) {
				fpr x_re, x_im, y_re, y_im;

				x_re = f[j];
				x_im = f[j + hn];
				y_re = f[j + ht];
				y_im = f[j + ht + hn];
				FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
				FPC_ADD(f[j], f[j + hn],
					x_re, x_im, y_re, y_im);
				FPC_SUB(f[j + ht], f[j + ht + hn],
					x_re, x_im, y_re, y_im);
			}
#endif // yyyAVX2-
		}
		t = ht;
	}
}

/* see inner.h */
TARGET_AVX2
void
Zf(iFFT)(fpr *f, unsigned logn)
{
	/*
	 * Inverse FFT algorithm in bit-reversal order uses the following
	 * iterative algorithm:
	 *
	 *   t = 1
	 *   for m = N; m > 1; m /= 2:
	 *       hm = m/2
	 *       dt = t*2
	 *       for i1 = 0; i1 < hm; i1 ++:
	 *           j1 = i1 * dt
	 *           s = iGM[hm + i1]
	 *           for j = j1; j < (j1 + t); j ++:
	 *               x = f[j]
	 *               y = f[j + t]
	 *               f[j] = x + y
	 *               f[j + t] = s * (x - y)
	 *       t = dt
	 *   for i1 = 0; i1 < N; i1 ++:
	 *       f[i1] = f[i1] / N
	 *
	 * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
	 * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
	 *
	 * In the main loop (not counting the final division loop), in
	 * all iterations except the last, the first and second half of f[]
	 * (as an array of complex numbers) are separate. In our chosen
	 * representation, we do not keep the second half.
	 *
	 * The last iteration recombines the recomputed half with the
	 * implicit half, and should yield only real numbers since the
	 * target polynomial is real; moreover, s = i at that step.
	 * Thus, when considering x and y:
	 *    y = conj(x) since the final f[j] must be real
	 *    Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
	 *    filled with 2*Im(x).
	 * But we already have Re(x) and Im(x) in array slots j and j+t
	 * in our chosen representation. That last iteration is thus a
	 * simple doubling of the values in all the array.
	 *
	 * We make the last iteration a no-op by tweaking the final
	 * division into a division by N/2, not N.
	 */
	size_t u, n, hn, t, m;

	n = (size_t)1 << logn;
	t = 1;
	m = n;
	hn = n >> 1;
	for (u = logn; u > 1; u --) {
		size_t hm, dt, i1, j1;

		hm = m >> 1;
		dt = t << 1;
		for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
			size_t j, j2;

			j2 = j1 + t;
#if FALCON_AVX2 // yyyAVX2+1
			if (t >= 4) {
				__m256d s_re, s_im;

				s_re = _mm256_set1_pd(
					fpr_gm_tab[((hm + i1) << 1) + 0].v);
				s_im = _mm256_set1_pd(
					fpr_gm_tab[((hm + i1) << 1) + 1].v);
				for (j = j1; j < j2; j += 4) {
					__m256d x_re, x_im, y_re, y_im;
					__m256d z_re, z_im;

					x_re = _mm256_loadu_pd(&f[j].v);
					x_im = _mm256_loadu_pd(&f[j + hn].v);
					y_re = _mm256_loadu_pd(&f[j+t].v);
					y_im = _mm256_loadu_pd(&f[j+t + hn].v);
					_mm256_storeu_pd(&f[j].v,
						_mm256_add_pd(x_re, y_re));
					_mm256_storeu_pd(&f[j + hn].v,
						_mm256_add_pd(x_im, y_im));
					x_re = _mm256_sub_pd(y_re, x_re);
					x_im = _mm256_sub_pd(x_im, y_im);
					z_re = FMSUB(x_im, s_im,
						_mm256_mul_pd(x_re, s_re));
					z_im = FMADD(x_re, s_im,
						_mm256_mul_pd(x_im, s_re));
					_mm256_storeu_pd(&f[j+t].v, z_re);
					_mm256_storeu_pd(&f[j+t + hn].v, z_im);
				}
			} else {
				fpr s_re, s_im;

				s_re = fpr_gm_tab[((hm + i1) << 1)+0];
				s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]);
				for (j = j1; j < j2; j ++) {
					fpr x_re, x_im, y_re, y_im;

					x_re = f[j];
					x_im = f[j + hn];
					y_re = f[j + t];
					y_im = f[j + t + hn];
					FPC_ADD(f[j], f[j + hn],
						x_re, x_im, y_re, y_im);
					FPC_SUB(x_re, x_im,
						x_re, x_im, y_re, y_im);
					FPC_MUL(f[j + t], f[j + t + hn],
						x_re, x_im, s_re, s_im);
				}
			}
#else // yyyAVX2+0
			fpr s_re, s_im;

			s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
			s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
			for (j = j1; j < j2; j ++) {
				fpr x_re, x_im, y_re, y_im;

				x_re = f[j];
				x_im = f[j + hn];
				y_re = f[j + t];
				y_im = f[j + t + hn];
				FPC_ADD(f[j], f[j + hn],
					x_re, x_im, y_re, y_im);
				FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
				FPC_MUL(f[j + t], f[j + t + hn],
					x_re, x_im, s_re, s_im);
			}
#endif // yyyAVX2-
		}
		t = dt;
		m = hm;
	}

	/*
	 * Last iteration is a no-op, provided that we divide by N/2
	 * instead of N. We need to make a special case for logn = 0.
	 */
	if (logn > 0) {
		fpr ni;

		ni = fpr_p2_tab[logn];
		for (u = 0; u < n; u ++) {
			f[u] = fpr_mul(f[u], ni);
		}
	}
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_add)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, u;

	n = (size_t)1 << logn;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 4) {
		for (u = 0; u < n; u += 4) {
			_mm256_storeu_pd(&a[u].v,
				_mm256_add_pd(
					_mm256_loadu_pd(&a[u].v),
					_mm256_loadu_pd(&b[u].v)));
		}
	} else {
		for (u = 0; u < n; u ++) {
			a[u] = fpr_add(a[u], b[u]);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < n; u ++) {
		a[u] = fpr_add(a[u], b[u]);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_sub)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, u;

	n = (size_t)1 << logn;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 4) {
		for (u = 0; u < n; u += 4) {
			_mm256_storeu_pd(&a[u].v,
				_mm256_sub_pd(
					_mm256_loadu_pd(&a[u].v),
					_mm256_loadu_pd(&b[u].v)));
		}
	} else {
		for (u = 0; u < n; u ++) {
			a[u] = fpr_sub(a[u], b[u]);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < n; u ++) {
		a[u] = fpr_sub(a[u], b[u]);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_neg)(fpr *a, unsigned logn)
{
	size_t n, u;

	n = (size_t)1 << logn;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 4) {
		__m256d s;

		s = _mm256_set1_pd(-0.0);
		for (u = 0; u < n; u += 4) {
			_mm256_storeu_pd(&a[u].v,
				_mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
		}
	} else {
		for (u = 0; u < n; u ++) {
			a[u] = fpr_neg(a[u]);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < n; u ++) {
		a[u] = fpr_neg(a[u]);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_adj_fft)(fpr *a, unsigned logn)
{
	size_t n, u;

	n = (size_t)1 << logn;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d s;

		s = _mm256_set1_pd(-0.0);
		for (u = (n >> 1); u < n; u += 4) {
			_mm256_storeu_pd(&a[u].v,
				_mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
		}
	} else {
		for (u = (n >> 1); u < n; u ++) {
			a[u] = fpr_neg(a[u]);
		}
	}
#else // yyyAVX2+0
	for (u = (n >> 1); u < n; u ++) {
		a[u] = fpr_neg(a[u]);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_mul_fft)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		for (u = 0; u < hn; u += 4) {
			__m256d a_re, a_im, b_re, b_im, c_re, c_im;

			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			b_re = _mm256_loadu_pd(&b[u].v);
			b_im = _mm256_loadu_pd(&b[u + hn].v);
			c_re = FMSUB(
				a_re, b_re, _mm256_mul_pd(a_im, b_im));
			c_im = FMADD(
				a_re, b_im, _mm256_mul_pd(a_im, b_re));
			_mm256_storeu_pd(&a[u].v, c_re);
			_mm256_storeu_pd(&a[u + hn].v, c_im);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr a_re, a_im, b_re, b_im;

			a_re = a[u];
			a_im = a[u + hn];
			b_re = b[u];
			b_im = b[u + hn];
			FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr a_re, a_im, b_re, b_im;

		a_re = a[u];
		a_im = a[u + hn];
		b_re = b[u];
		b_im = b[u + hn];
		FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_muladj_fft)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		for (u = 0; u < hn; u += 4) {
			__m256d a_re, a_im, b_re, b_im, c_re, c_im;

			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			b_re = _mm256_loadu_pd(&b[u].v);
			b_im = _mm256_loadu_pd(&b[u + hn].v);
			c_re = FMADD(
				a_re, b_re, _mm256_mul_pd(a_im, b_im));
			c_im = FMSUB(
				a_im, b_re, _mm256_mul_pd(a_re, b_im));
			_mm256_storeu_pd(&a[u].v, c_re);
			_mm256_storeu_pd(&a[u + hn].v, c_im);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr a_re, a_im, b_re, b_im;

			a_re = a[u];
			a_im = a[u + hn];
			b_re = b[u];
			b_im = fpr_neg(b[u + hn]);
			FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr a_re, a_im, b_re, b_im;

		a_re = a[u];
		a_im = a[u + hn];
		b_re = b[u];
		b_im = fpr_neg(b[u + hn]);
		FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn)
{
	/*
	 * Since each coefficient is multiplied with its own conjugate,
	 * the result contains only real values.
	 */
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d zero;

		zero = _mm256_setzero_pd();
		for (u = 0; u < hn; u += 4) {
			__m256d a_re, a_im;

			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			_mm256_storeu_pd(&a[u].v,
				FMADD(a_re, a_re,
					_mm256_mul_pd(a_im, a_im)));
			_mm256_storeu_pd(&a[u + hn].v, zero);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr a_re, a_im;

			a_re = a[u];
			a_im = a[u + hn];
			a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
			a[u + hn] = fpr_zero;
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr a_re, a_im;

		a_re = a[u];
		a_im = a[u + hn];
		a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
		a[u + hn] = fpr_zero;
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn)
{
	size_t n, u;

	n = (size_t)1 << logn;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 4) {
		__m256d x4;

		x4 = _mm256_set1_pd(x.v);
		for (u = 0; u < n; u += 4) {
			_mm256_storeu_pd(&a[u].v,
				_mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
		}
	} else {
		for (u = 0; u < n; u ++) {
			a[u] = fpr_mul(a[u], x);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < n; u ++) {
		a[u] = fpr_mul(a[u], x);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_div_fft)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d one;

		one = _mm256_set1_pd(1.0);
		for (u = 0; u < hn; u += 4) {
			__m256d a_re, a_im, b_re, b_im, c_re, c_im, t;

			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			b_re = _mm256_loadu_pd(&b[u].v);
			b_im = _mm256_loadu_pd(&b[u + hn].v);
			t = _mm256_div_pd(one,
				FMADD(b_re, b_re,
					_mm256_mul_pd(b_im, b_im)));
			b_re = _mm256_mul_pd(b_re, t);
			b_im = _mm256_mul_pd(b_im, t);
			c_re = FMADD(
				a_re, b_re, _mm256_mul_pd(a_im, b_im));
			c_im = FMSUB(
				a_im, b_re, _mm256_mul_pd(a_re, b_im));
			_mm256_storeu_pd(&a[u].v, c_re);
			_mm256_storeu_pd(&a[u + hn].v, c_im);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr a_re, a_im, b_re, b_im;

			a_re = a[u];
			a_im = a[u + hn];
			b_re = b[u];
			b_im = b[u + hn];
			FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr a_re, a_im, b_re, b_im;

		a_re = a[u];
		a_im = a[u + hn];
		b_re = b[u];
		b_im = b[u + hn];
		FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_invnorm2_fft)(fpr *restrict d,
	const fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d one;

		one = _mm256_set1_pd(1.0);
		for (u = 0; u < hn; u += 4) {
			__m256d a_re, a_im, b_re, b_im, dv;

			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			b_re = _mm256_loadu_pd(&b[u].v);
			b_im = _mm256_loadu_pd(&b[u + hn].v);
			dv = _mm256_div_pd(one,
				_mm256_add_pd(
					FMADD(a_re, a_re,
						_mm256_mul_pd(a_im, a_im)),
					FMADD(b_re, b_re,
						_mm256_mul_pd(b_im, b_im))));
			_mm256_storeu_pd(&d[u].v, dv);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr a_re, a_im;
			fpr b_re, b_im;

			a_re = a[u];
			a_im = a[u + hn];
			b_re = b[u];
			b_im = b[u + hn];
			d[u] = fpr_inv(fpr_add(
				fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
				fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr a_re, a_im;
		fpr b_re, b_im;

		a_re = a[u];
		a_im = a[u + hn];
		b_re = b[u];
		b_im = b[u + hn];
		d[u] = fpr_inv(fpr_add(
			fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
			fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_add_muladj_fft)(fpr *restrict d,
	const fpr *restrict F, const fpr *restrict G,
	const fpr *restrict f, const fpr *restrict g, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		for (u = 0; u < hn; u += 4) {
			__m256d F_re, F_im, G_re, G_im;
			__m256d f_re, f_im, g_re, g_im;
			__m256d a_re, a_im, b_re, b_im;

			F_re = _mm256_loadu_pd(&F[u].v);
			F_im = _mm256_loadu_pd(&F[u + hn].v);
			G_re = _mm256_loadu_pd(&G[u].v);
			G_im = _mm256_loadu_pd(&G[u + hn].v);
			f_re = _mm256_loadu_pd(&f[u].v);
			f_im = _mm256_loadu_pd(&f[u + hn].v);
			g_re = _mm256_loadu_pd(&g[u].v);
			g_im = _mm256_loadu_pd(&g[u + hn].v);

			a_re = FMADD(F_re, f_re,
				_mm256_mul_pd(F_im, f_im));
			a_im = FMSUB(F_im, f_re,
				_mm256_mul_pd(F_re, f_im));
			b_re = FMADD(G_re, g_re,
				_mm256_mul_pd(G_im, g_im));
			b_im = FMSUB(G_im, g_re,
				_mm256_mul_pd(G_re, g_im));
			_mm256_storeu_pd(&d[u].v,
				_mm256_add_pd(a_re, b_re));
			_mm256_storeu_pd(&d[u + hn].v,
				_mm256_add_pd(a_im, b_im));
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr F_re, F_im, G_re, G_im;
			fpr f_re, f_im, g_re, g_im;
			fpr a_re, a_im, b_re, b_im;

			F_re = F[u];
			F_im = F[u + hn];
			G_re = G[u];
			G_im = G[u + hn];
			f_re = f[u];
			f_im = f[u + hn];
			g_re = g[u];
			g_im = g[u + hn];

			FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
			FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
			d[u] = fpr_add(a_re, b_re);
			d[u + hn] = fpr_add(a_im, b_im);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr F_re, F_im, G_re, G_im;
		fpr f_re, f_im, g_re, g_im;
		fpr a_re, a_im, b_re, b_im;

		F_re = F[u];
		F_im = F[u + hn];
		G_re = G[u];
		G_im = G[u + hn];
		f_re = f[u];
		f_im = f[u + hn];
		g_re = g[u];
		g_im = g[u + hn];

		FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
		FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
		d[u] = fpr_add(a_re, b_re);
		d[u + hn] = fpr_add(a_im, b_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_mul_autoadj_fft)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		for (u = 0; u < hn; u += 4) {
			__m256d a_re, a_im, bv;

			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			bv = _mm256_loadu_pd(&b[u].v);
			_mm256_storeu_pd(&a[u].v,
				_mm256_mul_pd(a_re, bv));
			_mm256_storeu_pd(&a[u + hn].v,
				_mm256_mul_pd(a_im, bv));
		}
	} else {
		for (u = 0; u < hn; u ++) {
			a[u] = fpr_mul(a[u], b[u]);
			a[u + hn] = fpr_mul(a[u + hn], b[u]);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		a[u] = fpr_mul(a[u], b[u]);
		a[u + hn] = fpr_mul(a[u + hn], b[u]);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_div_autoadj_fft)(
	fpr *restrict a, const fpr *restrict b, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d one;

		one = _mm256_set1_pd(1.0);
		for (u = 0; u < hn; u += 4) {
			__m256d ib, a_re, a_im;

			ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
			a_re = _mm256_loadu_pd(&a[u].v);
			a_im = _mm256_loadu_pd(&a[u + hn].v);
			_mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
			_mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr ib;

			ib = fpr_inv(b[u]);
			a[u] = fpr_mul(a[u], ib);
			a[u + hn] = fpr_mul(a[u + hn], ib);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr ib;

		ib = fpr_inv(b[u]);
		a[u] = fpr_mul(a[u], ib);
		a[u + hn] = fpr_mul(a[u + hn], ib);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_LDL_fft)(
	const fpr *restrict g00,
	fpr *restrict g01, fpr *restrict g11, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d one;

		one = _mm256_set1_pd(1.0);
		for (u = 0; u < hn; u += 4) {
			__m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
			__m256d t, mu_re, mu_im, xi_re, xi_im;

			g00_re = _mm256_loadu_pd(&g00[u].v);
			g00_im = _mm256_loadu_pd(&g00[u + hn].v);
			g01_re = _mm256_loadu_pd(&g01[u].v);
			g01_im = _mm256_loadu_pd(&g01[u + hn].v);
			g11_re = _mm256_loadu_pd(&g11[u].v);
			g11_im = _mm256_loadu_pd(&g11[u + hn].v);

			t = _mm256_div_pd(one,
				FMADD(g00_re, g00_re,
					_mm256_mul_pd(g00_im, g00_im)));
			g00_re = _mm256_mul_pd(g00_re, t);
			g00_im = _mm256_mul_pd(g00_im, t);
			mu_re = FMADD(g01_re, g00_re,
				_mm256_mul_pd(g01_im, g00_im));
			mu_im = FMSUB(g01_re, g00_im,
				_mm256_mul_pd(g01_im, g00_re));
			xi_re = FMSUB(mu_re, g01_re,
				_mm256_mul_pd(mu_im, g01_im));
			xi_im = FMADD(mu_im, g01_re,
				_mm256_mul_pd(mu_re, g01_im));
			_mm256_storeu_pd(&g11[u].v,
				_mm256_sub_pd(g11_re, xi_re));
			_mm256_storeu_pd(&g11[u + hn].v,
				_mm256_add_pd(g11_im, xi_im));
			_mm256_storeu_pd(&g01[u].v, mu_re);
			_mm256_storeu_pd(&g01[u + hn].v, mu_im);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
			fpr mu_re, mu_im;

			g00_re = g00[u];
			g00_im = g00[u + hn];
			g01_re = g01[u];
			g01_im = g01[u + hn];
			g11_re = g11[u];
			g11_im = g11[u + hn];
			FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
			FPC_MUL(g01_re, g01_im,
				mu_re, mu_im, g01_re, fpr_neg(g01_im));
			FPC_SUB(g11[u], g11[u + hn],
				g11_re, g11_im, g01_re, g01_im);
			g01[u] = mu_re;
			g01[u + hn] = fpr_neg(mu_im);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
		fpr mu_re, mu_im;

		g00_re = g00[u];
		g00_im = g00[u + hn];
		g01_re = g01[u];
		g01_im = g01[u + hn];
		g11_re = g11[u];
		g11_im = g11[u + hn];
		FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
		FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
		FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
		g01[u] = mu_re;
		g01[u + hn] = fpr_neg(mu_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_LDLmv_fft)(
	fpr *restrict d11, fpr *restrict l10,
	const fpr *restrict g00, const fpr *restrict g01,
	const fpr *restrict g11, unsigned logn)
{
	size_t n, hn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d one;

		one = _mm256_set1_pd(1.0);
		for (u = 0; u < hn; u += 4) {
			__m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
			__m256d t, mu_re, mu_im, xi_re, xi_im;

			g00_re = _mm256_loadu_pd(&g00[u].v);
			g00_im = _mm256_loadu_pd(&g00[u + hn].v);
			g01_re = _mm256_loadu_pd(&g01[u].v);
			g01_im = _mm256_loadu_pd(&g01[u + hn].v);
			g11_re = _mm256_loadu_pd(&g11[u].v);
			g11_im = _mm256_loadu_pd(&g11[u + hn].v);

			t = _mm256_div_pd(one,
				FMADD(g00_re, g00_re,
					_mm256_mul_pd(g00_im, g00_im)));
			g00_re = _mm256_mul_pd(g00_re, t);
			g00_im = _mm256_mul_pd(g00_im, t);
			mu_re = FMADD(g01_re, g00_re,
				_mm256_mul_pd(g01_im, g00_im));
			mu_im = FMSUB(g01_re, g00_im,
				_mm256_mul_pd(g01_im, g00_re));
			xi_re = FMSUB(mu_re, g01_re,
				_mm256_mul_pd(mu_im, g01_im));
			xi_im = FMADD(mu_im, g01_re,
				_mm256_mul_pd(mu_re, g01_im));
			_mm256_storeu_pd(&d11[u].v,
				_mm256_sub_pd(g11_re, xi_re));
			_mm256_storeu_pd(&d11[u + hn].v,
				_mm256_add_pd(g11_im, xi_im));
			_mm256_storeu_pd(&l10[u].v, mu_re);
			_mm256_storeu_pd(&l10[u + hn].v, mu_im);
		}
	} else {
		for (u = 0; u < hn; u ++) {
			fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
			fpr mu_re, mu_im;

			g00_re = g00[u];
			g00_im = g00[u + hn];
			g01_re = g01[u];
			g01_im = g01[u + hn];
			g11_re = g11[u];
			g11_im = g11[u + hn];
			FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
			FPC_MUL(g01_re, g01_im,
				mu_re, mu_im, g01_re, fpr_neg(g01_im));
			FPC_SUB(d11[u], d11[u + hn],
				g11_re, g11_im, g01_re, g01_im);
			l10[u] = mu_re;
			l10[u + hn] = fpr_neg(mu_im);
		}
	}
#else // yyyAVX2+0
	for (u = 0; u < hn; u ++) {
		fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
		fpr mu_re, mu_im;

		g00_re = g00[u];
		g00_im = g00[u + hn];
		g01_re = g01[u];
		g01_im = g01[u + hn];
		g11_re = g11[u];
		g11_im = g11[u + hn];
		FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
		FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
		FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
		l10[u] = mu_re;
		l10[u + hn] = fpr_neg(mu_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_split_fft)(
	fpr *restrict f0, fpr *restrict f1,
	const fpr *restrict f, unsigned logn)
{
	/*
	 * The FFT representation we use is in bit-reversed order
	 * (element i contains f(w^(rev(i))), where rev() is the
	 * bit-reversal function over the ring degree. This changes
	 * indexes with regards to the Falcon specification.
	 */
	size_t n, hn, qn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
	qn = hn >> 1;

#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 8) {
		__m256d half, sv;

		half = _mm256_set1_pd(0.5);
		sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
		for (u = 0; u < qn; u += 2) {
			__m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;

			ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
			ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
			ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
			ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
			_mm_storeu_pd(&f0[u].v,
				_mm256_extractf128_pd(ff0, 0));
			_mm_storeu_pd(&f0[u + qn].v,
				_mm256_extractf128_pd(ff0, 1));

			ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
			gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
			ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
			ff3 = _mm256_hadd_pd(
				_mm256_mul_pd(ff1, gmt),
				_mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
			ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
			_mm_storeu_pd(&f1[u].v,
				_mm256_extractf128_pd(ff3, 0));
			_mm_storeu_pd(&f1[u + qn].v,
				_mm256_extractf128_pd(ff3, 1));
		}
	} else {
		f0[0] = f[0];
		f1[0] = f[hn];

		for (u = 0; u < qn; u ++) {
			fpr a_re, a_im, b_re, b_im;
			fpr t_re, t_im;

			a_re = f[(u << 1) + 0];
			a_im = f[(u << 1) + 0 + hn];
			b_re = f[(u << 1) + 1];
			b_im = f[(u << 1) + 1 + hn];

			FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
			f0[u] = fpr_half(t_re);
			f0[u + qn] = fpr_half(t_im);

			FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
			FPC_MUL(t_re, t_im, t_re, t_im,
				fpr_gm_tab[((u + hn) << 1) + 0],
				fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
			f1[u] = fpr_half(t_re);
			f1[u + qn] = fpr_half(t_im);
		}
	}
#else // yyyAVX2+0
	/*
	 * We process complex values by pairs. For logn = 1, there is only
	 * one complex value (the other one is the implicit conjugate),
	 * so we add the two lines below because the loop will be
	 * skipped.
	 */
	f0[0] = f[0];
	f1[0] = f[hn];

	for (u = 0; u < qn; u ++) {
		fpr a_re, a_im, b_re, b_im;
		fpr t_re, t_im;

		a_re = f[(u << 1) + 0];
		a_im = f[(u << 1) + 0 + hn];
		b_re = f[(u << 1) + 1];
		b_im = f[(u << 1) + 1 + hn];

		FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
		f0[u] = fpr_half(t_re);
		f0[u + qn] = fpr_half(t_im);

		FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
		FPC_MUL(t_re, t_im, t_re, t_im,
			fpr_gm_tab[((u + hn) << 1) + 0],
			fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
		f1[u] = fpr_half(t_re);
		f1[u + qn] = fpr_half(t_im);
	}
#endif // yyyAVX2-
}

/* see inner.h */
TARGET_AVX2
void
Zf(poly_merge_fft)(
	fpr *restrict f,
	const fpr *restrict f0, const fpr *restrict f1, unsigned logn)
{
	size_t n, hn, qn, u;

	n = (size_t)1 << logn;
	hn = n >> 1;
	qn = hn >> 1;

#if FALCON_AVX2 // yyyAVX2+1
	if (n >= 16) {
		for (u = 0; u < qn; u += 4) {
			__m256d a_re, a_im, b_re, b_im, c_re, c_im;
			__m256d gm1, gm2, g_re, g_im;
			__m256d t_re, t_im, u_re, u_im;
			__m256d tu1_re, tu2_re, tu1_im, tu2_im;

			a_re = _mm256_loadu_pd(&f0[u].v);
			a_im = _mm256_loadu_pd(&f0[u + qn].v);
			c_re = _mm256_loadu_pd(&f1[u].v);
			c_im = _mm256_loadu_pd(&f1[u + qn].v);

			gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
			gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
			g_re = _mm256_unpacklo_pd(gm1, gm2);
			g_im = _mm256_unpackhi_pd(gm1, gm2);
			g_re = _mm256_permute4x64_pd(g_re, 0xD8);
			g_im = _mm256_permute4x64_pd(g_im, 0xD8);

			b_re = FMSUB(
				c_re, g_re, _mm256_mul_pd(c_im, g_im));
			b_im = FMADD(
				c_re, g_im, _mm256_mul_pd(c_im, g_re));

			t_re = _mm256_add_pd(a_re, b_re);
			t_im = _mm256_add_pd(a_im, b_im);
			u_re = _mm256_sub_pd(a_re, b_re);
			u_im = _mm256_sub_pd(a_im, b_im);

			tu1_re = _mm256_unpacklo_pd(t_re, u_re);
			tu2_re = _mm256_unpackhi_pd(t_re, u_re);
			tu1_im = _mm256_unpacklo_pd(t_im, u_im);
			tu2_im = _mm256_unpackhi_pd(t_im, u_im);
			_mm256_storeu_pd(&f[(u << 1)].v,
				_mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
			_mm256_storeu_pd(&f[(u << 1) + 4].v,
				_mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
			_mm256_storeu_pd(&f[(u << 1) + hn].v,
				_mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
			_mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
				_mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
		}
	} else {
		f[0] = f0[0];
		f[hn] = f1[0];

		for (u = 0; u < qn; u ++) {
			fpr a_re, a_im, b_re, b_im;
			fpr t_re, t_im;

			a_re = f0[u];
			a_im = f0[u + qn];
			FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
				fpr_gm_tab[((u + hn) << 1) + 0],
				fpr_gm_tab[((u + hn) << 1) + 1]);
			FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
			f[(u << 1) + 0] = t_re;
			f[(u << 1) + 0 + hn] = t_im;
			FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
			f[(u << 1) + 1] = t_re;
			f[(u << 1) + 1 + hn] = t_im;
		}
	}
#else // yyyAVX2+0
	/*
	 * An extra copy to handle the special case logn = 1.
	 */
	f[0] = f0[0];
	f[hn] = f1[0];

	for (u = 0; u < qn; u ++) {
		fpr a_re, a_im, b_re, b_im;
		fpr t_re, t_im;

		a_re = f0[u];
		a_im = f0[u + qn];
		FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
			fpr_gm_tab[((u + hn) << 1) + 0],
			fpr_gm_tab[((u + hn) << 1) + 1]);
		FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
		f[(u << 1) + 0] = t_re;
		f[(u << 1) + 0 + hn] = t_im;
		FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
		f[(u << 1) + 1] = t_re;
		f[(u << 1) + 1 + hn] = t_im;
	}
#endif // yyyAVX2-
}

back to top

Software Heritage — Copyright (C) 2015–2026, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Content policy— Contact— JavaScript license information— Web API