https://github.com/mupq/pqm4
Tip revision: 26f810d332b829a2c16220294db7a882b2072f4d authored by rugo on 07 June 2022, 08:39:12 UTC
Fix alignment issues in Kyber (#236)
Tip revision: 26f810d
ring_ops.c
/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
*
* Written by Nir Drucker, Shay Gueron and Dusan Kostic,
* AWS Cryptographic Algorithms Group.
*
* Modified by Ming-Shing Chen, Tung Chou and Markus Krausz.
*
*/
#include <assert.h>
#include "cleanup.h"
#include "ring_ops.h"
// c = a mod (x^r - 1)
// c = a mod (x^r - 1)
// Folds the 2r-bit product a back into r bits: since x^r == 1 (mod x^r - 1),
// bit (r + j) of the input is XORed into bit j of the output.
_INLINE_ void ring_red(OUT pad_r_t *c, IN const dbl_pad_r_t *a)
{
  const uint64_t *src = (const uint64_t *)a;
  uint64_t *      dst = (uint64_t *)c;

  for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) {
    REG_T lo    = LOAD(&src[i]);
    REG_T hi    = LOAD(&src[i + R_QWORDS]);
    REG_T carry = LOAD(&src[i + R_QWORDS - 1]);

    // Re-align the wrapped-around upper half: r is not a multiple of 64, so
    // the fold target straddles qword boundaries by LAST_R_QWORD_TRAIL bits.
    hi    = SLLI_I64(hi, LAST_R_QWORD_TRAIL);
    carry = SRLI_I64(carry, LAST_R_QWORD_LEAD);

    STORE(&dst[i], lo ^ (hi | carry));
  }

  // Drop the bits at and above position r in the top qword.
  dst[R_QWORDS - 1] &= LAST_R_QWORD_MASK;

  // Clean the secrets from the upper (padding) part of c
  secure_clean((uint8_t *)&dst[R_QWORDS],
               (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t));
}
#if 2048 == R_PADDED_BYTES
#define LOG_POLYLEN 11
#if defined(_USE_BMUL2_)
#define bitpolymul_input_transform(ao,ai,l) bmul2_2048_to_4096_prepare((uint32_t*)ao,ai)
#define bitpolymul_mul(c,a,b,l) bmul2_2048_to_4096_mul(c,(const uint32_t*)a,(const uint32_t * )b)
#endif
#elif 4096 == R_PADDED_BYTES
#define LOG_POLYLEN 12
#if defined(_USE_BMUL2_)
#define bitpolymul_input_transform(ao,ai,l) bmul2_4096_to_8192_prepare((uint32_t*)ao,ai)
#define bitpolymul_mul(c,a,b,l) bmul2_4096_to_8192_mul(c,(const uint32_t*)a,(const uint32_t * )b)
#endif
#else
error
#endif
// Precompute the multiplication-ready transform of a, so callers that reuse
// the same operand (see ring_mul_2 / ring_mul_rep) can skip this step.
void ring_mul_inputtransform(OUT mul_internal_t *c, IN const pad_r_t *a)
{
  uint16_t *     out = (uint16_t *)c;
  const uint8_t *in  = (const uint8_t *)a;
  bitpolymul_input_transform(out, in, LOG_POLYLEN);
}
// c = a * b mod (x^r - 1).
// All intermediate (secret-dependent) buffers are wiped on scope exit.
void ring_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b)
{
  DEFER_CLEANUP(dbl_pad_r_t prod = {0}, dbl_pad_r_cleanup);
  DEFER_CLEANUP(mul_internal_t fa = {0}, mul_internal_cleanup);
  DEFER_CLEANUP(mul_internal_t fb = {0}, mul_internal_cleanup);

  // Transform both operands, multiply in the transformed domain,
  // then reduce the double-width product back into the ring.
  bitpolymul_input_transform((uint16_t *)&fa, (const uint8_t *)a, LOG_POLYLEN);
  bitpolymul_input_transform((uint16_t *)&fb, (const uint8_t *)b, LOG_POLYLEN);
  bitpolymul_mul((uint8_t *)&prod, (uint16_t *)&fa, (const uint16_t *)&fb, LOG_POLYLEN + 1);
  ring_red(c, &prod);
}
// c = a * b mod (x^r - 1), where tb is b already passed through
// ring_mul_inputtransform (saves one transform versus ring_mul).
void ring_mul_2(OUT pad_r_t *c, IN const pad_r_t *a, IN const mul_internal_t *tb)
{
  DEFER_CLEANUP(mul_internal_t t = {0}, mul_internal_cleanup);
  DEFER_CLEANUP(mul_internal_t ta = {0}, mul_internal_cleanup);

  bitpolymul_input_transform( (uint16_t*)&ta, (const uint8_t*)a , LOG_POLYLEN );
  bitpolymul_mul( (uint8_t*)&t , (uint16_t *)&ta, (const uint16_t *)tb , LOG_POLYLEN + 1 );

  // t is declared mul_internal_t but holds the raw double-width product here;
  // the original passed &t without a cast (incompatible pointer types).
  // NOTE(review): assumes sizeof(mul_internal_t) >= sizeof(dbl_pad_r_t) — confirm.
  ring_red(c, (const dbl_pad_r_t *)&t);
}
// c = a * b mod (x^r - 1) with a pre-transformed second operand tb.
// The body was a byte-for-byte duplicate of ring_mul_2; delegate to it so a
// single copy of the logic is maintained. Kept as a separate entry point
// because existing callers use this name.
void ring_mul_rep(OUT pad_r_t *c, IN const pad_r_t *a, IN const mul_internal_t *tb)
{
  ring_mul_2(c, a, tb);
}
////////////////////////////////////////////////////
// Square of a 32-bit GF(2) polynomial: squaring over GF(2) is linear, so it
// just spreads bit i of a32 to bit 2*i of the 64-bit result.
static inline
uint64_t squ32(uint32_t a32)
{
  uint64_t x = a32;

  // Standard bit-interleave ("Morton spread"): at each step the groups of set
  // bits are halved in size and the gaps between them are doubled, until every
  // source bit sits at an even position.
  x = (x | (x << 16)) & 0x0000ffff0000ffffULL;
  x = (x | (x << 8))  & 0x00ff00ff00ff00ffULL;
  x = (x | (x << 4))  & 0x0f0f0f0f0f0f0f0fULL;
  x = (x | (x << 2))  & 0x3333333333333333ULL;
  x = (x | (x << 1))  & 0x5555555555555555ULL;

  return x;
}
// GF(2) square of a 64-bit word; the 128-bit result is written as two
// 64-bit words: r[0] (low half) and r[1] (high half).
static inline
void squ64(uint64_t *r, uint64_t a64)
{
  const uint32_t lo = (uint32_t)(a64 & 0xffffffff);
  const uint32_t hi = (uint32_t)(a64 >> 32);

  r[0] = squ32(lo);
  r[1] = squ32(hi);
}
// Raw (unreduced) square: each input qword expands into two output qwords,
// since GF(2) squaring doubles every bit position. Output is 2r bits wide.
static inline
void _bitsqu(OUT dbl_pad_r_t *c, IN const pad_r_t *a)
{
  const uint64_t *in  = (const uint64_t *)a;
  uint64_t *      out = (uint64_t *)c;

  for(size_t i = 0; i < R_QWORDS; i++) {
    squ64(&out[2 * i], in[i]);
  }
}
// c = a^2 mod (x^r - 1); the double-width intermediate is wiped on scope exit.
void ring_squ(OUT pad_r_t *c, IN const pad_r_t *a)
{
  DEFER_CLEANUP(dbl_pad_r_t sq = {0}, dbl_pad_r_cleanup);

  _bitsqu(&sq, a);
  ring_red(c, &sq);
}