/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#if LIMB_T_BITS==32
typedef unsigned long long llimb_t;
#endif

#if defined(__clang__)
# pragma GCC diagnostic ignored "-Wstatic-in-inline"
#endif

static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[],
                       const limb_t p[], limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mask, borrow, mx, hi, tmp[n+1], carry;
    size_t i, j;

    for (mx=b[0], hi=0, i=0; i<n; i++) {
        limbx = (mx * (llimb_t)a[i]) + hi;
        tmp[i] = (limb_t)limbx;
        hi = (limb_t)(limbx >> LIMB_T_BITS);
    }
    mx = n0*tmp[0];
    tmp[i] = hi;

    for (carry=0, j=0; ; ) {
        limbx = (mx * (llimb_t)p[0]) + tmp[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + tmp[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        limbx = tmp[i] + (hi + (llimb_t)carry);
        tmp[i-1] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);

        if (++j==n)
            break;

        for (mx=b[j], hi=0, i=0; i<n; i++) {
            limbx = (mx * (llimb_t)a[i] + hi) + tmp[i];
            tmp[i] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        mx = n0*tmp[0];
        limbx = hi + (llimb_t)carry;
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}
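
/*
 * Illustrative sketch (an assumption, not part of this file): |n0| is the
 * Montgomery constant -p[0]^-1 mod 2^LIMB_T_BITS. For an odd modulus it
 * can be derived with Newton-Raphson iterations, each of which doubles the
 * number of correct low-order bits; the helper name below is hypothetical.
 */
#if 0
static limb_t compute_n0(limb_t p0)
{
    limb_t x = p0;          /* an odd p0 squares to 1 mod 8, so x is
                             * already correct to 3 bits */
    size_t i;

    for (i = 0; i < 5; i++) /* 3*2^5 = 96 >= 64 correct bits */
        x *= 2 - p0*x;      /* Newton step for the inverse mod 2^k */

    return (limb_t)0 - x;   /* negate to get -p0^-1 mod 2^LIMB_T_BITS */
}
#endif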

#define MUL_MONT_IMPL(bits) \
inline void mul_mont_##bits(vec##bits ret, const vec##bits a, \
                            const vec##bits b, const vec##bits p, limb_t n0) \
{   mul_mont_n(ret, a, b, p, n0, NLIMBS(bits));   } \
\
inline void sqr_mont_##bits(vec##bits ret, const vec##bits a, \
                            const vec##bits p, limb_t n0) \
{   mul_mont_n(ret, a, a, p, n0, NLIMBS(bits));   }

/*
 * 256-bit subroutines can handle arbitrary modulus, even non-"sparse",
 * but we have to harmonize the naming with assembly.
 */
#define mul_mont_256 mul_mont_sparse_256
#define sqr_mont_256 sqr_mont_sparse_256
MUL_MONT_IMPL(256)
#undef mul_mont_256
#undef sqr_mont_256

MUL_MONT_IMPL(384)

static void add_mod_n(limb_t ret[], const limb_t a[], const limb_t b[],
                      const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, tmp[n];
    size_t i;

    for (carry=0, i=0; i<n; i++) {
        limbx = a[i] + (b[i] + (llimb_t)carry);
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}
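
/*
 * A note on the select idiom above (sketch only; csel_n is a hypothetical
 * name): |carry - borrow| is all-ones exactly when the trial subtraction
 * of |p| borrowed without a pending carry, i.e. when a+b was already fully
 * reduced and the pre-subtraction value must be kept. The same branchless
 * pattern recurs throughout this file.
 */
#if 0
static void csel_n(limb_t ret[], const limb_t onzero[],
                   const limb_t onones[], limb_t mask, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++)     /* mask is 0 or all-ones */
        ret[i] = (onzero[i] & ~mask) | (onones[i] & mask);
}
#endif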

#define ADD_MOD_IMPL(bits) \
inline void add_mod_##bits(vec##bits ret, const vec##bits a, \
                           const vec##bits b, const vec##bits p) \
{   add_mod_n(ret, a, b, p, NLIMBS(bits));   }

ADD_MOD_IMPL(256)
ADD_MOD_IMPL(384)

static void sub_mod_n(limb_t ret[], const limb_t a[], const limb_t b[],
                      const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow;
    size_t i;

    for (borrow=0, i=0; i<n; i++) {
        limbx = a[i] - (b[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = 0 - borrow;

    for (carry=0, i=0; i<n; i++) {
        limbx = ret[i] + ((p[i] & mask) + (llimb_t)carry);
        ret[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
}

#define SUB_MOD_IMPL(bits) \
inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \
                           const vec##bits b, const vec##bits p) \
{   sub_mod_n(ret, a, b, p, NLIMBS(bits));   }

SUB_MOD_IMPL(256)
SUB_MOD_IMPL(384)

static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[],
                           size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, tmp[n], two_a[n];
    size_t i;

    for (carry=0, i=0; i<n; i++) {
        limb_t a_i = a[i];
        tmp[i] = a_i<<1 | carry;
        carry = a_i>>(LIMB_T_BITS-1);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        two_a[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        two_a[i] = (two_a[i] & ~mask) | (tmp[i] & mask);

    for (carry=0, i=0; i<n; i++) {
        limbx = a[i] + (two_a[i] + (llimb_t)carry);
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define MUL_BY_3_MOD_IMPL(bits) \
inline void mul_by_3_mod_##bits(vec##bits ret, const vec##bits a, \
                                const vec##bits p) \
{   mul_by_3_mod_n(ret, a, p, NLIMBS(bits));   }

MUL_BY_3_MOD_IMPL(256)
MUL_BY_3_MOD_IMPL(384)

static void lshift_mod_n(limb_t ret[], const limb_t a[], size_t count,
                         const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, tmp[n];
    size_t i;

    while (count--) {
        for (carry=0, i=0; i<n; i++) {
            limb_t a_i = a[i];
            tmp[i] = a_i<<1 | carry;
            carry = a_i>>(LIMB_T_BITS-1);
        }

        for (borrow=0, i=0; i<n; i++) {
            limbx = tmp[i] - (p[i] + (llimb_t)borrow);
            ret[i] = (limb_t)limbx;
            borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
        }

        mask = carry - borrow;

        for(i=0; i<n; i++)
            ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);

        a = ret;
    }
}

#define LSHIFT_MOD_IMPL(bits) \
inline void lshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \
                              const vec##bits p) \
{   lshift_mod_n(ret, a, count, p, NLIMBS(bits));   }

LSHIFT_MOD_IMPL(256)
LSHIFT_MOD_IMPL(384)

static void cneg_mod_n(limb_t ret[], const limb_t a[], bool_t flag,
                       const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t borrow, mask, tmp[n];
    size_t i;

    for (borrow=0, i=0; i<n; i++) {
        limbx = p[i] - (a[i] + (llimb_t)borrow);
        tmp[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    flag &= vec_is_zero(a, sizeof(tmp)) ^ 1;
    mask = (limb_t)0 - flag;

    for(i=0; i<n; i++)
        ret[i] = (a[i] & ~mask) | (tmp[i] & mask);
}

#define CNEG_MOD_IMPL(bits) \
inline void cneg_mod_##bits(vec##bits ret, const vec##bits a, bool_t flag, \
                            const vec##bits p) \
{   cneg_mod_n(ret, a, flag, p, NLIMBS(bits));   }

CNEG_MOD_IMPL(256)
CNEG_MOD_IMPL(384)

static limb_t check_mod_n(const byte a[], const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t borrow, ai, acc;
    size_t i, j;

    for (acc=borrow=0, i=0; i<n; i++) {
        for (ai=0, j=0; j<8*sizeof(limb_t); j+=8)
            ai |= (limb_t)(*a++) << j;
        acc |= ai;
        limbx = ai - (p[i] + (llimb_t)borrow);
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    return borrow & (is_zero(acc) ^ 1);
}

#define CHECK_MOD_IMPL(bits) \
inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \
{   return check_mod_n(a, p, NLIMBS(bits));   }

CHECK_MOD_IMPL(256)

static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[],
                                const limb_t p[], size_t n)
{
    limb_t ret_[n], a_[n], b_[n], zero;

    limbs_from_le_bytes(a_, a, sizeof(a_));
    limbs_from_le_bytes(b_, b, sizeof(b_));

    add_mod_n(ret_, a_, b_, p, n);
    zero = vec_is_zero(ret_, sizeof(ret_));

    le_bytes_from_limbs(ret, ret_, sizeof(ret_));

    return zero^1;
}

#define ADD_N_CHECK_MOD_IMPL(bits) \
inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \
                                     const pow##bits b, const vec##bits p) \
{   return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits));   }

ADD_N_CHECK_MOD_IMPL(256)

static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[],
                                const limb_t p[], size_t n)
{
    limb_t ret_[n], a_[n], b_[n], zero;

    limbs_from_le_bytes(a_, a, sizeof(a_));
    limbs_from_le_bytes(b_, b, sizeof(b_));

    sub_mod_n(ret_, a_, b_, p, n);
    zero = vec_is_zero(ret_, sizeof(ret_));

    le_bytes_from_limbs(ret, ret_, sizeof(ret_));

    return zero^1;
}

#define SUB_N_CHECK_MOD_IMPL(bits) \
inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \
                                     const pow##bits b, const vec##bits p) \
{   return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits));   }

SUB_N_CHECK_MOD_IMPL(256)

static void from_mont_n(limb_t ret[], const limb_t a[],
                        const limb_t p[], limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mask, borrow, mx, hi, tmp[n];
    size_t i, j;

    for (j=0; j<n; j++) {
        mx = n0*a[0];
        limbx = (mx * (llimb_t)p[0]) + a[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + a[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        tmp[i-1] = hi;
        a = tmp;
    }

    /* this is needed only if input can be non-fully-reduced */
    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = 0 - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define FROM_MONT_IMPL(bits) \
inline void from_mont_##bits(vec##bits ret, const vec##bits a, \
                             const vec##bits p, limb_t n0) \
{   from_mont_n(ret, a, p, n0, NLIMBS(bits));   }

FROM_MONT_IMPL(256)
FROM_MONT_IMPL(384)
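
/*
 * Illustrative round trip (sketch only): converting into Montgomery form
 * is a multiplication by R^2 mod p, and from_mont_384 strips the factor
 * of R again. |RR| is an assumed caller-supplied constant, not defined in
 * this file, and the helper name is hypothetical.
 */
#if 0
static void mont_round_trip(vec384 out, const vec384 a, const vec384 RR,
                            const vec384 p, limb_t n0)
{
    vec384 a_mont;

    mul_mont_384(a_mont, a, RR, p, n0); /* a * R^2 * R^-1 = a*R mod p */
    from_mont_384(out, a_mont, p, n0);  /* a * R * R^-1   = a   mod p */
}
#endif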

static void redc_mont_n(limb_t ret[], const limb_t a[],
                        const limb_t p[], limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, mx, hi, tmp[n];
    const limb_t *b = a;
    size_t i, j;

    for (j=0; j<n; j++) {
        mx = n0*b[0];
        limbx = (mx * (llimb_t)p[0]) + b[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + b[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        tmp[i-1] = hi;
        b = tmp;
    }

    for (carry=0, i=0; i<n; i++) {
        limbx = a[n+i] + (tmp[i] + (llimb_t)carry);
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define REDC_MONT_IMPL(bits, bits2) \
inline void redc_mont_##bits(vec##bits ret, const vec##bits2 a, \
                             const vec##bits p, limb_t n0) \
{   redc_mont_n(ret, a, p, n0, NLIMBS(bits));   }

REDC_MONT_IMPL(256, 512)
REDC_MONT_IMPL(384, 768)
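
/*
 * For reference (sketch under stated assumptions): redc_mont_##bits
 * applied to a plain double-width product gives the same result as the
 * fused mul_mont_##bits, since both compute a*b*R^-1 mod p. The schoolbook
 * multiplier below is hypothetical, named mul_n for illustration only.
 */
#if 0
static void mul_n(limb_t ret[], const limb_t a[], const limb_t b[], size_t n)
{
    llimb_t limbx;
    limb_t hi;
    size_t i, j;

    vec_zero(ret, 2*n*sizeof(limb_t));
    for (i=0; i<n; i++) {
        for (hi=0, j=0; j<n; j++) {
            limbx = (a[i] * (llimb_t)b[j] + hi) + ret[i+j];
            ret[i+j] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        ret[i+n] = hi;
    }
}
/* then: mul_n(t, a, b, NLIMBS(384)); redc_mont_384(ret, t, p, n0);
 * is equivalent to mul_mont_384(ret, a, b, p, n0); */
#endif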

static void rshift_mod_n(limb_t ret[], const limb_t a[], size_t count,
                         const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, limb, next;
    size_t i;

    while (count--) {
        mask = 0 - (a[0] & 1);
        for (carry=0, i=0; i<n; i++) {
            limbx = a[i] + ((p[i]&mask) + (llimb_t)carry);
            ret[i] = (limb_t)limbx;
            carry = (limb_t)(limbx >> LIMB_T_BITS);
        }

        for (next=ret[0], i=0; i<n-1; i++) {
            limb = next >> 1;
            next = ret[i+1];
            ret[i] = limb | next << (LIMB_T_BITS-1);
        }
        ret[i] = next >> 1 | carry << (LIMB_T_BITS-1);

        a = ret;
    }
}

#define RSHIFT_MOD_IMPL(bits) \
inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \
                              const vec##bits p) \
{   rshift_mod_n(ret, a, count, p, NLIMBS(bits));   }

RSHIFT_MOD_IMPL(256)
RSHIFT_MOD_IMPL(384)

#define DIV_BY_2_MOD_IMPL(bits) \
inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \
                                const vec##bits p) \
{   rshift_mod_n(ret, a, 1, p, NLIMBS(bits));   }

DIV_BY_2_MOD_IMPL(384)
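
/*
 * Worked example of the halving step above: modulo p=13, a=5 is odd, so
 * the loop adds p first, (5+13)>>1 = 9, and indeed 2*9 = 18 = 5 mod 13.
 * Even inputs are simply shifted, since no fix-up is needed.
 */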

static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t carry, borrow, ret, tmp[n];
    size_t i;

    ret = a[0] & 1; /* parity */

    for (carry=0, i=0; i<n; i++) {
        limb_t a_i = a[i];
        tmp[i] = a_i<<1 | carry;
        carry = a_i>>(LIMB_T_BITS-1);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    /* bit 1 is the "sign": set if 2*a >= p, i.e. a > (p-1)/2 */
    ret |= ((carry - borrow) & 2) ^ 2;

    return ret;
}

inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p)
{   return sgn0_pty_mod_n(a, p, NLIMBS(384));   }

inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0)
{
    vec384 tmp;

    from_mont_n(tmp, a, p, n0, NLIMBS(384));

    return sgn0_pty_mod_n(tmp, p, NLIMBS(384));
}

inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p)
{
    limb_t re, im, sign, prty;

    re = sgn0_pty_mod_n(a[0], p, NLIMBS(384));
    im = sgn0_pty_mod_n(a[1], p, NLIMBS(384));

    /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */
    sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384));
    sign = (re & sign) | (im & ~sign);

    /* a->re==0 ? prty(a->im) : prty(a->re) */
    prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384));
    prty = (im & prty) | (re & ~prty);

    return (sign & 2) | (prty & 1);
}

inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0)
{
    vec384x tmp;

    from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384));
    from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384));

    return sgn0_pty_mod_384x(tmp, p);
}

void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
                   const vec384 p, limb_t n0)
{
    vec384 aa, bb, cc;

    add_mod_n(aa, a[0], a[1], p, NLIMBS(384));
    add_mod_n(bb, b[0], b[1], p, NLIMBS(384));
    mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384));
    mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384));
    mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384));

    sub_mod_n(ret[0], aa, cc, p, NLIMBS(384));
    sub_mod_n(ret[1], bb, aa, p, NLIMBS(384));
    sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384));
}
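
/*
 * The three multiplications above are Karatsuba over Fp2 with i^2 = -1:
 *   re = a0*b0 - a1*b1
 *   im = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 = a0*b1 + a1*b0
 * which saves one of the four schoolbook multiplications.
 */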

/*
 * mul_mont_n without final conditional subtraction, which implies
 * that modulus is one bit short, which in turn means that there are
 * no carries to handle between iterations...
 */
static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[],
                              const limb_t p[], limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mx, hi, tmp[n+1];
    size_t i, j;

    for (mx=b[0], hi=0, i=0; i<n; i++) {
        limbx = (mx * (llimb_t)a[i]) + hi;
        tmp[i] = (limb_t)limbx;
        hi = (limb_t)(limbx >> LIMB_T_BITS);
    }
    mx = n0*tmp[0];
    tmp[i] = hi;

    for (j=0; ; ) {
        limbx = (mx * (llimb_t)p[0]) + tmp[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + tmp[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        tmp[i-1] = tmp[i] + hi;

        if (++j==n)
            break;

        for (mx=b[j], hi=0, i=0; i<n; i++) {
            limbx = (mx * (llimb_t)a[i] + hi) + tmp[i];
            tmp[i] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        mx = n0*tmp[0];
        tmp[i] = hi;
    }

    vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t));
}

void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
                        const vec384 p, limb_t n0, const vec384 b)
{
    while(count--) {
        mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384));
        a = ret;
    }
    mul_mont_n(ret, ret, b, p, n0, NLIMBS(384));
}
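
/*
 * Usage sketch (illustrative, hypothetical helper name): with count=3 and
 * b=a, the routine returns a^(2^3) * a = a^9 in Montgomery form, which is
 * how addition-chain exponentiations (e.g. inversion by Fermat's little
 * theorem) are typically assembled from it.
 */
#if 0
static void pow_9(vec384 ret, const vec384 a, const vec384 p, limb_t n0)
{   sqr_n_mul_mont_383(ret, a, 3, p, n0, a);   }
#endif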

void sqr_mont_382x(vec384x ret, const vec384x a,
                   const vec384 p, limb_t n0)
{
    llimb_t limbx;
    limb_t mask, carry, borrow;
    size_t i;
    vec384 t0, t1;

    /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */
    for (carry=0, i=0; i<NLIMBS(384); i++) {
        limbx = a[0][i] + (a[1][i] + (llimb_t)carry);
        t0[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */
    for (borrow=0, i=0; i<NLIMBS(384); i++) {
        limbx = a[0][i] - (a[1][i] + (llimb_t)borrow);
        t1[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }
    mask = 0 - borrow;

    /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */
    mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));

    /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */
    for (carry=0, i=0; i<NLIMBS(384); i++) {
        limb_t a_i = ret[1][i];
        ret[1][i] = a_i<<1 | carry;
        carry = a_i>>(LIMB_T_BITS-1);
    }

    /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */
    mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384));

    /* account for t1's sign... */
    for (borrow=0, i=0; i<NLIMBS(384); i++) {
        limbx = ret[0][i] - ((t0[i] & mask) + (llimb_t)borrow);
        ret[0][i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }
    mask = 0 - borrow;
    for (carry=0, i=0; i<NLIMBS(384); i++) {
        limbx = ret[0][i] + ((p[i] & mask) + (llimb_t)carry);
        ret[0][i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
}
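
/*
 * The computation above is complex squaring with i^2 = -1:
 *   (a0 + a1*i)^2 = (a0+a1)*(a0-a1) + 2*a0*a1*i
 * i.e. two multiplications instead of three, at the cost of tracking the
 * sign of a0-a1 explicitly.
 */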

#define MSB(x) ((x) >> (LIMB_T_BITS-1))

static size_t num_bits(limb_t l)
{
    limb_t x, mask;
    size_t bits = is_zero(l) ^ 1;

    if (sizeof(limb_t) == 8) {
        x = l >> (32 & (8*sizeof(limb_t)-1));
        mask = 0 - MSB(0 - x);
        bits += 32 & mask;
        l ^= (x ^ l) & mask;
    }

    x = l >> 16;
    mask = 0 - MSB(0 - x);
    bits += 16 & mask;
    l ^= (x ^ l) & mask;

    x = l >> 8;
    mask = 0 - MSB(0 - x);
    bits += 8 & mask;
    l ^= (x ^ l) & mask;

    x = l >> 4;
    mask = 0 - MSB(0 - x);
    bits += 4 & mask;
    l ^= (x ^ l) & mask;

    x = l >> 2;
    mask = 0 - MSB(0 - x);
    bits += 2 & mask;
    l ^= (x ^ l) & mask;

    bits += l >> 1;

    return bits;
}
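
/*
 * Illustrative self-check (hypothetical, not part of the library):
 * num_bits(l) is the branchless bit length, floor(log2(l))+1 for l != 0.
 */
#if 0
#include <assert.h>
static void num_bits_selftest(void)
{
    assert(num_bits(0) == 0);
    assert(num_bits(1) == 1);
    assert(num_bits(0x80) == 8);
    assert(num_bits((limb_t)1 << (LIMB_T_BITS-1)) == LIMB_T_BITS);
}
#endif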

#if defined(__clang_major__) && __clang_major__>7
__attribute__((optnone))
#endif
static limb_t lshift_2(limb_t hi, limb_t lo, size_t l)
{
    size_t r = LIMB_T_BITS - l;
    limb_t mask = 0 - (is_zero(l)^1);

    return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1)));
}

/*
 * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-2, the
 * number of iterations inner_loop_n below is invoked with.
 */
static void ab_approximation_n(limb_t a_[2], const limb_t a[],
                               limb_t b_[2], const limb_t b[], size_t n)
{
    limb_t a_hi, a_lo, b_hi, b_lo, mask;
    size_t i;

    i = n-1;
    a_hi = a[i], a_lo = a[i-1];
    b_hi = b[i], b_lo = b[i-1];
    for (i--; --i;) {
        mask = 0 - is_zero(a_hi | b_hi);
        a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi;
        b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi;
        a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo;
        b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo;
    }
    i = LIMB_T_BITS - num_bits(a_hi | b_hi);
    /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */

    a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i);
    b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i);
}

typedef struct { limb_t f0, g0, f1, g1; } factors;

static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2],
                         size_t n)
{
    llimb_t limbx;
    limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1;
    limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;

    a_lo = a_[0], a_hi = a_[1];
    b_lo = b_[0], b_hi = b_[1];

    while(n--) {
        odd = 0 - (a_lo&1);

        /* a_ -= b_ if a_ is odd */
        t_lo = a_lo, t_hi = a_hi;
        limbx = a_lo - (llimb_t)(b_lo & odd);
        a_lo = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
        limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow);
        a_hi = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS);

        /* negate a_-b_ if it borrowed */
        a_lo ^= borrow;
        a_hi ^= borrow;
        limbx = a_lo + (llimb_t)(borrow & 1);
        a_lo = (limb_t)limbx;
        a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1;

        /* b_=a_ if a_-b_ borrowed */
        b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
        b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;

        /* exchange f0 and f1 if a_-b_ borrowed */
        xorm = (f0 ^ f1) & borrow;
        f0 ^= xorm;
        f1 ^= xorm;

        /* exchange g0 and g1 if a_-b_ borrowed */
        xorm = (g0 ^ g1) & borrow;
        g0 ^= xorm;
        g1 ^= xorm;

        /* subtract if a_ was odd */
        f0 -= f1 & odd;
        g0 -= g1 & odd;

        f1 <<= 1;
        g1 <<= 1;
        a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1);
        a_hi >>= 1;
    }

    fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1;
}

static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n)
{
    llimb_t limbx = 0;
    limb_t carry;
    size_t i;

    for (carry=neg&1, i=0; i<n; i++) {
        limbx = (llimb_t)(a[i] ^ neg) + carry;
        ret[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    return 0 - MSB((limb_t)limbx);
}

static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n)
{
    llimb_t limbx;
    limb_t carry;
    size_t i;

    for (carry=0, i=0; i<n; i++) {
        limbx = a[i] + (b[i] + (llimb_t)carry);
        ret[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    return carry;
}

static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n)
{
    llimb_t limbx;
    limb_t hi;
    size_t i;

    for (hi=0, i=0; i<n; i++) {
        limbx = (b * (llimb_t)a[i]) + hi;
        ret[i] = (limb_t)limbx;
        hi = (limb_t)(limbx >> LIMB_T_BITS);
    }

    return hi;
}

static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_,
                                           const limb_t b[], limb_t *g_,
                                           size_t n)
{
    limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi;
    size_t i;

    /* |a|*|f_| */
    f = *f_;
    neg = 0 - MSB(f);
    f = (f ^ neg) - neg;            /* ensure |f| is positive */
    (void)cneg_n(a_, a, neg, n);
    hi = umul_n(a_, a_, f, n);
    a_[n] = hi - (f & neg);

    /* |b|*|g_| */
    g = *g_;
    neg = 0 - MSB(g);
    g = (g ^ neg) - neg;            /* ensure |g| is positive */
    (void)cneg_n(b_, b, neg, n);
    hi = umul_n(b_, b_, g, n);
    b_[n] = hi - (g & neg);

    /* |a|*|f_| + |b|*|g_| */
    (void)add_n(a_, a_, b_, n+1);

    /* (|a|*|f_| + |b|*|g_|) >> k */
    for (carry=a_[0], i=0; i<n; i++) {
        hi = carry >> (LIMB_T_BITS-2);
        carry = a_[i+1];
        ret[i] = hi | (carry << 2);
    }

    /* ensure result is non-negative, fix up |f_| and |g_| accordingly */
    neg = 0 - MSB(carry);
    *f_ = (*f_ ^ neg) - neg;
    *g_ = (*g_ ^ neg) - neg;
    (void)cneg_n(ret, ret, neg, n);

    return neg;
}
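
/*
 * Explanatory note (sketch of the reasoning): each of the LIMB_T_BITS-2
 * iterations of inner_loop_n halves a_ while doubling f1 and g1, so the
 * accumulated |a*f + b*g| carries an exact factor of 2^(LIMB_T_BITS-2)
 * that the shift loop above divides back out.
 */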

static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f,
                                    const limb_t v[], limb_t g, size_t n)
{
    limb_t u_[n], v_[n], neg, hi;

    /* |u|*|f_| */
    neg = 0 - MSB(f);
    f = (f ^ neg) - neg;            /* ensure |f| is positive */
    neg = cneg_n(u_, u, neg, n);
    hi = umul_n(u_, u_, f, n) - (f&neg);

    /* |v|*|g_| */
    neg = 0 - MSB(g);
    g = (g ^ neg) - neg;            /* ensure |g| is positive */
    neg = cneg_n(v_, v, neg, n);
    hi += umul_n(v_, v_, g, n) - (g&neg);

    /* |u|*|f_| + |v|*|g_| */
    hi += add_n(ret, u_, v_, n);

    return hi;
}

static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[],
                             const limb_t mod[], const limb_t modx[], size_t n)
{
    llimb_t limbx;
    limb_t a[n], b[n], u[2*n], v[2*n], t[2*n];
    limb_t a_[2], b_[2], sign, carry, top;
    factors fg;
    size_t i;

    vec_copy(a, inp, sizeof(a));
    vec_copy(b, mod, sizeof(b));
    vec_zero(u, sizeof(u)); u[0] = 1;
    vec_zero(v, sizeof(v));

    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
        ab_approximation_n(a_, a, b_, b, n);
        inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2);
        (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
        vec_copy(a, t, sizeof(a));
        smul_2n(t, u, fg.f0, v, fg.g0, 2*n);
        smul_2n(v, u, fg.f1, v, fg.g1, 2*n);
        vec_copy(u, t, sizeof(u));
    }

    inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));
    top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n);

    sign = 0 - MSB(top);    /* top is 1, 0 or -1 */
    for (carry=0, i=0; i<n; i++) {
        limbx = ret[n+i] + ((modx[i] & sign) + (llimb_t)carry);
        ret[n+i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
    top += carry;
    sign = 0 - top;         /* top is 1, 0 or -1 */
    top |= sign;
    for (i=0; i<n; i++)
        a[i] = modx[i] & top;
    (void)cneg_n(a, a, 0 - MSB(sign), n);
    add_n(ret+n, ret+n, a, n);
}

/* the result occupies 2*n limbs, hence the double-width destination */
#define CT_INVERSE_MOD_IMPL(bits, bits2) \
inline void ct_inverse_mod_##bits(vec##bits2 ret, const vec##bits inp, \
                                  const vec##bits mod, const vec##bits modx) \
{   ct_inverse_mod_n(ret, inp, mod, modx, NLIMBS(bits));   }

CT_INVERSE_MOD_IMPL(256, 512)
CT_INVERSE_MOD_IMPL(384, 768)

/*
 * Copy of inner_loop_n above, but with |L| updates.
 */
static limb_t legendre_loop_n(limb_t L, factors *fg, const limb_t a_[2],
                              const limb_t b_[2], size_t n)
{
    llimb_t limbx;
    limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1;
    limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;

    a_lo = a_[0], a_hi = a_[1];
    b_lo = b_[0], b_hi = b_[1];

    while(n--) {
        odd = 0 - (a_lo&1);

        /* a_ -= b_ if a_ is odd */
        t_lo = a_lo, t_hi = a_hi;
        limbx = a_lo - (llimb_t)(b_lo & odd);
        a_lo = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
        limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow);
        a_hi = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS);

        L += ((t_lo & b_lo) >> 1) & borrow;

        /* negate a_-b_ if it borrowed */
        a_lo ^= borrow;
        a_hi ^= borrow;
        limbx = a_lo + (llimb_t)(borrow & 1);
        a_lo = (limb_t)limbx;
        a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1;

        /* b_=a_ if a_-b_ borrowed */
        b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
        b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;

        /* exchange f0 and f1 if a_-b_ borrowed */
        xorm = (f0 ^ f1) & borrow;
        f0 ^= xorm;
        f1 ^= xorm;

        /* exchange g0 and g1 if a_-b_ borrowed */
        xorm = (g0 ^ g1) & borrow;
        g0 ^= xorm;
        g1 ^= xorm;

        /* subtract if a_ was odd */
        f0 -= f1 & odd;
        g0 -= g1 & odd;

        f1 <<= 1;
        g1 <<= 1;
        a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1);
        a_hi >>= 1;

        L += (b_lo + 2) >> 2;
    }

    fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1;

    return L;
}

static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n)
{
    limb_t a[n], b[n], t[n];
    limb_t a_[2], b_[2], neg, L = 0;
    factors fg;
    size_t i;

    vec_copy(a, inp, sizeof(a));
    vec_copy(b, mod, sizeof(b));

    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
        ab_approximation_n(a_, a, b_, b, n);
        L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2);
        neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
        vec_copy(a, t, sizeof(a));
        L += (b[0] >> 1) & neg;
    }

    L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));

    return (L & 1) ^ 1;
}

#define CT_IS_SQR_MOD_IMPL(bits) \
inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \
                                      const vec##bits mod) \
{   return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits));   }

CT_IS_SQR_MOD_IMPL(384)
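
/*
 * Explanatory note: |L| accumulates (mod 2) the sign flips that the
 * halving and swap steps above induce on the Legendre symbol, so the
 * final (L & 1) ^ 1 is 1 exactly when |inp| is a quadratic residue
 * modulo an odd |mod|.
 */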

/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *   +-------+-------+-------+
 * R |       |       |   0   |
 *   +-------+-------+-------+
 *   +-------+-------+
 * D |       |       |
 *   +-------+-------+
 */
limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi)
{
    llimb_t Rx;
    limb_t r_lo = div_top[0], r_hi = div_top[1];
    limb_t Q = 0, mask, borrow, rx;
    size_t i;

    for (i = 0; i < LIMB_T_BITS; i++) {
        /* "borrow, Rx = R - D" */
        Rx = (llimb_t)r_lo - d_lo;
        rx = (limb_t)Rx;
        borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
        Rx = r_hi - (d_hi + (llimb_t)borrow);
        borrow = (limb_t)(Rx >> LIMB_T_BITS);

        /* "if (R >= D) R -= D" */
        r_lo = ((r_lo ^ rx) & borrow) ^ rx;
        rx = (limb_t)Rx;
        r_hi = ((r_hi ^ rx) & borrow) ^ rx;

        Q <<= 1;
        Q |= ~borrow & 1;

        /* "D >>= 1" */
        d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1);
        d_hi >>= 1;
    }

    mask = 0 - MSB(Q);  /* does it overflow? */

    /* "borrow, Rx = R - D" */
    Rx = (llimb_t)r_lo - d_lo;
    rx = (limb_t)Rx;
    borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
    Rx = r_hi - (d_hi + (llimb_t)borrow);
    borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;

    Q <<= 1;
    Q |= borrow ^ 1;

    return (Q | mask);
}

static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor,
                         limb_t quotient, size_t n)
{
    llimb_t limbx;
    limb_t tmp[n+1], carry, mask, borrow;
    size_t i;

    /* divisor*quotient */
    for (carry=0, i=0; i<n; i++) {
        limbx = (quotient * (llimb_t)divisor[i]) + carry;
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
    tmp[i] = carry;

    /* remainder = dividend - divisor*quotient */
    for (borrow=0, i=0; i<=n; i++) {
        limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow);
        tmp[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = 0 - borrow;

    /* if quotient was off by one, add divisor to the remainder */
    for (carry=0, i=0; i<n; i++) {
        limbx = tmp[i] + ((divisor[i] & mask) + (llimb_t)carry);
        div_rem[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    return (div_rem[i] = quotient + mask);
}

inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor,
                           limb_t quotient)
{   return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128));   }

inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor,
                          limb_t quotient)
{   return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64));   }
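
/*
 * Hypothetical usage sketch: one step of multi-precision long division
 * assembled from the two primitives above. Assumes n >= 2, an n-limb
 * "bitwise left-aligned" divisor, and an (n+1)-limb dividend window in
 * |div_rem| whose top limbs already satisfy div_3_limbs' precondition.
 * The helper name div_step is illustrative, not part of the library.
 */
#if 0
static void div_step(limb_t div_rem[/*n+1*/], const limb_t divisor[], size_t n)
{
    /* estimate a one-limb quotient from the top dividend/divisor limbs */
    limb_t Q = div_3_limbs(div_rem + n - 1, divisor[n-2], divisor[n-1]);

    /* multiply-subtract and fix a possible off-by-one, in constant time;
     * the final quotient limb is also stored in div_rem[n] */
    (void)quot_rem_n(div_rem, divisor, Q, n);
}
#endif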

/*
 * Unlock reference implementations in vect.c
 */
#define mul_by_8_mod_384 mul_by_8_mod_384
#define mul_by_8_mod_384x mul_by_8_mod_384x
#define mul_by_3_mod_384x mul_by_3_mod_384x
#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x
#define add_mod_384x add_mod_384x
#define sub_mod_384x sub_mod_384x
#define lshift_mod_384x lshift_mod_384x
#define sqr_mont_384x sqr_mont_384x

inline void vec_prefetch(const void *ptr, size_t len)
{   (void)ptr; (void)len;   }

/*
 * SHA-256
 */
#define ROTR(x,n)   ((x)>>(n) | (x)<<(32-(n)))
#define Sigma0(x)   (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22))
#define Sigma1(x)   (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25))
#define sigma0(x)   (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3))
#define sigma1(x)   (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10))
#define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

void blst_sha256_block_data_order(unsigned int *v, const void *inp,
                                  size_t blocks)
{
    static const unsigned int K256[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
    };
    unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2;
    const unsigned char *data = inp;
    size_t round;

    a = v[0];
    b = v[1];
    c = v[2];
    d = v[3];
    e = v[4];
    f = v[5];
    g = v[6];
    h = v[7];

    while (blocks--) {
        for (round = 0; round < 16; round++) {
            l = (unsigned int)data[0] << 24;
            l |= (unsigned int)data[1] << 16;
            l |= (unsigned int)data[2] << 8;
            l |= (unsigned int)data[3];
            data += 4;
            T1 = X[round] = l;
            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round];
            T2 = Sigma0(a) + Maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
        }

        for (; round < 64; round++) {
            s0 = X[(round + 1) & 0x0f];
            s0 = sigma0(s0);
            s1 = X[(round + 14) & 0x0f];
            s1 = sigma1(s1);
            T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf];
            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round];
            T2 = Sigma0(a) + Maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
        }

        a += v[0]; v[0] = a;
        b += v[1]; v[1] = b;
        c += v[2]; v[2] = c;
        d += v[3]; v[3] = d;
        e += v[4]; v[4] = e;
        f += v[5]; v[5] = f;
        g += v[6]; v[6] = g;
        h += v[7]; v[7] = h;
    }
}

#undef ROTR
#undef Sigma0
#undef Sigma1
#undef sigma0
#undef sigma1
#undef Ch
#undef Maj

void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8])
{
    size_t i;

    for (i=0; i<8; i++)
        dst[i] = src[i];
}

void blst_sha256_emit(unsigned char md[32], const unsigned int h[8])
{
    size_t i;

    for (i=0; i<8; i++, md+=4) {
        unsigned int h_i = h[i];
        md[0] = (unsigned char)(h_i >> 24);
        md[1] = (unsigned char)(h_i >> 16);
        md[2] = (unsigned char)(h_i >> 8);
        md[3] = (unsigned char)h_i;
    }
}
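
/*
 * Illustrative sketch (not part of the library): one-shot SHA-256 of the
 * three-byte message "abc" built from the primitives above. The message
 * fits a single 64-byte block once padded, so one call to the compression
 * routine suffices; the helper name is hypothetical.
 */
#if 0
static void sha256_abc(unsigned char md[32])
{
    unsigned int h[8] = {   /* FIPS 180-4 initial hash value */
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    unsigned char block[64] = {0};

    block[0] = 'a', block[1] = 'b', block[2] = 'c';
    block[3] = 0x80;    /* padding: a single 1 bit after the message */
    block[63] = 24;     /* big-endian 64-bit message length in bits */

    blst_sha256_block_data_order(h, block, 1);
    blst_sha256_emit(md, h);    /* ba7816bf 8f01cfea ... f20015ad */
}
#endif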

void blst_sha256_bcopy(void *dst_, const void *src_, size_t len)
{
    unsigned char *dst = dst_;
    const unsigned char *src = src_;
    size_t i;

    for (i=0; i<len; i++)
        dst[i] = src[i];
}