ftu/blst/recip.c
/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"

#ifdef __OPTIMIZE_SIZE__
/*
 * 608 multiplications for scalar inversion modulo the BLS12-381 prime,
 * 32% more than the corresponding optimal addition chain, plus
 * mispredicted-branch penalties on top of that... The addition chain
 * below (the non-__OPTIMIZE_SIZE__ path) was measured to be >50% faster.
 */
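/*
 * Either way, flt_reciprocal_fp computes |inp|^(P-2): by Fermat's little
 * theorem a^(P-1) = 1 (mod P) for non-zero a, hence a^-1 = a^(P-2)
 * (mod P). The byte string below is P-2 in little-endian order; 381 is
 * its bit width.
 */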
static void flt_reciprocal_fp(vec384 out, const vec384 inp)
{
    static const byte BLS12_381_P_minus_2[] = {
        TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff),
        TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf),
        TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a)
    };

    exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0);
}
#else
# define sqr(ret,a) sqr_fp(ret,a)
# define mul(ret,a,b) mul_fp(ret,a,b)
# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b)

# include "recip-addchain.h"
static void flt_reciprocal_fp(vec384 out, const vec384 inp)
{
    RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384);
}
# undef RECIPROCAL_MOD_BLS12_381_P
# undef sqr_n_mul
# undef mul
# undef sqr
#endif

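/*
 * For reference, the generated chain in recip-addchain.h consists mainly
 * of calls of the form
 *
 *     sqr_n_mul(t, t, n, b);
 *
 * that is, n Montgomery squarings of t followed by a single
 * multiplication by b (t = t^(2^n) * b), which is what lets the chain
 * beat the generic square-and-multiply loop by the margin quoted above.
 */
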
static void flt_reciprocal_fp2(vec384x out, const vec384x inp)
{
    vec384 t0, t1;

    /*
     * |out| = 1/(a + b*i) = (a - b*i)/((a + b*i)*(a - b*i))
     *       = a/(a^2+b^2) - b/(a^2+b^2)*i
     * (multiplying through by the conjugate; i^2 = -1 turns the
     * denominator into the real value a^2+b^2)
     */
    sqr_fp(t0, inp[0]);
    sqr_fp(t1, inp[1]);
    add_fp(t0, t0, t1);
    flt_reciprocal_fp(t1, t0);
    mul_fp(out[0], inp[0], t1);
    mul_fp(out[1], inp[1], t1);
    neg_fp(out[1], out[1]);
}

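/*
 * Sanity check of the formula, with d = a^2 + b^2:
 *
 *     (a/d - (b/d)*i) * (a + b*i) = (a^2 + b^2)/d + (a*b - a*b)/d * i = 1
 */
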
static void reciprocal_fp(vec384 out, const vec384 inp)
{
    static const vec384 Px8 = {     /* left-aligned value of the modulus */
        TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd),
        TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb),
        TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2)
    };
#ifdef __BLST_NO_ASM__
# define RRx4 BLS12_381_RR
#else
    static const vec384 RRx4 = {    /* (4<<768)%P */
        TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8),
        TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983),
        TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175)
    };
#endif
    union { vec768 x; vec384 r[2]; } temp;

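    /*
     * Factor bookkeeping (inferred from the constants, not a documented
     * contract): the assembly ct_inverse_mod_383 appears to return
     * |inp|^-1 * 2^766 (mod P). redc_mont_384 contributes a factor of
     * 2^-384, and Montgomery multiplication by RRx4 = 4*2^768 contributes
     * 4*2^768*2^-384 = 2^386, for a net of 2^766 * 2^-384 * 2^386 = 2^768.
     * With |inp| itself in Montgomery form (inp*2^384), the result is
     * inp^-1 * 2^384, i.e. the inverse back in Montgomery form. Under
     * __BLST_NO_ASM__ the portable code scales by 2^768 instead, which
     * is why RRx4 falls back to plain BLS12_381_RR.
     */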
    ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8);
    redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0);
    mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0);

#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    /*
     * Self-check: |temp.r[0]|*|inp| comes out as 1 (in Montgomery form,
     * BLS12_381_Rx.p) for invertible canonical input, or as 0 for zero
     * input. Otherwise |inp| was apparently out of range, and control
     * goes straight to flt_reciprocal_fp.
     */
    mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0);
    if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) |
        vec_is_zero(temp.r[1], sizeof(vec384)))
        vec_copy(out, temp.r[0], sizeof(vec384));
    else
        flt_reciprocal_fp(out, inp);
#else
    vec_copy(out, temp.r[0], sizeof(vec384));
#endif
#undef RRx4
}

void blst_fp_inverse(vec384 out, const vec384 inp)
{ reciprocal_fp(out, inp); }

void blst_fp_eucl_inverse(vec384 ret, const vec384 a)
{ reciprocal_fp(ret, a); }

static void reciprocal_fp2(vec384x out, const vec384x inp)
{
    vec384 t0, t1;

    /*
     * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i
     */
    sqr_fp(t0, inp[0]);
    sqr_fp(t1, inp[1]);
    add_fp(t0, t0, t1);
    reciprocal_fp(t1, t0);
    mul_fp(out[0], inp[0], t1);
    mul_fp(out[1], inp[1], t1);
    neg_fp(out[1], out[1]);
}

void blst_fp2_inverse(vec384x out, const vec384x inp)
{ reciprocal_fp2(out, inp); }

void blst_fp2_eucl_inverse(vec384x out, const vec384x inp)
{ reciprocal_fp2(out, inp); }

static void reciprocal_fr(vec256 out, const vec256 inp)
{
    static const vec256 rx2 = {     /* left-aligned value of the modulus */
        TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd),
        TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90),
    };
    vec512 temp;

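    /*
     * Same bookkeeping as in reciprocal_fp (assuming ct_inverse_mod_256
     * returns |inp|^-1 * 2^512 (mod r)): redc_mont_256 contributes 2^-256
     * and Montgomery multiplication by BLS12_381_rRR = 2^512 mod r
     * contributes 2^512 * 2^-256 = 2^256, so a Montgomery-form input
     * comes out as inp^-1 * 2^256, i.e. the inverse in Montgomery form.
     */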
    ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2);
    redc_mont_256(out, temp, BLS12_381_r, r0);
    mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0);
}

void blst_fr_inverse(vec256 out, const vec256 inp)
{ reciprocal_fr(out, inp); }

void blst_fr_eucl_inverse(vec256 out, const vec256 inp)
{ reciprocal_fr(out, inp); }