ftu/blst/fields.h
2022-09-09 02:47:49 -04:00

212 lines
6.8 KiB
C

/*
* Copyright Supranational LLC
* Licensed under the Apache License, Version 2.0, see LICENSE for details.
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef __BLS12_381_ASM_FIELDS_H__
#define __BLS12_381_ASM_FIELDS_H__
#include "vect.h"
#include "consts.h"
#ifndef __CUDA_ARCH__
/*
* BLS12-381-specific Fp shortcuts to assembly.
*/
/* Fp addition shortcut: forwards |a| and |b| to add_mod_384 with BLS12_381_P. */
static inline void add_fp(vec384 ret, const vec384 a, const vec384 b)
{
    add_mod_384(ret, a, b, BLS12_381_P);
}
/* Fp subtraction shortcut: forwards |a| and |b| to sub_mod_384 with BLS12_381_P. */
static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b)
{
    sub_mod_384(ret, a, b, BLS12_381_P);
}
/* Fp "times three" shortcut: forwards |a| to mul_by_3_mod_384 with BLS12_381_P. */
static inline void mul_by_3_fp(vec384 ret, const vec384 a)
{
    mul_by_3_mod_384(ret, a, BLS12_381_P);
}
/* Fp "times eight" shortcut: forwards |a| to mul_by_8_mod_384 with BLS12_381_P. */
static inline void mul_by_8_fp(vec384 ret, const vec384 a)
{
    mul_by_8_mod_384(ret, a, BLS12_381_P);
}
/*
 * Fp left-shift shortcut: forwards |a| and the shift amount |count| to
 * lshift_mod_384 with BLS12_381_P.
 */
static inline void lshift_fp(vec384 ret, const vec384 a, size_t count)
{
    lshift_mod_384(ret, a, count, BLS12_381_P);
}
/*
 * Fp right-shift shortcut: forwards |a| and the shift amount |count| to
 * rshift_mod_384 with BLS12_381_P.
 */
static inline void rshift_fp(vec384 ret, const vec384 a, size_t count)
{
    rshift_mod_384(ret, a, count, BLS12_381_P);
}
/* Fp halving shortcut: forwards |a| to div_by_2_mod_384 with BLS12_381_P. */
static inline void div_by_2_fp(vec384 ret, const vec384 a)
{
    div_by_2_mod_384(ret, a, BLS12_381_P);
}
/*
 * Fp multiplication shortcut: forwards |a| and |b| to mul_mont_384 with
 * BLS12_381_P and p0.
 * NOTE(review): p0 is presumably the Montgomery reduction constant that
 * accompanies BLS12_381_P; it is defined outside this header - confirm.
 */
static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b)
{
    mul_mont_384(ret, a, b, BLS12_381_P, p0);
}
/* Fp squaring shortcut: forwards |a| to sqr_mont_384 with BLS12_381_P and p0. */
static inline void sqr_fp(vec384 ret, const vec384 a)
{
    sqr_mont_384(ret, a, BLS12_381_P, p0);
}
/*
 * Fp conditional-negation shortcut: forwards |a| and |flag| unchanged to
 * cneg_mod_384 with BLS12_381_P.
 */
static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag)
{
    cneg_mod_384(ret, a, flag, BLS12_381_P);
}
/*
 * Shortcut to from_mont_384 with BLS12_381_P and p0.
 * NOTE(review): presumably converts |a| out of Montgomery representation,
 * judging by the callee's name - confirm.
 */
static inline void from_fp(vec384 ret, const vec384 a)
{
    from_mont_384(ret, a, BLS12_381_P, p0);
}
/*
 * Shortcut to redc_mont_384 with BLS12_381_P and p0: takes the double-width
 * vec768 |a| and produces a vec384 in |ret|.
 */
static inline void redc_fp(vec384 ret, const vec768 a)
{
    redc_mont_384(ret, a, BLS12_381_P, p0);
}
/*
* BLS12-381-specific Fp2 shortcuts to assembly.
*/
/* Fp2 addition shortcut: forwards |a| and |b| to add_mod_384x with BLS12_381_P. */
static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b)
{
    add_mod_384x(ret, a, b, BLS12_381_P);
}
/* Fp2 subtraction shortcut: forwards |a| and |b| to sub_mod_384x with BLS12_381_P. */
static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b)
{
    sub_mod_384x(ret, a, b, BLS12_381_P);
}
/* Fp2 "times three" shortcut: forwards |a| to mul_by_3_mod_384x with BLS12_381_P. */
static inline void mul_by_3_fp2(vec384x ret, const vec384x a)
{
    mul_by_3_mod_384x(ret, a, BLS12_381_P);
}
/* Fp2 "times eight" shortcut: forwards |a| to mul_by_8_mod_384x with BLS12_381_P. */
static inline void mul_by_8_fp2(vec384x ret, const vec384x a)
{
    mul_by_8_mod_384x(ret, a, BLS12_381_P);
}
/*
 * Fp2 left shift: applies lshift_mod_384 with the same |count| to each of
 * the two vec384 components of |a|, component 0 first, then component 1.
 */
static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count)
{
    int idx;

    for (idx = 0; idx < 2; idx++) {
        lshift_mod_384(ret[idx], a[idx], count, BLS12_381_P);
    }
}
/*
 * Fp2 multiplication shortcut: forwards |a| and |b| to mul_mont_384x with
 * BLS12_381_P and p0.
 */
static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b)
{
    mul_mont_384x(ret, a, b, BLS12_381_P, p0);
}
/* Fp2 squaring shortcut: forwards |a| to sqr_mont_384x with BLS12_381_P and p0. */
static inline void sqr_fp2(vec384x ret, const vec384x a)
{
    sqr_mont_384x(ret, a, BLS12_381_P, p0);
}
/*
 * Fp2 conditional negation: applies cneg_mod_384 with the same |flag| to
 * each of the two vec384 components of |a|, component 0 first.
 */
static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag)
{
    int idx;

    for (idx = 0; idx < 2; idx++) {
        cneg_mod_384(ret[idx], a[idx], flag, BLS12_381_P);
    }
}
/*
 * In this (non-__CUDA_ARCH__) branch vec_load_global is merely another name
 * for vec_copy; the __CUDA_ARCH__ branch declares a dedicated __device__
 * function under the same name instead.
 */
#define vec_load_global vec_copy
/*
 * Forward declarations of the Fp/Fp2 reciprocal and square-root helpers.
 * They are static and have no body in this header, so the translation unit
 * that includes it is expected to supply the definitions.
 * NOTE(review): the flt_ prefix presumably stands for Fermat's little
 * theorem, and the bool_t results presumably report whether a root
 * exists - confirm both against the implementations.
 */
static void reciprocal_fp(vec384 out, const vec384 inp);
static void flt_reciprocal_fp(vec384 out, const vec384 inp);
static bool_t recip_sqrt_fp(vec384 out, const vec384 inp);
static bool_t sqrt_fp(vec384 out, const vec384 inp);
static void reciprocal_fp2(vec384x out, const vec384x inp);
static void flt_reciprocal_fp2(vec384x out, const vec384x inp);
static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp,
const vec384x recip_ZZZ, const vec384x magic_ZZZ);
static bool_t sqrt_fp2(vec384x out, const vec384x inp);
static bool_t sqrt_align_fp2(vec384x out, const vec384x ret,
const vec384x sqrt, const vec384x inp);
/*
 * Extension-tower types layered on vec384x: an fp6 value is an array of
 * three fp2 values and an fp12 value is an array of two fp6 values.
 */
typedef vec384x vec384fp2;
typedef vec384fp2 vec384fp6[3];
typedef vec384fp6 vec384fp12[2];
/*
 * Forward declarations of the Fp12 routines; like the helpers above they
 * are static with no body here, so definitions must come from the including
 * translation unit. conjugate_fp12 takes a single non-const argument, i.e.
 * it operates on |a| in place.
 */
static void sqr_fp12(vec384fp12 ret, const vec384fp12 a);
static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a);
static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b);
static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a,
const vec384fp6 xy00z0);
static void conjugate_fp12(vec384fp12 a);
static void inverse_fp12(vec384fp12 ret, const vec384fp12 a);
/* caveat lector! |n| has to be non-zero and not more than 3! */
static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n);
#else
/*
 * Device-side primitives used when __CUDA_ARCH__ is defined. They are
 * declared with C linkage and only as prototypes, so their definitions have
 * to be provided elsewhere in the build.
 */
extern "C" {
__device__ void mul_fp(vec384 ret, const vec384 a, const vec384 b);
__device__ void sqr_fp(vec384 ret, const vec384 a);
__device__ void add_fp(vec384 ret, const vec384 a, const vec384 b);
__device__ void sub_fp(vec384 ret, const vec384 a, const vec384 b);
__device__ void cneg_fp(vec384 ret, const vec384 ap, unsigned int flag);
__device__ void rshift_fp(vec384 ret, const vec384 a, unsigned int cnt);
__device__ void lshift_fp(vec384 ret, const vec384 a, unsigned int cnt);
__device__ void mul_by_3_fp(vec384 ret, const vec384 a);
__device__ void from_fp(vec384 ret, const vec384 a);
/*
 * NOTE(review): diagnostic 3151 is silenced only for the two double-width
 * prototypes below and restored to its default right after - confirm which
 * warning that number denotes for the compiler in use.
 */
#pragma diag_suppress 3151
__device__ void mul_384(vec768 ret, const vec384 a, const vec384 b);
__device__ void sqr_384(vec768 ret, const vec384 a);
#pragma diag_default 3151
__device__ void redc_fp(vec384 ret, const vec768 a);
__device__ void add_fpx2(vec768 ret, const vec768 a, const vec768 b);
__device__ void sub_fpx2(vec768 ret, const vec768 a, const vec768 b);
/*
 * |sz| defaults to 48 - presumably the size of a vec384 in bytes
 * (384 bits / 8); TODO confirm the unit against the definition.
 */
__device__ void vec_load_global(limb_t *ret, const limb_t *a,
unsigned int sz = 48);
}
/* Multiplication by 8 expressed as a left shift by 3 bits via lshift_fp. */
static inline void mul_by_8_fp(vec384 ret, const vec384 a)
{
    lshift_fp(ret, a, 3);
}
/*
 * Fp2 addition built from add_fp: the two vec384 components are added
 * independently, component 0 first and component 1 second.
 */
static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b)
{
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        add_fp(ret[idx], a[idx], b[idx]);
    }
}
/*
 * Fp2 subtraction built from sub_fp: the two vec384 components are
 * subtracted independently, component 0 first and component 1 second.
 */
static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b)
{
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        sub_fp(ret[idx], a[idx], b[idx]);
    }
}
/*
 * Fp2 "times three" built from mul_by_3_fp, applied to component 0 and
 * then to component 1.
 */
static inline void mul_by_3_fp2(vec384x ret, const vec384x a)
{
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        mul_by_3_fp(ret[idx], a[idx]);
    }
}
/*
 * Fp2 "times eight": each component is left-shifted by 3 bits through
 * lshift_fp, component 0 first and component 1 second.
 */
static inline void mul_by_8_fp2(vec384x ret, const vec384x a)
{
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        lshift_fp(ret[idx], a[idx], 3);
    }
}
/*
 * Fp2 left shift: each component goes through lshift_fp with the same
 * |count|, component 0 first. Note that |count| is a size_t here while
 * lshift_fp is declared with an unsigned int shift amount, so the value is
 * converted implicitly at each call.
 */
static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count)
{
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        lshift_fp(ret[idx], a[idx], count);
    }
}
/*
 * Fp2 conditional negation: each component goes through cneg_fp with the
 * same |flag|, component 0 first. |flag| is a limb_t here while cneg_fp is
 * declared with an unsigned int flag, so the value is converted implicitly
 * at each call.
 */
static inline void cneg_fp2(vec384x ret, const vec384x a, limb_t flag)
{
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        cneg_fp(ret[idx], a[idx], flag);
    }
}
/*
 * Fp2 multiplication composed from Fp primitives using three mul_fp calls
 * instead of four. With a = a0 + a1*i and b = b0 + b1*i the steps compute
 *   ret[0] = a0*b0 - a1*b1
 *   ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1
 * All reads of |a| and |b| happen before the first write to |ret|.
 */
static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b)
{
    vec384 lo, mid, hi;

    add_fp(lo, a[0], a[1]);        /* lo  = a0 + a1 (scratch use)      */
    add_fp(mid, b[0], b[1]);       /* mid = b0 + b1                    */
    mul_fp(mid, mid, lo);          /* mid = (a0 + a1)*(b0 + b1)        */
    mul_fp(lo, a[0], b[0]);        /* lo  = a0*b0                      */
    mul_fp(hi, a[1], b[1]);        /* hi  = a1*b1                      */

    sub_fp(ret[0], lo, hi);        /* first component                  */
    sub_fp(ret[1], mid, lo);       /* second component, in two steps   */
    sub_fp(ret[1], ret[1], hi);
}
/*
 * Fp2 squaring composed from Fp primitives. With a = a0 + a1*i:
 *   ret[1] = 2*a0*a1
 *   ret[0] = (a0 + a1)*(a0 - a1)
 * The sum and difference are captured in locals before anything is written
 * to |ret|, and ret[1] is produced before ret[0]; the statement order is
 * kept as is.
 */
static inline void sqr_fp2(vec384x ret, const vec384x a)
{
    vec384 sum, diff;

    add_fp(sum, a[0], a[1]);            /* sum  = a0 + a1 */
    sub_fp(diff, a[0], a[1]);           /* diff = a0 - a1 */

    mul_fp(ret[1], a[0], a[1]);         /* a0*a1 ...      */
    add_fp(ret[1], ret[1], ret[1]);     /* ... doubled    */

    mul_fp(ret[0], sum, diff);
}
#endif
/*
 * Unconditional negation for Fp and Fp2: the conditional-negation routine
 * invoked with its flag fixed to 1. Defined as macros after the #endif, so
 * they resolve to whichever cneg_fp/cneg_fp2 the active branch provides.
 */
#define neg_fp(r,a) cneg_fp((r),(a),1)
#define neg_fp2(r,a) cneg_fp2((r),(a),1)
#endif /* __BLS12_381_ASM_FIELDS_H__ */