ftu/blst/bulk_addition.c
2022-09-09 02:47:49 -04:00

169 lines
5.6 KiB
C

/*
* Copyright Supranational LLC
* Licensed under the Apache License, Version 2.0, see LICENSE for details.
* SPDX-License-Identifier: Apache-2.0
*/
#include "fields.h"
#include "point.h"
/*
* This implementation uses explicit addition formula:
*
* λ = (Y₂-Y₁)/(X₂-X₁)
* X₃ = λ²-(X₁+X₂)
* Y₃ = λ⋅(X₁-X₃)-Y₁
*
* But since we don't know if we'll have to add point to itself, we need
* to eventually resort to corresponding doubling formula:
*
* λ = 3X₁²/2Y₁
* X₃ = λ²-2X₁
* Y₃ = λ⋅(X₁-X₃)-Y₁
*
* The formulae use prohibitively expensive inversion, but whenever we
* have a lot of affine points to accumulate, we can amortize the cost
* by applying Montgomery's batch inversion approach. As a result,
* asymptotic[!] per-point cost for addition is as small as 5M+1S. For
* comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things
* considered, the improvement coefficient varies from 60% to 85%
* depending on platform and curve.
*
* THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an
* application that requires constant time-ness, speak up!]
*/
/*
* Calculate λ's numerator and denominator.
*
* input: A x1 y1 -
* B x2 y2 -
* output:
* if A!=B: A x1 y1 (x2-x1)*mul_acc
* B x2+x1 y2-y1 (x2-x1)
*
* if A==B: A x y 2y*mul_acc
* B 2x 3*x^2 2y
*
* if A==-B: A 0 0 1*mul_acc
* B 0 3*x^2 0
*/
#define HEAD(ptype, bits, field, one) \
static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \
{ \
ptype *A = AB, *B = AB+1; \
limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \
vec_is_zero(B, sizeof(ptype##_affine)); \
static const vec##bits zero = { 0 }; \
\
sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \
add_##field(B->X, B->X, A->X); /* X2+X1 */ \
add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \
sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \
if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \
inf = vec_is_zero(A->Z, sizeof(A->Z)); \
vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \
sqr_##field(B->Y, A->X); \
mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \
vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \
} /* B->Y is numenator */ \
/* B->Z is denominator */ \
vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \
vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \
vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \
vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \
if (mul_acc != NULL) \
mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\
}
/*
* Calculate λ and resulting coordinates.
*
* input: A x1 y1 -
* B x2+x1 nominator -
* lambda 1/denominator
* output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1
*/
#define TAIL(ptype, bits, field, one) \
static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \
{ \
ptype *A = AB, *B = AB+1; \
vec##bits llambda; \
limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \
\
mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \
/* alt. 3*X1^2/2*Y1 */ \
sqr_##field(llambda, lambda); \
sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \
\
sub_##field(D->Y, A->X, D->X); \
mul_##field(D->Y, D->Y, lambda); \
sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \
\
vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \
vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \
}
/*
* |points[]| is volatile buffer with |X|s and |Y|s initially holding
* input affine coordinates, and with |Z|s being used as additional
* temporary storage [unrelated to Jacobian coordinates]. |sum| is
* in-/output, initialize to infinity accordingly.
*/
#define ADDITION_BTREE(prefix, ptype, bits, field, one) \
HEAD(ptype, bits, field, one) \
TAIL(ptype, bits, field, one) \
static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \
{ \
ptype *dst; \
void *mul_acc; \
size_t i; \
\
while (n >= 16) { \
if (n & 1) \
ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \
n /= 2; \
for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \
ptype##_head(points, mul_acc); \
\
reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \
\
for (dst = points, i = n; --i;) { \
dst--; points -= 2; \
mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \
ptype##_tail(dst, points, points[-2].Z); \
mul_##field(points[-2].Z, points[0].Z, points[1].Z); \
} \
dst--; points -= 2; \
ptype##_tail(dst, points, points[0].Z); \
points = dst; \
} \
while (n--) \
ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \
} \
\
void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \
size_t npoints) \
{ \
/* Performance with 288K scratch is within 1-2-3% from optimal */ \
const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \
ptype *scratch = alloca((npoints > stride ? stride : npoints) * \
sizeof(ptype)); \
const ptype##_affine *point = NULL; \
\
vec_zero(sum, sizeof(*sum)); \
while (npoints) { \
size_t i, j = npoints > stride ? stride : npoints; \
for (i=0; i<j; i++) { \
point = *points ? *points++ : point+1; \
vec_copy(&scratch[i], point, sizeof(*point)); \
} \
ptype##s_accumulate(sum, scratch, j); \
npoints -= j; \
} \
}
ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2)
ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)