ftu/blst/bulk_addition.c

/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"
#include "point.h"

/*
 * This implementation uses explicit addition formula:
 *
 * λ = (Y₂-Y₁)/(X₂-X₁)
 * X₃ = λ²-(X₁+X₂)
 * Y₃ = λ⋅(X₁-X₃)-Y₁
 *
 * But since we don't know if we'll have to add point to itself, we need
 * to eventually resort to corresponding doubling formula:
 *
 * λ = 3X₁²/2Y₁
 * X₃ = λ²-2X₁
 * Y₃ = λ⋅(X₁-X₃)-Y₁
 *
 * The formulae use prohibitively expensive inversion, but whenever we
 * have a lot of affine points to accumulate, we can amortize the cost
 * by applying Montgomery's batch inversion approach. As a result,
 * asymptotic[!] per-point cost for addition is as small as 5M+1S. For
 * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things
 * considered, the improvement coefficient varies from 60% to 85%
 * depending on platform and curve.
 *
 * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an
 * application that requires constant time-ness, speak up!]
 */

/*
 * Calculate λ's numerator and denominator.
 *
 * input:	A	x1	y1	-
 *		B	x2	y2	-
 * output:
 * if A!=B:	A	x1	y1	(x2-x1)*mul_acc
 *		B	x2+x1	y2-y1	(x2-x1)
 *
 * if A==B:	A	x	y	2y*mul_acc
 *		B	2x	3*x^2	2y
 *
 * if A==-B:	A	0	0	1*mul_acc
 *		B	0	3*x^2	0
 */
#define HEAD(ptype, bits, field, one) \
static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \
{ \
    ptype *A = AB, *B = AB+1; \
    limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \
                 vec_is_zero(B, sizeof(ptype##_affine));  \
    static const vec##bits zero = { 0 }; \
\
    sub_##field(B->Z, B->X, A->X);		/* X2-X1  */ \
    add_##field(B->X, B->X, A->X);		/* X2+X1  */ \
    add_##field(A->Z, B->Y, A->Y);		/* Y2+Y1  */ \
    sub_##field(B->Y, B->Y, A->Y);		/* Y2-Y1  */ \
    if (vec_is_zero(B->Z, sizeof(B->Z))) {	/* X2==X1 */ \
        inf = vec_is_zero(A->Z, sizeof(A->Z));	\
        vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \
        sqr_##field(B->Y, A->X);		\
        mul_by_3_##field(B->Y, B->Y);		/* 3*X1^2 */ \
        vec_copy(B->Z, A->Z, sizeof(B->Z));	/* 2*Y1   */ \
    }						/* B->Y is numenator    */ \
						/* B->Z is denominator  */ \
    vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \
    vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \
    vec_select(A->Z, one,  B->Z, sizeof(A->Z), inf); \
    vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \
    if (mul_acc != NULL) \
        mul_##field(A->Z, A->Z, mul_acc);	/* chain multiplication */\
}

/*
 * Calculate λ and resulting coordinates.
 *
 * input:	A		x1			y1		-
 *		B		x2+x1			nominator	-
 * 		lambda		1/denominator
 * output:	D		x3=(nom/den)^2-(x2+x1)	y3=(nom/den)(x1-x3)-y1
 */
#define TAIL(ptype, bits, field, one) \
static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \
{ \
    ptype *A = AB, *B = AB+1; \
    vec##bits llambda; \
    limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \
\
    mul_##field(lambda, lambda, B->Y);		/* λ = (Y2-Y1)/(X2-X1)  */ \
						/* alt. 3*X1^2/2*Y1     */ \
    sqr_##field(llambda, lambda); \
    sub_##field(D->X, llambda, B->X);		/* X3 = λ^2-X1-X2       */ \
\
    sub_##field(D->Y, A->X, D->X);   \
    mul_##field(D->Y, D->Y, lambda); \
    sub_##field(D->Y, D->Y, A->Y);		/* Y3 = λ*(X1-X3)-Y1    */ \
\
    vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \
    vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \
}

/*
 * |points[]| is volatile buffer with |X|s and |Y|s initially holding
 * input affine coordinates, and with |Z|s being used as additional
 * temporary storage [unrelated to Jacobian coordinates]. |sum| is
 * in-/output, initialize to infinity accordingly.
 */
#define ADDITION_BTREE(prefix, ptype, bits, field, one) \
HEAD(ptype, bits, field, one) \
TAIL(ptype, bits, field, one) \
static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \
{ \
    ptype *dst; \
    void *mul_acc; \
    size_t i; \
\
    while (n >= 16) { \
        if (n & 1) \
            ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \
        n /= 2; \
        for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \
            ptype##_head(points, mul_acc); \
\
        reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \
\
        for (dst = points, i = n; --i;) { \
            dst--; points -= 2; \
            mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \
            ptype##_tail(dst, points, points[-2].Z); \
            mul_##field(points[-2].Z, points[0].Z, points[1].Z); \
        } \
        dst--; points -= 2; \
        ptype##_tail(dst, points, points[0].Z); \
        points = dst; \
    } \
    while (n--) \
        ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \
} \
\
void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \
                               size_t npoints) \
{ \
    /* Performance with 288K scratch is within 1-2-3% from optimal */ \
    const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \
    ptype *scratch = alloca((npoints > stride ? stride : npoints) * \
                            sizeof(ptype)); \
    const ptype##_affine *point = NULL; \
\
    vec_zero(sum, sizeof(*sum)); \
    while (npoints) { \
        size_t i, j = npoints > stride ? stride : npoints; \
        for (i=0; i<j; i++) { \
            point = *points ? *points++ : point+1; \
            vec_copy(&scratch[i], point, sizeof(*point)); \
        } \
        ptype##s_accumulate(sum, scratch, j); \
        npoints -= j; \
    } \
}

ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2)

ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
initial stuff 2022-09-09 06:47:49 +00:00			`/*`
			`* Copyright Supranational LLC`
			`* Licensed under the Apache License, Version 2.0, see LICENSE for details.`
			`* SPDX-License-Identifier: Apache-2.0`
			`*/`

			`#include "fields.h"`
			`#include "point.h"`

			`/*`
			`* This implementation uses explicit addition formula:`
			`*`
			`* λ = (Y₂-Y₁)/(X₂-X₁)`
			`* X₃ = λ²-(X₁+X₂)`
			`* Y₃ = λ⋅(X₁-X₃)-Y₁`
			`*`
			`* But since we don't know if we'll have to add point to itself, we need`
			`* to eventually resort to corresponding doubling formula:`
			`*`
			`* λ = 3X₁²/2Y₁`
			`* X₃ = λ²-2X₁`
			`* Y₃ = λ⋅(X₁-X₃)-Y₁`
			`*`
			`* The formulae use prohibitively expensive inversion, but whenever we`
			`* have a lot of affine points to accumulate, we can amortize the cost`
			`* by applying Montgomery's batch inversion approach. As a result,`
			`* asymptotic[!] per-point cost for addition is as small as 5M+1S. For`
			`* comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things`
			`* considered, the improvement coefficient varies from 60% to 85%`
			`* depending on platform and curve.`
			`*`
			`* THIS IMPLEMENTATION IS NOT CONSTANT-TIME. [But if there is an`
			`* application that requires constant time-ness, speak up!]`
			`*/`

			`/*`
			`* Calculate λ's numerator and denominator.`
			`*`
			`* input: A x1 y1 -`
			`* B x2 y2 -`
			`* output:`
			`* if A!=B: A x1 y1 (x2-x1)*mul_acc`
			`* B x2+x1 y2-y1 (x2-x1)`
			`*`
			`* if A==B: A x y 2y*mul_acc`
			`* B 2x 3*x^2 2y`
			`*`
			`* if A==-B: A 0 0 1*mul_acc`
			`* B 0 3*x^2 0`
			`*/`
			`#define HEAD(ptype, bits, field, one) \`
			`static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \`
			`{ \`
			`ptype A = AB, B = AB+1; \`
			`limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) \| \`
			`vec_is_zero(B, sizeof(ptype##_affine)); \`
			`static const vec##bits zero = { 0 }; \`
			`\`
			`sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \`
			`add_##field(B->X, B->X, A->X); /* X2+X1 */ \`
			`add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \`
			`sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \`
			`if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \`
			`inf = vec_is_zero(A->Z, sizeof(A->Z)); \`
			`vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \`
			`sqr_##field(B->Y, A->X); \`
			`mul_by_3_##field(B->Y, B->Y); /* 3X1^2 / \`
			`vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2Y1 / \`
			`} /* B->Y is numenator */ \`
			`/* B->Z is denominator */ \`
			`vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \`
			`vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \`
			`vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \`
			`vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \`
			`if (mul_acc != NULL) \`
			`mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\`
			`}`

			`/*`
			`* Calculate λ and resulting coordinates.`
			`*`
			`* input: A x1 y1 -`
			`* B x2+x1 nominator -`
			`* lambda 1/denominator`
			`* output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1`
			`*/`
			`#define TAIL(ptype, bits, field, one) \`
			`static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \`
			`{ \`
			`ptype A = AB, B = AB+1; \`
			`vec##bits llambda; \`
			`limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \`
			`\`
			`mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \`
			`/* alt. 3X1^2/2Y1 */ \`
			`sqr_##field(llambda, lambda); \`
			`sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \`
			`\`
			`sub_##field(D->Y, A->X, D->X); \`
			`mul_##field(D->Y, D->Y, lambda); \`
			`sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ(X1-X3)-Y1 / \`
			`\`
			`vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \`
			`vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \`
			`}`

			`/*`
			`* \|points[]\| is volatile buffer with \|X\|s and \|Y\|s initially holding`
			`* input affine coordinates, and with \|Z\|s being used as additional`
			`* temporary storage [unrelated to Jacobian coordinates]. \|sum\| is`
			`* in-/output, initialize to infinity accordingly.`
			`*/`
			`#define ADDITION_BTREE(prefix, ptype, bits, field, one) \`
			`HEAD(ptype, bits, field, one) \`
			`TAIL(ptype, bits, field, one) \`
			`static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \`
			`{ \`
			`ptype *dst; \`
			`void *mul_acc; \`
			`size_t i; \`
			`\`
			`while (n >= 16) { \`
			`if (n & 1) \`
			`ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \`
			`n /= 2; \`
			`for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \`
			`ptype##_head(points, mul_acc); \`
			`\`
			`reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \`
			`\`
			`for (dst = points, i = n; --i;) { \`
			`dst--; points -= 2; \`
			`mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \`
			`ptype##_tail(dst, points, points[-2].Z); \`
			`mul_##field(points[-2].Z, points[0].Z, points[1].Z); \`
			`} \`
			`dst--; points -= 2; \`
			`ptype##_tail(dst, points, points[0].Z); \`
			`points = dst; \`
			`} \`
			`while (n--) \`
			`ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \`
			`} \`
			`\`
			`void prefix##s_add(ptype sum, const ptype##_affine const points[], \`
			`size_t npoints) \`
			`{ \`
			`/* Performance with 288K scratch is within 1-2-3% from optimal */ \`
			`const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \`
			`ptype scratch = alloca((npoints > stride ? stride : npoints) \`
			`sizeof(ptype)); \`
			`const ptype##_affine *point = NULL; \`
			`\`
			`vec_zero(sum, sizeof(*sum)); \`
			`while (npoints) { \`
			`size_t i, j = npoints > stride ? stride : npoints; \`
			`for (i=0; i<j; i++) { \`
			`point = points ? points++ : point+1; \`
			`vec_copy(&scratch[i], point, sizeof(*point)); \`
			`} \`
			`ptype##s_accumulate(sum, scratch, j); \`
			`npoints -= j; \`
			`} \`
			`}`

			`ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2)`

			`ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)`