commit 943c07066e737a935b3b7f345610cdb5077d7633
Author: John Doe <johndoe@example.com>
Date:   Fri Sep 9 02:47:49 2022 -0400

    initial stuff

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..b3d8c2f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,13 @@
+{
+    "files.associations": {
+        "vector": "c",
+        "memory": "c",
+        "optional": "c",
+        "string_view": "c",
+        "string": "c",
+        "system_error": "c",
+        "thread": "c",
+        "typeindex": "c",
+        "variant": "c"
+    }
+}
\ No newline at end of file
diff --git a/assembly.o b/assembly.o
new file mode 100644
index 0000000..5ade239
Binary files /dev/null and b/assembly.o differ
diff --git a/blindsig.c b/blindsig.c
new file mode 100644
index 0000000..219290b
--- /dev/null
+++ b/blindsig.c
@@ -0,0 +1,182 @@
+// This is a (very rough) test of BLST blind signatures, based on run.me from BLST's Python example code.
+// Do not trust this to be secure; it also skips most of the sanity checking for now.
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "blst/blst.h"
+
+const byte dst[] = "MY-DST";
+double time_taken;
+clock_t t;
+
+byte signer_private_key[32];
+byte signer_public_key[96];
+
+void printbytes(byte *toprint, int length){
+    for(int i = 0; i < length; i++){
+        printf("%.2x ", toprint[i]);
+    }
+    printf("\n");
+}
+
+void signer_key_setup(){
+    blst_scalar sk;
+    blst_p2 pk;
+    blst_p2_affine pk_affine;
+
+    byte myikm[32] = {'*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'};
+
+    // On the signer's side:
+    printf("IKM: ");
+    printbytes(myikm, 32);
+
+    blst_keygen(&sk, myikm, 32, 0, 0);
+
+    blst_bendian_from_scalar(signer_private_key, &sk);
+    printf("Secret Key: ");
+    printbytes(signer_private_key, 32);
+
+    blst_sk_to_pk_in_g2(&pk, &sk);
+
+    blst_p2_to_affine(&pk_affine, &pk);
+
+    blst_p2_affine_compress(signer_public_key, &pk_affine);
+    printf("Compressed Public Key (affine): ");
+    printbytes(signer_public_key, 96);
+}
+
+void signer(byte *compressed_signature, byte *msg_for_wire){
+    blst_scalar sk;
+    blst_p1 msg, signature;
+    blst_p1_affine msg_affine;
+    byte debug_print_buf[256];
+
+    // Get the secret key as a scalar
+    blst_scalar_from_bendian(&sk, signer_private_key);
+
+    // Deserialize the message - it is already a serialized P1 point, so it is not hashed again here
+    blst_p1_uncompress(&msg_affine, msg_for_wire);
+
+    // Deserialization yields an affine point; convert it back to a (Jacobian) P1
+    blst_p1_from_affine(&msg, &msg_affine);
+
+    // Confirm the message point is in the G1 group
+    assert(blst_p1_in_g1(&msg));
+
+    // Sign the message point with the secret key
+    blst_sign_pk_in_g2(&signature, &msg, &sk);
+
+    // Serialize and print the signature
+    blst_p1_serialize(debug_print_buf, &signature);
+    printf("Signature: ");
+    printbytes(debug_print_buf, 96);
+
+    // Compress and print the signature
+    blst_p1_compress(compressed_signature, &signature);
+    printf("Compressed Signature: ");
+    printbytes(compressed_signature, 48);
+}
+
+void verifier(byte *compressed_signature, byte *msg){
+    blst_p1_affine sig;
+    blst_p2_affine pk;
+
+    blst_p1_uncompress(&sig, compressed_signature);
+    blst_p2_uncompress(&pk, signer_public_key);
+
+    BLST_ERROR returned;
+
+    // TODO: check that the public key is in the G2 group
+
+    returned = blst_core_verify_pk_in_g2(&pk, &sig, 1, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96);
+
+    if(returned == BLST_SUCCESS){
+        printf("Verified!\n");
+    }else{
+        printf("Not verified!\n");
+    }
+}
+
+// main is the "user" in this test
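+// How the blinding in main() works, as a sketch of the math (sk = signer_private_key,
+// H(m) = the message hashed to a G1 point, r = blinding_r):
+//
+//   blinded message:     m' = r * H(m)
+//   blinded signature:   s' = sk * m' = r * (sk * H(m))
+//   unblinded signature: s  = r^(-1) * s' = sk * H(m)
+//
+// The unblinded result is an ordinary BLS signature on m, so verifier() can check it
+// against the signer's public key, while the signer only ever sees the blinded point m'.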
"user" in this test +int main(){ + byte debug_print_buf[256]; + byte compressed_blinded_signature[48]; + byte compressed_signature[48]; + byte msg[] = "assertion"; + byte blinding_r_bytes[32] = {'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R'}; + blst_scalar blinding_r, inverse_blinding_r; + blst_p1 hash, msg_for_wire; + byte msg_for_wire_bytes[96]; + blst_p1_affine returned_signature_affine; + blst_p1 returned_signature, unblinded_signature; + + printf("msg is now %s\n", msg); + + // Set up the signer's keys first so that we can know its public key + signer_key_setup(); + + // Get a hash of the message - we put the signer's public key in aug here, I don't know why + blst_hash_to_g1(&hash, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96); + + printf("HASH: "); + blst_p1_serialize(debug_print_buf, &hash); + printbytes(debug_print_buf, 96); + + // Get a BLST scalar of your "random" (LOL) blinding factor r + blst_scalar_from_bendian(&blinding_r, blinding_r_bytes); + + printf("R BYTES: "); + printbytes(blinding_r_bytes, 32); + + // Blind the message by signing it with the blinding factor R as if it was a secret key + blst_sign_pk_in_g2(&msg_for_wire, &hash, &blinding_r); + + // Serialize the blinded message to send it over the wire + blst_p1_compress(msg_for_wire_bytes, &msg_for_wire); + + printf("Blinded and compressed for wire: "); + printbytes(msg_for_wire_bytes, 48); + + // Send the message off to be signed and get the results back + signer(compressed_blinded_signature, msg_for_wire_bytes); + + printf("COMPRESSED BLINDED SIG: "); + printbytes(compressed_blinded_signature, 48); + + // We now have the signature back. returned_signature is a blst_p1_affine because this is pk_in_g2. + blst_p1_uncompress(&returned_signature_affine, compressed_blinded_signature); + + // Convert the uncompressed returned signature from an affine to a P1 + blst_p1_from_affine(&returned_signature, &returned_signature_affine); + + // Confirm the signature point is in the G1 group + assert(blst_p1_in_g1(&returned_signature)); + + printf("RETURNED SIGNATURE: "); + blst_p1_serialize(debug_print_buf, &returned_signature); + printbytes(debug_print_buf, 96); + + // Get the inverse of R. We'll need this to unblind the signature. + blst_sk_inverse(&inverse_blinding_r, &blinding_r); + + // Print the inverse of R + printf("INVERSE R: "); + blst_bendian_from_scalar(debug_print_buf, &inverse_blinding_r); + printbytes(debug_print_buf, 32); + + // Sign the blinded signature we get back from the signer with the inverse of the blinding factor + blst_sign_pk_in_g2(&unblinded_signature, &returned_signature, &inverse_blinding_r); + + blst_p1_compress(compressed_signature, &unblinded_signature); + + printf("UNBLINDED SIGNATURE: "); + printbytes(compressed_signature, 48); + + //msg[8] = 'A'; + + printf("msg is now %s\n", msg); + + // Now on verifier's side (after compressed_signature, serialized_public_key, and msg are passed over the network) + verifier(compressed_signature, msg); +} \ No newline at end of file diff --git a/blst/aggregate.c b/blst/aggregate.c new file mode 100644 index 0000000..f2c4be7 --- /dev/null +++ b/blst/aggregate.c @@ -0,0 +1,674 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Usage pattern on single-processor system is + * + * blst_pairing_init(ctx, hash_or_encode, DST); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]); + * ... + * blst_pairing_commit(ctx); + * blst_pairing_finalverify(ctx, NULL); + * + *********************************************************************** + * Usage pattern on multi-processor system is + * + * blst_pairing_init(pk[0], hash_or_encode, DST); + * blst_pairing_init(pk[1], hash_or_encode, DST); + * ... + * start threads each processing an N/nthreads slice of PKs and messages: + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]); + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]); + * ... + * blst_pairing_commit(pkx); + * ... + * meanwhile in main thread + * blst_fp12 gtsig; + * blst_aggregated_in_g2(>sig, aggregated_signature); + * join threads and merge their contexts: + * blst_pairing_merge(pk[0], pk[1]); + * blst_pairing_merge(pk[0], pk[2]); + * ... + * blst_pairing_finalverify(pk[0], gtsig); + */ + +#ifndef N_MAX +# define N_MAX 8 +#endif + +typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; +typedef struct { + unsigned int ctrl; + unsigned int nelems; + const void *DST; + size_t DST_len; + vec384fp12 GT; + AggregatedSignature AggrSign; + POINTonE2_affine Q[N_MAX]; + POINTonE1_affine P[N_MAX]; +} PAIRING; + +enum { AGGR_UNDEFINED = 0, + AGGR_MIN_SIG = 1, + AGGR_MIN_PK = 2, + AGGR_SIGN_SET = 0x10, + AGGR_GT_SET = 0x20, + AGGR_HASH_OR_ENCODE = 0x40 }; +#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK) + +static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7; + +size_t blst_pairing_sizeof(void) +{ return sizeof_pairing; } + +void blst_pairing_init(PAIRING *ctx, int hash_or_encode, + const void *DST, size_t DST_len) +{ + ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx->nelems = 0; + ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 + : DST; + ctx->DST_len = DST_len; +} + +static const void *pairing_get_dst(const PAIRING *ctx) +{ return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing + : ctx->DST; +} + +const void *blst_pairing_get_dst(const PAIRING *ctx) +{ return pairing_get_dst(ctx); } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) + +/* + * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated + * signature vetification as discussed at + * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. + * Usage pattern is not finalized yet, because (sig != NULL) is better and + * will be handled separately... + */ +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_groupcheck, + const POINTonE1_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_SIG; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. 
+ */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE1 *S = &ctx->AggrSign.e1; + POINTonE1 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + POINTonE1_mult_w5(P, P, scalar, nbits); + POINTonE1_dadd(S, S, P, NULL); + } else { + POINTonE1_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE1 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE2 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(H, H, scalar, nbits); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_groupcheck, + const POINTonE2_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_PK; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is 
to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE2 *S = &ctx->AggrSign.e2; + POINTonE2 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + + POINTonE2_mult_w5(P, P, scalar, nbits); + POINTonE2_dadd(S, S, P, NULL); + } else { + POINTonE2_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE2_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE2 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE1 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + if (nbits != 0 && scalar != NULL) { + POINTonE1 pk[1]; + + FROM_AFFINE(pk, PK); + POINTonE1_mult_w5(pk, pk, scalar, nbits); + POINTonE1_from_Jacobian(pk, pk); + PK = (const POINTonE1_affine *)pk; + } + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + unsigned int n; + + if ((n = ctx->nelems) != 0) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, 
ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + break; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->ctrl & AGGR_GT_SET) { + ctx->ctrl |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->ctrl & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->ctrl & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + /* + * The aggregated signature was infinite, relation between the + * hashes and the public keys has to be VERY special... 
+ */ + vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return (int)PAIRING_FinalVerify(ctx, GTsig); } + +int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) +{ + vec384fp12 GT; + + vec_copy(GT, GT1, sizeof(GT)); + conjugate_fp12(GT); + mul_fp12(GT, GT, GT2); + final_exp(GT, GT); + + /* return GT==1 */ + return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); +} + +void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, + const POINTonE1_affine *p) +{ + unsigned int n; + + if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) + return; + + n = ctx->nelems; + vec_copy(ctx->Q + n, q, sizeof(*q)); + vec_copy(ctx->P + n, p, sizeof(*p)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; +} + +vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) +{ + PAIRING_Commit(ctx); + return (vec384fp12 *)ctx->GT; +} + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1 P[1]; + BLST_ERROR ret; + + ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE1_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) + vec_copy(out, P, sizeof(P)); + else + POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2 P[1]; + BLST_ERROR ret; + + ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE2_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out, P, sizeof(P)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + 
BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/blst/asm/add_mod_256-armv8.pl b/blst/asm/add_mod_256-armv8.pl new file mode 100755 index 0000000..34d9145 --- /dev/null +++ b/blst/asm/add_mod_256-armv8.pl @@ -0,0 +1,412 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..7)); +@a=map("x$_",(8..11)); +@b=map("x$_",(12..15)); +@t=map("x$_",(16,17,1..3)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + adds @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + adcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + + adds @a[0],@b[0],@b[0] + ldp @mod[0],@mod[1],[$b_ptr] + adcs @a[1],@b[1],@b[1] + ldp @mod[2],@mod[3],[$b_ptr,#16] + adcs @a[2],@b[2],@b[2] + adcs @a[3],@b[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs 
xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_lshift_mod_256: + adds @a[0],@a[0],@a[0] + sub $b_ptr,$b_ptr,#1 + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adc @t[4],xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + + cbnz $b_ptr,.Loop_lshift_mod_256 + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_rshift: + adds @b[0],@a[0],@mod[0] + sub $b_ptr,$b_ptr,#1 + adcs @b[1],@a[1],@mod[1] + adcs @b[2],@a[2],@mod[2] + adcs @b[3],@a[3],@mod[3] + adc @t[4],xzr,xzr + tst @a[0],#1 + + csel @b[0],@b[0],@a[0],ne + csel @b[1],@b[1],@a[1],ne + csel @b[2],@b[2],@a[2],ne + csel @b[3],@b[3],@a[3],ne + csel @t[4],@t[4],xzr,ne + + extr @a[0],@b[1],@b[0],#1 + extr @a[1],@b[2],@b[1],#1 + extr @a[2],@b[3],@b[2],#1 + extr @a[3],@t[4],@b[3],#1 + + cbnz $b_ptr,.Loop_rshift + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @b[0],@mod[0],@a[0] + ldp @mod[2],@mod[3],[$n_ptr,#16] + orr @mod[0],@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr @mod[1],@a[2],@a[3] + sbcs @b[2],@mod[2],@a[2] + orr @t[4],@mod[0],@mod[1] + sbc @b[3],@mod[3],@a[3] + + cmp @t[4],#0 + csetm @t[4],ne + ands $b_ptr,$b_ptr,@t[4] + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@b[3],eq + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + stp @a[0],@a[1],[$r_ptr] + adc @a[3],@a[3],@mod[3] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + 
rev @a[3],@a[3] +#endif + + subs xzr,@a[0],@mod[0] + sbcs xzr,@a[1],@mod[1] + orr @a[0],@a[0],@a[1] + sbcs xzr,@a[2],@mod[2] + orr @a[0],@a[0],@a[2] + sbcs xzr,@a[3],@mod[3] + orr @a[0],@a[0],@a[3] + sbc $a_ptr,xzr,xzr + + cmp @a[0],#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,$a_ptr + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + adds @a[0],@a[0],@b[0] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[1],@a[1],@b[1] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + adc @a[3],@a[3],@mod[3] + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ + +print $code; + +close STDOUT; diff --git a/blst/asm/add_mod_256-x86_64.pl b/blst/asm/add_mod_256-x86_64.pl new file mode 100755 index 0000000..1d656fb --- /dev/null +++ b/blst/asm/add_mod_256-x86_64.pl @@ -0,0 +1,547 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits add +my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,\@function,4,"unwind" +.align 32 +add_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loaded_a_add_mod_256: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_256,.-add_mod_256 + +######################################################################## +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,\@function,3,"unwind" +.align 32 +mul_by_3_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org,$n_ptr + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $a_ptr,$b_org + mov 8*3($a_ptr), @acc[3] + + call __lshift_mod_256 + mov 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,\@abi-omnipotent +.align 32 +__lshift_mod_256: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + mov @acc[0], @acc[4] + adc @acc[2], @acc[2] + mov @acc[1], @acc[5] + adc @acc[3], @acc[3] + sbb @acc[8], @acc[8] + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, @acc[8] + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + cmovc @acc[6], @acc[2] + cmovc @acc[7], @acc[3] + + ret +.size __lshift_mod_256,.-__lshift_mod_256 + +######################################################################## +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,\@function,4,"unwind" +.align 32 +lshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 
+.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_lshift_mod_256: + call __lshift_mod_256 + dec %edx + jnz .Loop_lshift_mod_256 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + +######################################################################## +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,\@function,4,"unwind" +.align 32 +rshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[7] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_rshift_mod_256: + mov @acc[7], @acc[0] + and \$1, @acc[7] + mov 8*0($n_ptr), @acc[4] + neg @acc[7] + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + + and @acc[7], @acc[4] + and @acc[7], @acc[5] + and @acc[7], @acc[6] + and 8*3($n_ptr), @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + sbb @acc[4], @acc[4] + + shr \$1, @acc[0] + mov @acc[1], @acc[7] + shr \$1, @acc[1] + mov @acc[2], @acc[6] + shr \$1, @acc[2] + mov @acc[3], @acc[5] + shr \$1, @acc[3] + + shl \$63, @acc[7] + shl \$63, @acc[6] + or @acc[0], @acc[7] + shl \$63, @acc[5] + or @acc[6], @acc[1] + shl \$63, @acc[4] + or @acc[5], @acc[2] + or @acc[4], @acc[3] + + dec %edx + jnz .Loop_rshift_mod_256 + + mov @acc[7], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + +######################################################################## +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,\@function,4,"unwind" +.align 32 +cneg_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[8] # load a[0:3] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov @acc[8], @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], @acc[8] + or @acc[2], @acc[8] + or @acc[3], @acc[8] + mov \$-1, @acc[7] + + mov 8*0($n_ptr), @acc[4] # load n[0:3] + cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + and @acc[8], @acc[4] # n[0:3] &= mask + mov 8*3($n_ptr), @acc[7] + and @acc[8], @acc[5] + and @acc[8], @acc[6] + and @acc[8], @acc[7] + + sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 + sbb @acc[1], @acc[5] + sbb @acc[2], @acc[6] + sbb @acc[3], @acc[7] + + or $b_org, $b_org # check condition flag + + cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] + cmovz @acc[1], @acc[5] + mov @acc[4], 8*0($r_ptr) + cmovz @acc[2], @acc[6] + mov @acc[5], 8*1($r_ptr) + cmovz @acc[3], @acc[7] + mov @acc[6], 8*2($r_ptr) + mov @acc[7], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + +######################################################################## +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,\@function,4,"unwind" +.align 32 +sub_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + +######################################################################## +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,\@function,2,"unwind" +.align 32 +check_mod_256: +.cfi_startproc + mov 8*0($r_ptr), %rax + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + + mov %rax, @acc[0] # see if it's zero + or @acc[1], %rax + or @acc[2], %rax + or @acc[3], %rax + + sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
+ sbb 8*1($a_ptr), @acc[1] + sbb 8*2($a_ptr), @acc[2] + sbb 8*3($a_ptr), @acc[3] + sbb $a_ptr, $a_ptr + + mov \$1, %rdx + cmp \$0, %rax + cmovne %rdx, %rax + and $a_ptr, %rax +.cfi_epilogue + ret +.cfi_endproc +.size check_mod_256,.-check_mod_256 + +######################################################################## +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,\@function,4,"unwind" +.align 32 +add_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + +######################################################################## +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,\@function,4,"unwind" +.align 32 +sub_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/add_mod_384-armv8.pl b/blst/asm/add_mod_384-armv8.pl new file mode 100755 index 0000000..c6b2a53 --- /dev/null +++ b/blst/asm/add_mod_384-armv8.pl @@ -0,0 +1,872 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..9)); +@a=map("x$_",(10..15)); +@b=map("x$_",(16,17,19..22)); +$carry=$n_ptr; + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + +__add_mod_384_ab_are_loaded: + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adcs @a[4],@a[4],@b[4] + adcs @a[5],@a[5],@b[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_rshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __rshift_mod_384 + cbnz $b_ptr,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx @b[5],@a[0],#0,#1 + and @b[0],@b[5],@mod[0] + and @b[1],@b[5],@mod[1] + adds @a[0],@a[0],@b[0] + and @b[2],@b[5],@mod[2] + adcs @a[1],@a[1],@b[1] + and @b[3],@b[5],@mod[3] + adcs @a[2],@a[2],@b[2] + and @b[4],@b[5],@mod[4] + adcs @a[3],@a[3],@b[3] + and @b[5],@b[5],@mod[5] + adcs @a[4],@a[4],@b[4] + extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 + adcs @a[5],@a[5],@b[5] + extr @a[1],@a[2],@a[1],#1 + adc @b[5],xzr,xzr + extr @a[2],@a[3],@a[2],#1 + extr @a[3],@a[4],@a[3],#1 + extr @a[4],@a[5],@a[4],#1 + extr @a[5],@b[5],@a[5],#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_lshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __lshift_mod_384 + cbnz $b_ptr,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr,#48] + ldp @b[2],@b[3],[$a_ptr,#64] + ldp @b[4],@b[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @mod[2],@mod[3],[$n_ptr,#16] + + subs @b[0],@mod[0],@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @mod[4],@mod[5],[$n_ptr,#32] + orr $carry,@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr $carry,$carry,@a[2] + sbcs @b[2],@mod[2],@a[2] + orr $carry,$carry,@a[3] + sbcs @b[3],@mod[3],@a[3] + orr $carry,$carry,@a[4] + sbcs @b[4],@mod[4],@a[4] + orr $carry,$carry,@a[5] + sbc @b[5],@mod[5],@a[5] + + cmp $carry,#0 + csetm $carry,ne + ands $b_ptr,$b_ptr,$carry + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + csel @a[3],@a[3],@b[3],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[4],@a[4],@b[4],eq + stp @a[2],@a[3],[$r_ptr,#16] + csel @a[5],@a[5],@b[5],eq + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + sbcs @a[2],@a[2],@b[2] + sbcs @a[3],@a[3],@b[3] + sbcs @a[4],@a[4],@b[4] + sbcs @a[5],@a[5],@b[5] + sbc $carry,xzr,xzr + + and @b[0],@mod[0],$carry + and @b[1],@mod[1],$carry + adds @a[0],@a[0],@b[0] + and @b[2],@mod[2],$carry + adcs @a[1],@a[1],@b[1] + and @b[3],@mod[3],$carry + adcs @a[2],@a[2],@b[2] + and @b[4],@mod[4],$carry + adcs @a[3],@a[3],@b[3] + and @b[5],@mod[5],$carry + adcs @a[4],@a[4],@b[4] + adc @a[5],@a[5],@b[5] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + add $b_ptr,$a_ptr,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $carry,$carry,xzr + + mvn $carry,$carry + and $carry,$carry,#2 + orr $r_ptr,$r_ptr,$carry + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + ldp @a[0],@a[1],[$r_ptr,#48] + ldp @a[2],@a[3],[$r_ptr,#64] + ldp @a[4],@a[5],[$r_ptr,#80] + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $b_ptr,$b_ptr,@b[0] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr 
$a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $r_ptr,$r_ptr,@b[0] + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +if (1) { +sub vec_select { +my $sz = shift; +my @v=map("v$_",(0..5,16..21)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,%function +.align 5 +vec_select_$sz: + dup v6.2d, $n_ptr + ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 +___ +for($i=0; $i<$sz-48; $i+=48) { +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 + bit @v[1].16b, @v[4].16b, v6.16b + ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 +___ + @v = @v[6..11,0..5]; +} +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + bit @v[1].16b, @v[4].16b, v6.16b + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end, $step) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add $end, $end, $inp + sub $end, $end, #1 + mov $step, #64 + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + prfm pldl1keep, [$inp] + ret +.size vec_prefetch,.-vec_prefetch +___ +} + +print $code; + +close STDOUT; diff --git a/blst/asm/add_mod_384-x86_64.pl b/blst/asm/add_mod_384-x86_64.pl new file mode 100755 index 0000000..88dde45 --- /dev/null +++ b/blst/asm/add_mod_384-x86_64.pl @@ -0,0 +1,1430 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 384 bits add +my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); + push(@acc, $a_ptr); + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,\@function,4,"unwind" +.align 32 +add_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__add_mod_384_a_is_loaded: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,\@function,4,"unwind" +.align 32 +add_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 
24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + +######################################################################## +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,\@function,4,"unwind" +.align 32 +rshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_rshift_mod_384: + call __rshift_mod_384 + dec %edx + jnz .Loop_rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,\@abi-omnipotent +.align 32 +__rshift_mod_384: + mov \$1, @acc[11] + mov 8*0($n_ptr), @acc[6] + and @acc[0], @acc[11] + mov 8*1($n_ptr), @acc[7] + neg @acc[11] + mov 8*2($n_ptr), @acc[8] + and @acc[11], @acc[6] + mov 8*3($n_ptr), @acc[9] + and @acc[11], @acc[7] + mov 8*4($n_ptr), @acc[10] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], @acc[10] + adc @acc[5], @acc[11] + sbb @acc[5], @acc[5] + + shr \$1, @acc[6] + mov @acc[7], @acc[0] + shr \$1, @acc[7] + mov @acc[8], @acc[1] + shr \$1, @acc[8] + mov @acc[9], @acc[2] + shr \$1, @acc[9] + mov @acc[10], @acc[3] + shr \$1, @acc[10] + mov @acc[11], @acc[4] + shr \$1, @acc[11] + shl \$63, @acc[0] + shl \$63, @acc[1] + or @acc[6], @acc[0] + shl \$63, @acc[2] + or @acc[7], @acc[1] + shl \$63, @acc[3] + or @acc[8], @acc[2] + shl \$63, @acc[4] + or @acc[9], @acc[3] + shl \$63, @acc[5] + or @acc[10], @acc[4] + or @acc[11], @acc[5] + + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,\@function,3,"unwind" +.align 32 +div_by_2_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov $b_org, $n_ptr + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + call __rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp 
+.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + +######################################################################## +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,\@function,4,"unwind" +.align 32 +lshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $r_ptr, $r_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov (%rsp), $r_ptr + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + dec %edx + jnz .Loop_lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,\@abi-omnipotent +.align 32 +__lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +######################################################################## +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 
24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +######################################################################## +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov (%rsp), $a_ptr + lea 8*6($r_ptr), $r_ptr + + mov 8*6($a_ptr), @acc[0] + mov 8*7($a_ptr), @acc[1] + mov 8*8($a_ptr), @acc[2] + mov 8*9($a_ptr), @acc[3] + mov 8*10($a_ptr), @acc[4] + mov 8*11($a_ptr), @acc[5] + + call __lshift_mod_384 + + mov \$8*6, $b_org + add (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov (%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov 
@acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 48+8*0($a_ptr), @acc[0] + mov 48+8*1($a_ptr), @acc[1] + mov 48+8*2($a_ptr), @acc[2] + mov 48+8*3($a_ptr), @acc[3] + mov 48+8*4($a_ptr), @acc[4] + mov 48+8*5($a_ptr), @acc[5] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 48+8*0($r_ptr) + mov @acc[1], 48+8*1($r_ptr) + mov @acc[2], 48+8*2($r_ptr) + mov @acc[3], 48+8*3($r_ptr) + mov @acc[4], 48+8*4($r_ptr) + mov @acc[5], 48+8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +######################################################################## +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,\@function,4,"unwind" +.align 32 +cneg_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $b_org # condition flag +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), $b_org # load a[0:5] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $b_org, @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], $b_org + mov 8*4($a_ptr), @acc[4] + or @acc[2], $b_org + mov 8*5($a_ptr), @acc[5] + or @acc[3], $b_org + mov \$-1, @acc[11] + or @acc[4], $b_org + or @acc[5], $b_org + + mov 8*0($n_ptr), @acc[6] # load n[0:5] + cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + and $b_org, @acc[6] # n[0:5] &= mask + mov 8*3($n_ptr), @acc[9] + and $b_org, @acc[7] + mov 8*4($n_ptr), @acc[10] + and $b_org, @acc[8] + mov 8*5($n_ptr), @acc[11] + and $b_org, @acc[9] + mov 0(%rsp), $n_ptr # restore condition flag + and $b_org, @acc[10] + and $b_org, @acc[11] + + sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 + sbb @acc[1], @acc[7] + sbb @acc[2], @acc[8] + sbb @acc[3], @acc[9] + sbb @acc[4], @acc[10] + sbb @acc[5], @acc[11] + + or $n_ptr, $n_ptr # check condition flag + + cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] + cmovz @acc[1], @acc[7] + cmovz @acc[2], @acc[8] + mov @acc[6], 8*0($r_ptr) + cmovz @acc[3], @acc[9] + mov @acc[7], 8*1($r_ptr) + cmovz @acc[4], @acc[10] + mov @acc[8], 8*2($r_ptr) + cmovz @acc[5], @acc[11] + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + +######################################################################## +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,\@function,4,"unwind" +.align 32 +sub_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,\@function,4,"unwind" +.align 32 +sub_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 
24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +___ +} +{ ###################################################### ret = a * (1 + i) +my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); +my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); + +$code.=<<___; +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$56, %rsp +.cfi_adjust_cfa_offset 56 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + mov $r_ptr, 8*6(%rsp) # offload r_ptr + sbb $r_ptr, $r_ptr + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $a_ptr, $a_ptr + + mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1(%rsp) + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2(%rsp) + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3(%rsp) + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4(%rsp) + and $a_ptr, @acc[0] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5(%rsp) + and $a_ptr, @acc[1] + mov 8*5($n_ptr), @acc[5] + and $a_ptr, @acc[2] + and $a_ptr, @acc[3] + and $a_ptr, @acc[4] + and $a_ptr, @acc[5] + mov 8*6(%rsp), $a_ptr # restore r_ptr + + add @acc[0], @acc[6] + mov 8*0(%rsp), @acc[0] # restore a->re + a->im + adc @acc[1], @acc[7] + mov 8*1(%rsp), @acc[1] + adc @acc[2], @acc[8] + mov 8*2(%rsp), @acc[2] + adc @acc[3], @acc[9] + mov 8*3(%rsp), @acc[3] + adc @acc[4], @acc[10] + mov 8*4(%rsp), @acc[4] + adc @acc[5], @acc[11] + mov 8*5(%rsp), @acc[5] + + mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im + mov @acc[0], @acc[6] + mov @acc[7], 8*1($a_ptr) + mov @acc[8], 8*2($a_ptr) + mov @acc[1], @acc[7] + mov @acc[9], 8*3($a_ptr) + mov @acc[10], 8*4($a_ptr) + mov @acc[2], @acc[8] + mov @acc[11], 8*5($a_ptr) + + sub 8*0($n_ptr), @acc[0] + mov @acc[3], @acc[9] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[4], @acc[10] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($a_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($a_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($a_ptr) + mov @acc[4], 8*10($a_ptr) + mov @acc[5], 8*11($a_ptr) + + mov 56+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 56+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 56+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 56+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 56+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 56+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 56+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size 
mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +___ +} +{ ###################################################### +my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); +my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); + +$code.=<<___; +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384: +.cfi_startproc +.cfi_end_prologue + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + xor %rax, %rax + mov @acc[0], $r_ptr + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, $r_ptr + and \$2, %rax + or $r_ptr, %rax # pack sign and parity + +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*6($r_ptr), @acc[0] # sgn0(a->im) + mov 8*7($r_ptr), @acc[1] + mov 8*8($r_ptr), @acc[2] + mov 8*9($r_ptr), @acc[3] + mov 8*10($r_ptr), @acc[4] + mov 8*11($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), %rax # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + mov 8*0(%rax), @acc[0] + mov 8*1(%rax), @acc[1] + mov 8*2(%rax), @acc[2] + mov 8*3(%rax), @acc[3] + mov 8*4(%rax), @acc[4] + mov 8*5(%rax), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp), %rbx +.cfi_restore %rbx + mov 16(%rsp), %rbp +.cfi_restore %rbp + lea 24(%rsp), %rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +} +if (0) { +my $inp = $win64 ? 
"%rcx" : "%rdi"; +$code.=<<___; +.globl nbits_384 +.hidden nbits_384 +.type nbits_384,\@abi-omnipotent +.align 32 +nbits_384: + mov 8*5($inp), %r8 + mov 8*4($inp), %r9 + mov 8*3($inp), %r10 + mov 8*2($inp), %r11 + mov \$-1, %rdx + mov \$127, %eax + bsr %r8, %r8 + cmovnz %rdx,%r9 + cmovz %rax,%r8 + bsr %r9, %r9 + cmovnz %rdx,%r10 + cmovz %rax,%r9 + xor \$63,%r8 + bsr %r10, %r10 + cmovnz %rdx, %r11 + cmovz %rax, %r10 + xor \$63,%r9 + add %r8, %r9 + mov 8*1($inp), %r8 + bsr %r11, %r11 + cmovnz %rdx, %r8 + cmovz %rax, %r11 + xor \$63, %r10 + add %r9, %r10 + mov 8*0($inp), %r9 + bsr %r8, %r8 + cmovnz %rdx, %r9 + cmovz %rax, %r8 + xor \$63, %r11 + add %r10, %r11 + bsr %r9, %r9 + cmovz %rax, %r9 + xor \$63, %r8 + add %r11, %r8 + xor \$63, %r9 + add %r8, %r9 + mov \$384, %eax + sub %r9, %rax + ret +.size nbits_384,.-nbits_384 +___ +} + +if (1) { +my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") + : ("%rdi", "%rsi", "%rdx", "%ecx"); + +sub vec_select { +my $sz = shift; +my $half = $sz/2; +my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,\@abi-omnipotent +.align 32 +vec_select_$sz: + movd $select, %xmm5 + pxor %xmm4,%xmm4 + pshufd \$0,%xmm5,%xmm5 # broadcast + movdqu ($inp1),$xmm0 + lea $half($inp1),$inp1 + pcmpeqd %xmm4,%xmm5 + movdqu ($inp2),$xmm1 + lea $half($inp2),$inp2 + pcmpeqd %xmm5,%xmm4 + lea $half($out),$out +___ +for($i=0; $i<$sz-16; $i+=16) { +$code.=<<___; + pand %xmm4,$xmm0 + movdqu $i+16-$half($inp1),$xmm2 + pand %xmm5,$xmm1 + movdqu $i+16-$half($inp2),$xmm3 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) +___ + ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); +} +$code.=<<___; + pand %xmm4,$xmm0 + pand %xmm5,$xmm1 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,\@abi-omnipotent +.align 32 +vec_prefetch: + leaq -1($inp,$end), $end + mov \$64, %rax + xor %r8, %r8 + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + prefetchnta ($inp) + ret +.size vec_prefetch,.-vec_prefetch +___ +} +print $code; +close STDOUT; diff --git a/blst/asm/add_mod_384x384-x86_64.pl b/blst/asm/add_mod_384x384-x86_64.pl new file mode 100755 index 0000000..6ee3cf8 --- /dev/null +++ b/blst/asm/add_mod_384x384-x86_64.pl @@ -0,0 +1,260 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +############################################################ 384x384 add/sub +# Double-width addition/subtraction modulo n<<384, as opposite to +# naively expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +{ +my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +.type __add_mod_384x384,\@abi-omnipotent +.align 32 +__add_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + add 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + adc 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + adc 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + adc 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + adc 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + adc 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + adc 8*6($b_org), @acc[6] + mov @acc[1], 8*1($r_ptr) + adc 8*7($b_org), @acc[7] + mov @acc[2], 8*2($r_ptr) + adc 8*8($b_org), @acc[8] + mov @acc[4], 8*4($r_ptr) + mov @acc[6], @acc[0] + adc 8*9($b_org), @acc[9] + mov @acc[3], 8*3($r_ptr) + mov @acc[7], @acc[1] + adc 8*10($b_org), @acc[10] + mov @acc[5], 8*5($r_ptr) + mov @acc[8], @acc[2] + adc 8*11($b_org), @acc[11] + mov @acc[9], @acc[3] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[10], @acc[4] + sbb 8*2($n_ptr), @acc[8] + sbb 8*3($n_ptr), @acc[9] + sbb 8*4($n_ptr), @acc[10] + mov @acc[11], @acc[5] + sbb 8*5($n_ptr), @acc[11] + sbb \$0, $b_org + + cmovc @acc[0], @acc[6] + cmovc @acc[1], @acc[7] + cmovc @acc[2], @acc[8] + mov @acc[6], 8*6($r_ptr) + cmovc @acc[3], @acc[9] + mov @acc[7], 8*7($r_ptr) + cmovc @acc[4], @acc[10] + mov @acc[8], 8*8($r_ptr) + cmovc @acc[5], @acc[11] + mov @acc[9], 8*9($r_ptr) + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 
8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,\@function,4,"unwind" +.align 32 +add_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,\@function,4,"unwind" +.align 32 +sub_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/arm-xlate.pl b/blst/asm/arm-xlate.pl new file mode 100755 index 0000000..5028a62 --- /dev/null +++ b/blst/asm/arm-xlate.pl @@ -0,0 +1,381 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ARM assembler distiller/adapter by \@dot-asm. + +use strict; + +################################################################ +# Recognized "flavour"-s are: +# +# linux[32|64] GNU assembler, effectively pass-through +# ios[32|64] global symbols' decorations, PIC tweaks, etc. +# win[32|64] Visual Studio armasm-specific directives +# coff[32|64] e.g. clang --target=arm-windows ... +# +my $flavour = shift; + $flavour = "linux" if (!$flavour or $flavour eq "void"); + +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; +my $in_proc; # used with 'windows' flavour + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch +my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu + +my $rodata = sub { + SWITCH: for ($flavour) { + /linux/ && return ".section\t.rodata"; + /ios/ && return ".section\t__TEXT,__const"; + /coff/ && return ".section\t.rdata,\"dr\""; + /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; + last; + } +}; + +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0\n"; + $ret .= ".previous"; + $name = "_$name"; + } elsif ($flavour =~ /win/) { + $ret = "\tCOMMON\t|$name|,@args[1]"; + } elsif ($flavour =~ /coff/) { + $ret = ".comm\t$name,@args[1]"; + } else { + $ret = ".comm\t".join(',',@args); + } + + $$global = $name; + $ret; +}; + +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; last; }; + /win/ && do { $ret = ""; last; }; + } + + $ret = ".globl $name" if (!defined($ret)); + $$global = $name; + $ret; +}; +my $global = $globl; + +my $extern = sub { + &$globl(@_); + if ($flavour =~ /win/) { + return "\tEXTERN\t@_"; + } + return; # return nothing +}; + +my $type = sub { + my $arg = join(',',@_); + my $ret; + + SWITCH: for ($flavour) { + /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = "#ifdef __thumb2__\n" . + ".thumb_func $1\n" . + "#endif"; + } + last; + }; + /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { + my $type = "[DATA]"; + if ($2 eq "function") { + $in_proc = $1; + $type = "[FUNC]"; + } + $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" + : ""; + } + last; + }; + /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = ".def $1;\n". + ".type 32;\n". + ".endef"; + } + last; + }; + } + return $ret; +} if ($flavour !~ /linux/); + +my $size = sub { + if ($in_proc && $flavour =~ /win/) { + $in_proc = undef; + return "\tENDP"; + } +} if ($flavour !~ /linux/); + +my $inst = sub { + if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } + else { ".long\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { if ($flavour =~ /win/) { + "\tDCB\t$line,0\n\tALIGN\t4"; + } else { + ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; + } + } else { ""; } +}; + +my $align = sub { + "\tALIGN\t".2**@_[0]; +} if ($flavour =~ /win/); + $align = sub { + ".p2align\t".@_[0]; +} if ($flavour =~ /coff/); + +my $byte = sub { + "\tDCB\t".join(',',@_); +} if ($flavour =~ /win/); + +my $short = sub { + "\tDCWU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $word = sub { + "\tDCDU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $long = $word if ($flavour =~ /win/); + +my $quad = sub { + "\tDCQU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $skip = sub { + "\tSPACE\t".shift; +} if ($flavour =~ /win/); + +my $code = sub { + "\tCODE@_[0]"; +} if ($flavour =~ /win/); + +my $thumb = sub { # .thumb should appear prior .text in source + "# define ARM THUMB\n" . + "\tTHUMB"; +} if ($flavour =~ /win/); + +my $text = sub { + "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); +} if ($flavour =~ /win/); + +my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax + +my $rva = sub { + # .rva directive comes in handy only on 32-bit Windows, i.e. it can + # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. + # However! Corresponding compilers don't seem to bet on PIC, which + # raises the question why would assembler programmer have to jump + # through the hoops? But just in case, it would go as following: + # + # ldr r1,.LOPENSSL_armcap + # ldr r2,.LOPENSSL_armcap+4 + # adr r0,.LOPENSSL_armcap + # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas + # sub r0,r0,r1 ; r0 is image base now + # ldr r0,[r0,r2] + # ... + #.LOPENSSL_armcap: + # .rva .LOPENSSL_armcap ; self-reference + # .rva OPENSSL_armcap_P ; real target + # + # Non-position-independent [and ISA-neutral] alternative is so much + # simpler: + # + # ldr r0,.LOPENSSL_armcap + # ldr r0,[r0] + # ... + #.LOPENSSL_armcap: + # .long OPENSSL_armcap_P + # + "\tDCDU\t@_[0]\n\tRELOC\t2" +} if ($flavour =~ /win(?!64)/); + +################################################################ +# some broken instructions in Visual Studio armasm[64]... + +my $it = sub {} if ($flavour =~ /win32/); # omit 'it' + +my $ext = sub { + "\text8\t".join(',',@_); +} if ($flavour =~ /win64/); + +my $csel = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsel$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +my $csetm = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsetm$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +# ... then conditional branch instructions are also broken, but +# maintaining all the variants is tedious, so I kludge-fix it +# elsewhere... +################################################################ +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + +my $paciasp = sub { + ($flavour =~ /linux/) ? "\t.inst\t0xd503233f" + : &$inst(0xd503233f); +}; + +my $autiasp = sub { + ($flavour =~ /linux/) ? 
"\t.inst\t0xd50323bf" + : &$inst(0xd50323bf); +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + if ($flavour =~ /win/) { + # adjust alignment hints, "[rN,:32]" -> "[rN@32]" + $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; + # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" + $line =~ s/\.(L\w{2,})/|\$$1|/g; + # omit "#:lo12:" on win64 + $line =~ s/#:lo12://; + } elsif ($flavour =~ /coff(?!64)/) { + $line =~ s/\.L(\w{2,})/(\$ML$1)/g; + } elsif ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + + return $line; +} + +while(my $line=<>) { + + # fix up assembler-specific commentary delimiter + $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); + + if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + $label = ($GLOBALS{$label} or $label); + if ($flavour =~ /win/) { + $label =~ s|^\.L(?=\w)|\$L|; + printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); + } else { + $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); + printf "%s:", $label; + } + } + } + + if ($line !~ m/^[#@;]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + if ($flavour =~ /win64/) { + # "b.cond" -> "bcond", kludge-fix:-( + $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; + } + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print "\tEND\n" if ($flavour =~ /win/); + +close STDOUT; diff --git a/blst/asm/ct_inverse_mod_256-armv8.pl b/blst/asm/ct_inverse_mod_256-armv8.pl new file mode 100755 index 0000000..ced8c6c --- /dev/null +++ b/blst/asm/ct_inverse_mod_256-armv8.pl @@ -0,0 +1,586 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - +# on Cortex-A57. 
+# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(4..11)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); +my $cnt = $n_ptr; +my @t = map("x$_",(19..26)); +my ($a_lo, $b_lo) = @acc[3,7]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + paciasp + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
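+	// the 512-byte-aligned scratch area holds two |a|b|u|v| buffers 256 bytes
+	// apart; the iterations below ping-pong between them via "eor ..., #256"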
+ str $out_ptr, [sp] + + ldp @acc[4], @acc[5], [$n_ptr,#8*0] + ldp @acc[6], @acc[7], [$n_ptr,#8*2] + + stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| + stp @acc[6], @acc[7], [$in_ptr,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str $f0, [$out_ptr,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr @acc[4], [$in_ptr,#8*8] // |u| + ldr @acc[5], [$in_ptr,#8*13] // |v| + madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| + madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| + str @acc[0], [$out_ptr,#8*4] + asr @acc[1], @acc[0], #63 // sign extenstion + stp @acc[1], @acc[1], [$out_ptr,#8*5] + stp @acc[1], @acc[1], [$out_ptr,#8*7] + + madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| + madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| + str @acc[0], [$out_ptr,#8*9] + asr @acc[1], @acc[0], #63 // sign extenstion + stp @acc[1], @acc[1], [$out_ptr,#8*10] + stp @acc[1], @acc[1], [$out_ptr,#8*12] +___ +for($i=2; $i<15; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add $out_ptr, $out_ptr, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc @t[3], @t[3], @t[4] + str @t[3], [$out_ptr,#8*4] + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + add $out_ptr, $out_ptr, #8*5 // pointer to destination |v| + bl __smul_256x63 +___ +$code.=<<___ if ($i>7); + bl __smul_512x63_tail +___ +$code.=<<___ if ($i<=7); + adc @t[3], @t[3], @t[4] + stp @t[3], @t[3], [$out_ptr,#8*4] + stp @t[3], @t[3], [$out_ptr,#8*6] +___ +} +$code.=<<___; + ////////////////////////////////////////// two[!] 
last iterations + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + ldr $b_lo, [$in_ptr,#8*4] + bl __inner_loop_62_256 + + mov $f_, $f1 + mov $g_, $g1 + ldr $out_ptr, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh @t[1], @acc[3], $g_ // figure out top-most limb + ldp @acc[4], @acc[5], [$nx_ptr,#8*0] + adc @t[4], @t[4], @t[6] + ldp @acc[6], @acc[7], [$nx_ptr,#8*2] + + add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 + asr @t[0], @t[1], #63 // sign as mask + + and @t[4], @acc[4], @t[0] // add mod<<256 conditionally + and @t[5], @acc[5], @t[0] + adds @acc[0], @acc[0], @t[4] + and @t[6], @acc[6], @t[0] + adcs @acc[1], @acc[1], @t[5] + and @t[7], @acc[7], @t[0] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @t[3], @t[7] + adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 + + neg @t[0], @t[1] + orr @t[1], @t[1], @t[0] // excess bit or sign as mask + asr @t[0], @t[0], #63 // excess bit as mask + + and @acc[4], @acc[4], @t[1] // mask |mod| + and @acc[5], @acc[5], @t[1] + and @acc[6], @acc[6], @t[1] + and @acc[7], @acc[7], @t[1] + + eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| + eor @acc[5], @acc[5], @t[0] + adds @acc[4], @acc[4], @t[0], lsr#63 + eor @acc[6], @acc[6], @t[0] + adcs @acc[5], @acc[5], xzr + eor @acc[7], @acc[7], @t[0] + adcs @acc[6], @acc[6], xzr + adc @acc[7], @acc[7], xzr + + adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adc @acc[3], @acc[3], @acc[7] + stp @acc[2], @acc[3], [$out_ptr,#8*6] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + autiasp + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*8+8*5*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldr @t[3+$j], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @t[3+$j], @t[3+$j], $f1 + umulh @t[0], @acc[0], $f_ + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $f_ + adcs @t[3+$j], @t[3+$j], xzr + umulh @t[2], @acc[2], $f_ +___ +$code.=<<___ if ($j!=0); + adc $g1, xzr, xzr // used in __smul_512x63_tail +___ +$code.=<<___; + mul @acc[0], @acc[0], $f_ + cmp $f_, #0 + mul @acc[1], @acc[1], $f_ + csel @t[3+$j], @t[3+$j], xzr, ne + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @t[5+$j], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[5+$j], @t[5+$j], @t[2] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @t[5], @t[5], @t[6] + 
stp @acc[2], @t[5], [$out_ptr,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh @t[5], @acc[3], $f_ + ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| + adc @t[7], @t[7], xzr + ldr @acc[3], [$in_ptr,#8*20] + and @t[3], @t[3], $f_ + + umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain + + sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain + asr @t[6], @t[5], #63 + + eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| + eor @acc[2], @acc[2], $f1 + adds @acc[1], @acc[1], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + umulh @t[0], @t[4], $g_ + adc @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $g_ + add @acc[7], @acc[7], @t[7] + umulh @t[2], @acc[2], $g_ + + mul @acc[0], @t[4], $g_ + mul @acc[1], @acc[1], $g_ + adds @acc[0], @acc[0], @acc[7] + mul @acc[2], @acc[2], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @t[3], @acc[3], $g_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[3], @t[3], @t[2] + adc @t[4], xzr, xzr // used in the final step + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adcs @t[3], @t[3], @t[6] // carry is used in the final step + stp @acc[2], @t[3], [$out_ptr,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*4*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) + + eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) + sub @t[6], @t[6], @t[5] + eor @acc[1], @acc[1], @t[5] + adds @acc[0], @acc[0], @t[5], lsr#63 + eor @acc[2], @acc[2], @t[5] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[5] + umulh @t[0], @acc[0], @t[6] + adcs @acc[2], @acc[2], xzr + umulh @t[1], @acc[1], @t[6] + adc @acc[3], @acc[3], xzr + umulh @t[2], @acc[2], @t[6] + and @t[5], @t[5], @t[6] + umulh @t[3+$j], @acc[3], @t[6] + neg @t[5], @t[5] + + mul @acc[0], @acc[0], @t[6] + mul @acc[1], @acc[1], @t[6] + mul @acc[2], @acc[2], @t[6] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], @t[1] + adcs @acc[3], @acc[3], @t[2] + adc @t[3+$j], @t[3+$j], @t[5] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + adcs @acc[3], @acc[3], @acc[7] + adc @acc[4], @t[3], @t[4] + + extr @acc[0], @acc[1], @acc[0], #31 + extr @acc[1], @acc[2], @acc[1], #31 + extr @acc[2], @acc[3], @acc[2], #31 + asr @t[4], @acc[4], #63 // result's sign as mask + extr @acc[3], @acc[4], @acc[3], #31 + + eor @acc[0], @acc[0], @t[4] // ensure the result is positive + eor @acc[1], @acc[1], @t[4] + adds @acc[0], @acc[0], @t[4], lsr#63 + eor @acc[2], @acc[2], @t[4] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[4] + adcs @acc[2], @acc[2], xzr + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adc @acc[3], @acc[3], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + + eor $f0, $f0, @t[4] // adjust |f/g| accordingly + eor $g0, $g0, @t[4] + sub $f0, $f0, @t[4] + sub $g0, $g0, @t[4] + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +___ + +{ +my @a = @acc[0..3]; +my @b = @acc[4..7]; +my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); + 
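
The __ab_approximation_31_256 routine emitted below implements the packing step that the python_ref models as a_ = (a & mask) | ((a >> (n-k-2)) << k): the low 31 bits are kept exactly, and roughly the top 33 bits of the longer operand are placed right above them, so each approximation fits a single 64-bit register. A minimal Python sketch of that idea (the helper name is illustrative only):

    def ab_approximation_31(a, b, k=31):
        # keep the low k bits exactly; pack the top bits of the longer
        # of |a| and |b| directly above them (31 + 33 = 64 bits total)
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            return a, b
        mask = (1 << k) - 1
        top = n - k - 2
        return ((a & mask) | ((a >> top) << k),
                (b & mask) | ((b >> top) << k))
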
+$code.=<<___; +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*6] + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*4] + +.Lab_approximation_31_256_loaded: + orr @t[0], @a[3], @b[3] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[1], ne + orr @t[0], @a[3], @b[3] // and ones before top-most, ... + csel @b[2], @b[2], @b[1], ne + + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[0], ne + orr @t[0], @a[3], @b[3] // and one more, ... + csel @b[2], @b[2], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + neg @t[1], @t[0] + + lslv @a[3], @a[3], @t[0] // align high limbs to the left + lslv @b[3], @b[3], @t[0] + lsrv @a[2], @a[2], @t[1] + lsrv @b[2], @b[2], @t[1] + and @a[2], @a[2], @t[1], asr#6 + and @b[2], @b[2], @t[1], asr#6 + orr $a_lo, @a[3], @a[2] + orr $b_lo, @b[3], @b[2] + + bfxil $a_lo, @a[0], #0, #31 + bfxil $b_lo, @b[0], #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov $cnt, #31 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $fg1 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + cbnz $cnt, .Loop_31_256 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov @t[1], $g0 + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $f1, @t[3] + and @t[1], $g1, @t[3] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz $cnt, .Loop_62_256
+
+ ret
+.size __inner_loop_62_256,.-__inner_loop_62_256
+___
+}
+
+foreach(split("\n",$code)) {
+ s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/blst/asm/ct_inverse_mod_256-x86_64.pl b/blst/asm/ct_inverse_mod_256-x86_64.pl
new file mode 100755
index 0000000..24ab545
--- /dev/null
+++ b/blst/asm/ct_inverse_mod_256-x86_64.pl
@@ -0,0 +1,837 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake.
+#
+# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
+# const vec256 modx);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_256(inp, mod):
+ a, u = inp, 1
+ b, v = mod, 0
+
+ k = 31
+ mask = (1 << k) - 1
+
+ for i in range(0, 512 // k - 1):
+ # __ab_approximation_31
+ n = max(a.bit_length(), b.bit_length())
+ if n < 64:
+ a_, b_ = a, b
+ else:
+ a_ = (a & mask) | ((a >> (n-k-2)) << k)
+ b_ = (b & mask) | ((b >> (n-k-2)) << k)
+
+ # __inner_loop_31
+ f0, g0, f1, g1 = 1, 0, 0, 1
+ for j in range(0, k):
+ if a_ & 1:
+ if a_ < b_:
+ a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+ a_, f0, g0 = a_-b_, f0-f1, g0-g1
+ a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+ # __smulq_256_n_shift_by_31
+ a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+ if a < 0:
+ a, f0, g0 = -a, -f0, -g0
+ if b < 0:
+ b, f1, g1 = -b, -f1, -g1
+
+ # __smulq_512x63
+ u, v = u*f0 + v*g0, u*f1 + v*g1
+
+ if 512 % k + k:
+ f0, g0, f1, g1 = 1, 0, 0, 1
+ for j in range(0, 512 % k + k):
+ if a & 1:
+ if a < b:
+ a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+ a, f0, g0 = a-b, f0-f1, g0-g1
+ a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+ v = u*f1 + v*g1
+
+ mod <<= 512 - mod.bit_length() # align to the left
+ if v < 0:
+ v += mod
+ if v < 0:
+ v += mod
+ elif v == 1<<512:
+ v -= mod
+
+ return v & (2**512 - 1) # to be reduced % mod
+___
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc = map("%r$_",(8..15));
+my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
+my $cnt = "%edx";
+
+$frame = 8*6+2*512;
+
+$code.=<<___;
+.text
+
+.globl ct_inverse_mod_256
+.type ct_inverse_mod_256,\@function,4,"unwind"
+.align 32
+ct_inverse_mod_256:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ sub \$$frame, %rsp
+.cfi_adjust_cfa_offset $frame
+.cfi_end_prologue
+
+ lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot
+ and \$-512, %rax # in the frame...
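+ # The aligned scratch area holds two |a|b|u|v| workspaces 256 bytes
+ # apart; the "xor \$256" operations below flip-flop between the half
+ # being read and the half being written on each iteration.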
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + + mov 8*0($n_ptr), @acc[4] # load modulus + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + mov 8*3($n_ptr), @acc[7] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + + mov @acc[4], 8*4(%rax) # copy modulus to |b| + mov @acc[5], 8*5(%rax) + mov @acc[6], 8*6(%rax) + mov @acc[7], 8*7(%rax) + mov %rax, $in_ptr + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*0(%rsp) # corrected |f0| + #mov $g0, 8*1(%rsp) # corrected |g0| + mov $f0, 8*8($out_ptr) # initialize |u| with |f0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + mov $f0, 8*9($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*8($in_ptr), @acc[0] # |u| + mov 8*13($in_ptr), @acc[4] # |v| + mov @acc[0], @acc[1] + imulq 8*0(%rsp), @acc[0] # |u|*|f0| + mov @acc[4], @acc[5] + imulq 8*1(%rsp), @acc[4] # |v|*|g0| + add @acc[4], @acc[0] + mov @acc[0], 8*4($out_ptr) # destination |u| + sar \$63, @acc[0] # sign extension + mov @acc[0], 8*5($out_ptr) + mov @acc[0], 8*6($out_ptr) + mov @acc[0], 8*7($out_ptr) + mov @acc[0], 8*8($out_ptr) + lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + imulq $f0, @acc[1] # |u|*|f1| + imulq $g0, @acc[5] # |v|*|g1| + add @acc[5], @acc[1] + mov @acc[1], 8*9($out_ptr) # destination |v| + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + mov @acc[1], 8*12($out_ptr) + mov @acc[1], 8*13($out_ptr) +___ +for($i=2; $i<15; $i++) { +my $smul_512x63 = $i>8 ? 
"__smulq_512x63" + : "__smulq_256x63"; +$code.=<<___; + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + mov $f0, 8*2(%rsp) # corrected |f1| + mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*0(%rsp), $f0 # |f0| + mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*4($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_256x63 + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*5($out_ptr),$out_ptr # pointer to destination |v| + call $smul_512x63 +___ +$code.=<<___ if ($i==8); + sar \$63, %rbp # sign extension + mov %rbp, 8*5($out_ptr) + mov %rbp, 8*6($out_ptr) + mov %rbp, 8*7($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$47, $cnt # 31 + 512 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*4($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + #mov $f1, 8*2(%rsp) + #mov $g1, 8*3(%rsp) + + #mov 8*0(%rsp), $f0 # |f0| + #mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_256x63 + + #mov 8*2(%rsp), $f0 # |f1| + #mov 8*3(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original |out_ptr| + call __smulq_512x63 + adc %rbp, %rdx # the excess limb of the result + + mov 8*5(%rsp), $in_ptr # original |nx_ptr| + mov %rdx, %rax + sar \$63, %rdx # result's sign as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + add @acc[0], @acc[4] # conditionally add |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + adc \$0, %rax + + mov %rax, %rdx + neg %rax + or %rax, %rdx # excess bit or sign as mask + sar \$63, %rax # excess bit as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + xor %rax, @acc[0] # conditionally negate |modulus| + xor %rcx, %rcx + xor %rax, @acc[1] + sub %rax, %rcx + xor %rax, @acc[2] + xor %rax, %rdx + add %rcx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, %rdx + + add @acc[0], @acc[4] # final adjustment for |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + + mov @acc[4], 8*4($out_ptr) # store absolute value + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp 
+.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +$code.=<<___; +.type __smulq_512x63,\@abi-omnipotent +.align 32 +__smulq_512x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), %rbp # sign limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, %rbp + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, %rbp + + mulq %rbx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov @acc[$i], 8*$i($out_ptr) + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, %rbp + neg %rbp + mulq %rbx + add %rax, @acc[3] + adc %rdx, %rbp + mov @acc[3], 8*3($out_ptr) + + mov 8*5($in_ptr), @acc[0] # load |v| + mov 8*6($in_ptr), @acc[1] + mov 8*7($in_ptr), @acc[2] + mov 8*8($in_ptr), @acc[3] + mov 8*9($in_ptr), @acc[4] + mov 8*10($in_ptr), @acc[5] + mov 8*11($in_ptr), @acc[6] + mov 8*12($in_ptr), @acc[7] + + mov $g0, $f0 + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $g0 # conditionally negate |g0| + add %rax, $g0 + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + + mulq $g0 + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<7; $i++) { +$code.=<<___; + mulq $g0 + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + imulq $g0 + add %rax, @acc[7] + adc \$0, %rdx # used in the final step + + mov %rbp, %rbx + sar \$63, %rbp # sign extension + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc %rbx, @acc[4] + adc %rbp, @acc[5] + adc %rbp, @acc[6] + adc %rbp, @acc[7] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 
8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + ret +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,\@abi-omnipotent +.align 32 +__smulq_256x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*5*$j; +my @acc=@acc; @acc=@acc[4..7] if($j); +my $top="%rbp"; $top=$g0 if($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), $top # sign/excess limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| (or |v|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, $top + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, $top + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, $top + neg $top + mulq %rbx + add %rax, @acc[3] + adc %rdx, $top +___ +$code.=<<___ if ($j==0); + mov $g0, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] # accumulate |u|*|f0| + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc %rcx, %rbp + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov %rbp, 8*4($out_ptr) + + ret +.size __smulq_256x63,.-__smulq_256x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
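
Concretely, each __smulq_256_n_shift_by_31 call computes one of the two updates that the python_ref writes as a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k, returning a non-negative value together with sign-adjusted multipliers. A minimal Python sketch of that step (the function name is illustrative only):

    def smul_n_shift(x, y, f, g, k=31):
        # exact signed x*f + y*g, divided by 2^k; if the result is
        # negative, negate it and fold the sign back into (f, g) so
        # that |a| and |b| stay non-negative between iterations
        r = (x*f + y*g) >> k
        if r < 0:
            r, f, g = -r, -f, -g
        return r, f, g

The assembly reaches the same result branchlessly: it folds the signs of f/g into the operands with masks, performs an unsigned limb-by-limb multiply, shifts by 31 with shrd, and conditionally negates the result while adjusting f0/g0 accordingly.
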
+{ +$code.=<<___; +.type __smulq_256_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulq_256_n_shift_by_31: + mov $f0, 8*0($out_ptr) # offload |f0| + mov $g0, 8*1($out_ptr) # offload |g0| + mov $f0, %rbp +___ +for($j=0; $j<2; $j++) { +my $k = 8*4*$j; +my @acc=@acc; @acc=@acc[4..7] if ($j); +my $f0="%rbp"; $f0=$g0 if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| (or |g0|) + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |a| (or |b|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + and %rbx, $f0 + neg $f0 + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + mulq %rbx + add %rax, @acc[3] + adc %rdx, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc $g0, %rbp + + mov 8*0($out_ptr), $f0 # restore original |f0| + mov 8*1($out_ptr), $g0 # restore original |g0| + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, %rbp, @acc[3] + + sar \$63, %rbp # sign as mask + xor %rax, %rax + sub %rbp, %rax # sign as bit + + xor %rbp, @acc[0] # conditionally negate the result + xor %rbp, @acc[1] + xor %rbp, @acc[2] + xor %rbp, @acc[3] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + + xor %rbp, $f0 # conditionally negate |f0| + xor %rbp, $g0 # conditionally negate |g0| + add %rax, $f0 + add %rax, $g0 + + ret +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +___ +} + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31_256,\@abi-omnipotent +.align 32 +__ab_approximation_31_256: + mov 8*3($in_ptr), @a[2] # load |a| in reverse order + mov 8*7($in_ptr), @b[2] # load |b| in reverse order + mov 8*2($in_ptr), @a[1] + mov 8*6($in_ptr), @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*5($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*4($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + not %rax + and %rax, @a[2] + and %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31_256 + + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +___ +} +$code.=<<___; +.type __inner_loop_31_256,\@abi-omnipotent +.align 32 # comment and punish Coffee Lake by up to 40% +__inner_loop_31_256: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31_256: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31_256 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,\@abi-omnipotent +.align 32 +__inner_loop_62_256: + mov $cnt, %r15d + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov $f0, $g1 # |g1|=1 + mov $f0, %r14 + +.Loop_62_256: + xor $t0, $t0 + test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test %r14, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, %r15d + jnz .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/ct_inverse_mod_384-armv8.pl b/blst/asm/ct_inverse_mod_384-armv8.pl new file mode 100755 index 0000000..268bf9d --- /dev/null +++ b/blst/asm/ct_inverse_mod_384-armv8.pl @@ -0,0 +1,610 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific FLT addition chain... 
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(3..14)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); +my $cnt = $n_ptr; +my @t = map("x$_",(22..28,2)); +my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @t[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + ldp @acc[4], @acc[5], [$in_ptr,#8*4] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+ stp $out_ptr, $nx_ptr, [sp] + + ldp @acc[6], @acc[7], [$n_ptr,#8*0] + ldp @acc[8], @acc[9], [$n_ptr,#8*2] + ldp @acc[10], @acc[11], [$n_ptr,#8*4] + + stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] + stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*8] + stp @acc[10], @acc[11], [$in_ptr,#8*10] + + ////////////////////////////////////////// first iteration + mov $cnt, #62 + bl .Lab_approximation_62_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str $f0, [$out_ptr,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr @acc[4], [$in_ptr,#8*12] // |u| + ldr @acc[5], [$in_ptr,#8*18] // |v| + mul @acc[0], $f_, @acc[4] // |u|*|f0| + smulh @acc[1], $f_, @acc[4] + mul @acc[2], $g_, @acc[5] // |v|*|g0| + smulh @acc[3], $g_, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + asr @acc[2], @acc[1], #63 // sign extenstion + stp @acc[2], @acc[2], [$out_ptr,#8*8] + stp @acc[2], @acc[2], [$out_ptr,#8*10] + + mul @acc[0], $f0, @acc[4] // |u|*|f1| + smulh @acc[1], $f0, @acc[4] + mul @acc[2], $g0, @acc[5] // |v|*|g1| + smulh @acc[3], $g0, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*12] + asr @acc[2], @acc[1], #63 // sign extenstion + stp @acc[2], @acc[2], [$out_ptr,#8*14] + stp @acc[2], @acc[2], [$out_ptr,#8*16] +___ +for($i=2; $i<11; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add $out_ptr, $out_ptr, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |v| + bl __smul_383x63 +___ +$code.=<<___ if ($i>5); + bl __smul_767x63_tail +___ +$code.=<<___ if ($i==5); + asr @t[5], @t[5], #63 // sign extension + stp @t[5], @t[5], [$out_ptr,#8*6] + stp @t[5], @t[5], [$out_ptr,#8*8] + stp @t[5], @t[5], [$out_ptr,#8*10] +___ +} +$code.=<<___; + ////////////////////////////////////////// iteration before last + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load + ldp $b_lo, $b_hi, [$in_ptr,#8*6] + bl __inner_loop_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + str $a_lo, [$out_ptr,#8*0] + str $b_lo, [$out_ptr,#8*6] + + mov $f_, $f0 // exact |f0| + mov $g_, $g0 // 
exact |g0| + mov $f0, $f1 + mov $g0, $g1 + add $out_ptr, $out_ptr, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov $f_, $f0 // exact |f1| + mov $g_, $g0 // exact |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + eor $a_hi, $a_hi, $a_hi + ldr $b_lo, [$in_ptr,#8*6] + eor $b_hi, $b_hi, $b_hi + bl __inner_loop_62 + + mov $f_, $f1 + mov $g_, $g1 + ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr @t[0], @acc[5], #63 // sign as mask + ldp @acc[6], @acc[7], [$f0,#8*0] + ldp @acc[8], @acc[9], [$f0,#8*2] + ldp @acc[10], @acc[11], [$f0,#8*4] + + and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally + and @acc[7], @acc[7], @t[0] + adds @acc[0], @acc[0], @acc[6] + and @acc[8], @acc[8], @t[0] + adcs @acc[1], @acc[1], @acc[7] + and @acc[9], @acc[9], @t[0] + adcs @acc[2], @acc[2], @acc[8] + and @acc[10], @acc[10], @t[0] + adcs @acc[3], @acc[3], @acc[9] + and @acc[11], @acc[11], @t[0] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @acc[11] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
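
For reference, __smul_383x63 (with __smul_767x63_tail picking up the upper limbs) performs the update that the python_ref writes as u, v = u*f0 + v*g0, u*f1 + v*g1. A minimal Python sketch of the sign handling it relies on (names are illustrative only):

    def smul(x, m):
        # fold m's sign into x (the assembly does this with a mask and
        # conditional negation), then multiply by the non-negative m
        if m < 0:
            x, m = -x, -m
        return x * m

    def smul_383x63(u, v, f, g):
        # |u|*|f| + |v|*|g|; the value can grow towards 767 bits, which
        # is what the __smul_767x63_tail helper accounts for
        return smul(u, f) + smul(v, g)

Per outer iteration the pair is updated in lockstep, i.e. u, v = smul_383x63(u, v, f0, g0), smul_383x63(u, v, f1, g1).
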
+.type __smul_383x63, %function +.align 5 +__smul_383x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*12+8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $f_ + eor @acc[5], @acc[5], $f1 + umulh @t[1], @acc[1], $f_ + adcs @acc[4], @acc[4], xzr + umulh @t[2], @acc[2], $f_ + adcs @acc[5], @acc[5], xzr + umulh @t[3], @acc[3], $f_ +___ +$code.=<<___ if ($j); + adc $g1, xzr, xzr // used in __smul_767x63_tail +___ +$code.=<<___; + umulh @t[4], @acc[4], $f_ + mul @acc[0], @acc[0], $f_ + mul @acc[1], @acc[1], $f_ + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $f_ + adcs @acc[3], @acc[3], @t[2] + mul @t[5+$j],@acc[5], $f_ + adcs @acc[4], @acc[4], @t[3] + adcs @t[5+$j],@t[5+$j],@t[4] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adcs @t[5], @t[5], @t[6] + stp @acc[4], @t[5], [$out_ptr,#8*4] + adc @t[6], @t[7], xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh @t[5], @acc[5], $f_ + ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| + umulh @acc[11],@acc[11], $g_ + ldp @acc[2], @acc[3], [$in_ptr,#8*26] + ldp @acc[4], @acc[5], [$in_ptr,#8*28] + + eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| + eor @acc[1], @acc[1], $f1 + eor @acc[2], @acc[2], $f1 + adds @acc[0], @acc[0], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[5], @acc[5], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $g_ + adcs @acc[4], @acc[4], xzr + umulh @t[1], @acc[1], $g_ + adc @acc[5], @acc[5], xzr + + umulh @t[2], @acc[2], $g_ + add @acc[11], @acc[11], @t[6] + umulh @t[3], @acc[3], $g_ + asr @t[6], @t[5], #63 + umulh @t[4], @acc[4], $g_ + mul @acc[0], @acc[0], $g_ + mul @acc[1], @acc[1], $g_ + mul @acc[2], @acc[2], $g_ + adds @acc[0], @acc[0], @acc[11] + mul @acc[3], @acc[3], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @acc[4], @acc[4], $g_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[5], @acc[5], $g_ + adcs @acc[3], @acc[3], @t[2] + adcs @acc[4], @acc[4], @t[3] + adc @acc[5], @acc[5], @t[4] + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @acc[3], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @t[6] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @t[6] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, 
%function +.align 5 +__smul_383_n_shift_by_62: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) + sub @t[7], @t[7], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], @t[7] + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], @t[7] + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], @t[7] + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], @t[7] + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], @t[7] + smulh @t[5+$j], @acc[5], @t[7] + mul @acc[0], @acc[0], @t[7] + mul @acc[1], @acc[1], @t[7] + mul @acc[2], @acc[2], @t[7] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[7] + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], @t[7] + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], @t[7] + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], xzr +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #62 + extr @acc[1], @acc[2], @acc[1], #62 + extr @acc[2], @acc[3], @acc[2], #62 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #62 + extr @acc[4], @acc[5], @acc[4], #62 + extr @acc[5], @acc[6], @acc[5], #62 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + eor $f0, $f0, @t[6] + eor $g0, $g0, @t[6] + sub $f0, $f0, @t[6] + sub $g0, $g0, @t[6] + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; + +$code.=<<___; +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp @a[4], @a[5], [$in_ptr,#8*4] + ldp @b[4], @b[5], [$in_ptr,#8*10] + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*8] + +.Lab_approximation_62_loaded: + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*6] + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... 
+ csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[1], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr @a[5], @a[5], @a[4] + orr @b[5], @b[5], @b[4] + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62: + sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + subs @t[2], $b_lo, $a_lo // |b_|-|a_| + and @t[0], $b_lo, @t[6] + sbc @t[3], $b_hi, $a_hi + and @t[1], $b_hi, @t[6] + subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + sbcs @t[5], $a_hi, @t[1] + mov @t[1], $g0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $b_hi, $b_hi, $a_hi, hs + csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $a_hi, @t[5], @t[3], hs + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + extr $a_lo, $a_hi, $a_lo, #1 + lsr $a_hi, $a_hi, #1 + and @t[0], $f1, @t[6] + and @t[1], $g1, @t[6] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) + cbnz $cnt, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ + +print $code; +close STDOUT; diff --git a/blst/asm/ct_is_square_mod_384-armv8.pl b/blst/asm/ct_is_square_mod_384-armv8.pl new file mode 100755 index 0000000..dcf3ff8 --- /dev/null +++ b/blst/asm/ct_is_square_mod_384-armv8.pl @@ -0,0 +1,398 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific Legendre symbol addition chain... 
+# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); +my @acc=map("x$_",(3..14)); +my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); +my @t = map("x$_",(21..28)); +my ($a_, $b_) = @acc[5,11]; + +$frame = 2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [x0,#8*0] // load input + ldp @acc[2], @acc[3], [x0,#8*2] + ldp @acc[4], @acc[5], [x0,#8*4] + + add $in_ptr, sp, #255 // find closest 256-byte-aligned spot + and $in_ptr, $in_ptr, #-256 // in the frame... 
+ + ldp @acc[6], @acc[7], [x1,#8*0] // load modulus + ldp @acc[8], @acc[9], [x1,#8*2] + ldp @acc[10], @acc[11], [x1,#8*4] + + stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*8] + stp @acc[4], @acc[5], [$in_ptr,#8*10] + stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*2] + stp @acc[10], @acc[11], [$in_ptr,#8*4] + + eor $L, $L, $L // init the Legendre symbol + mov $cnt, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub $cnt, $cnt, #1 + + eor $out_ptr, $in_ptr, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov $f1, $f0 // |f0| + mov $g1, $g0 // |g0| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp @acc[6], @acc[7], [$out_ptr,#-8*6] + eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| + and @t[6], @t[6], @acc[6] // if |a| was negative, + add $L, $L, @t[6], lsr#1 // adjust |L| + + cbnz $cnt, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr $a_, [$in_ptr,#8*6] // just load + mov $b_, @acc[6] // ldr $b_, [$in_ptr,#8*0] + mov $cnt, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, $L, #1 + eor x0, x0, #1 + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +my $fx = $g1; $fx = $f1 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) + asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) + sub $fx, $fx, @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], $fx + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $fx + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], $fx + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], $fx + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], $fx + and @t[7], $fx, @t[6] + umulh @t[5+$j], @acc[5], $fx + neg @t[7], @t[7] + mul @acc[0], @acc[0], $fx + mul @acc[1], @acc[1], $fx + mul @acc[2], @acc[2], $fx + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $fx + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $fx + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], $fx + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], @t[7] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #30 + extr @acc[1], @acc[2], @acc[1], #30 + extr @acc[2], @acc[3], @acc[2], #30 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #30 + extr @acc[4], @acc[5], @acc[4], #30 + extr @acc[5], @acc[6], 
@acc[5], #30 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; +my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); + +$code.=<<___; +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers + ldp @b[2], @b[3], [$in_ptr,#8*2] + + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... + csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] // and one more, ... + csel @b[4], @b[4], @b[1], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[0], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr $a_, @a[5], @a[4] + orr $b_, @b[5], @b[4] + + bfxil $a_, @a[0], #0, #32 + bfxil $b_, @b[0], #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov $cnt, #30 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 + mov @t[0], $fg1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + csel $L, $L, @t[4], hs + lsr $a_, $a_, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + add $t[2], $b_, #2 + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + + cbnz $cnt, .Loop_30 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove the bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, 
$bias + + ret +.size __inner_loop_30,.-__inner_loop_30 +___ +} + +$code.=<<___; +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $L, $L, @t[4], hs + add $t[2], $b_, #2 + lsr $a_, $a_, #1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz $cnt, .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ + +print $code; +close STDOUT; diff --git a/blst/asm/ct_is_square_mod_384-x86_64.pl b/blst/asm/ct_is_square_mod_384-x86_64.pl new file mode 100755 index 0000000..40016ed --- /dev/null +++ b/blst/asm/ct_is_square_mod_384-x86_64.pl @@ -0,0 +1,494 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific Legendre symbol addition chain... +# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); +my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); +my @acc=map("%r$_",(8..15)); +my $L = "%rbp"; + +$frame = 8*3+2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,\@function,2,"unwind" +.align 32 +ct_is_square_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx 
+.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot + and \$-256, %rax # in the frame... + + mov 8*0(%rdi), @acc[0] # load input + mov 8*1(%rdi), @acc[1] + mov 8*2(%rdi), @acc[2] + mov 8*3(%rdi), @acc[3] + mov 8*4(%rdi), @acc[4] + mov 8*5(%rdi), @acc[5] + + mov 8*0(%rsi), @acc[6] # load modulus + mov 8*1(%rsi), @acc[7] + mov 8*2(%rsi), %rbx + mov 8*3(%rsi), %rcx + mov 8*4(%rsi), %rdx + mov 8*5(%rsi), %rdi + mov %rax, $in_ptr # pointer to source |a|b| + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov %rbx, 8*8(%rax) + mov %rcx, 8*9(%rax) + mov %rdx, 8*10(%rax) + mov %rdi, 8*11(%rax) + + xor $L, $L # initialize the Legendre symbol + mov \$24, %ecx # 24 is 768/30-1 + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + mov %ecx, 8*2(%rsp) # offload loop counter + + call __ab_approximation_30 + mov $f0, 8*0(%rsp) # offload |f0| and |g0| + mov $g0, 8*1(%rsp) + + mov \$128+8*6, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |b| + call __smulq_384_n_shift_by_30 + + mov 8*0(%rsp), $f1 # pop |f0| and |g0| + mov 8*1(%rsp), $g1 + lea -8*6($out_ptr),$out_ptr # pointer to destination |a| + call __smulq_384_n_shift_by_30 + + mov 8*2(%rsp), %ecx # re-load loop counter + xor \$128, $in_ptr # flip-flop pointer to source |a|b| + + and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| + shr \$1, @acc[6] + add @acc[6], $L + + sub \$1, %ecx + jnz .Loop_is_square + + ################################# last iteration + #call __ab_approximation_30 # |a| and |b| are exact, just load + #mov 8*0($in_ptr), @acc[0] # |a_| + mov 8*6($in_ptr), @acc[1] # |b_| + call __inner_loop_48 # 48 is 768%30+30 + + mov \$1, %rax + and $L, %rax + xor \$1, %rax # return value + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,\@abi-omnipotent +.align 32 +__smulq_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, %rbx # |f1| (or |g1|) + sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) + xor %rax, %rax + sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) + + xor %rdx, %rbx # conditionally negate |f1| (or |g1|) + add %rax, %rbx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov %rdx, @acc[6+$j] + and %rbx, @acc[6+$j] + mulq %rbx # |a|*|f1| (or |b|*|g1|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq %rbx + add %rax, 
@acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + neg @acc[6+$j] + mulq %rbx + add %rax, @acc[5] + adc %rdx, @acc[6+$j] +___ +$code.=<<___ if ($j==0); + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov $g1, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc @acc[7], @acc[6] + + shrd \$30, @acc[1], @acc[0] + shrd \$30, @acc[2], @acc[1] + shrd \$30, @acc[3], @acc[2] + shrd \$30, @acc[4], @acc[3] + shrd \$30, @acc[5], @acc[4] + shrd \$30, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor %rbx, %rbx + sub @acc[6], %rbx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add %rbx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +___ +{ +my ($a_, $b_) = @acc[0..1]; +my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); +my $cnt = "%edi"; +{ +my @a = @acc[0..5]; +my @b = (@a[1..3], $t4, $t5, $g0); + +$code.=<<___; +.type __ab_approximation_30,\@abi-omnipotent +.align 32 +__ab_approximation_30: + mov 8*11($in_ptr), @b[5] # load |b| in reverse order + mov 8*10($in_ptr), @b[4] + mov 8*9($in_ptr), @b[3] + + mov @a[5], %rax + or @b[5], %rax # check top-most limbs, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[3], @a[4] + mov 8*8($in_ptr), @b[2] + cmovz @b[3], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... ones before top-most, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[2], @a[4] + mov 8*7($in_ptr), @b[1] + cmovz @b[2], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[1], @a[4] + mov 8*6($in_ptr), @b[0] + cmovz @b[1], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... 
+ cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[0], @a[4] + cmovz @b[0], @b[4] + + mov @a[5], %rax + or @b[5], %rax + bsr %rax, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[5] + cmovz @b[0], @b[5] + cmovz %rax, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[4], @a[5] # align second limb to the left + shldq %cl, @b[4], @b[5] + + mov \$0xFFFFFFFF00000000, %rax + mov @a[0]d, ${a_}d + mov @b[0]d, ${b_}d + and %rax, @a[5] + and %rax, @b[5] + or @a[5], ${a_} + or @b[5], ${b_} + + jmp __inner_loop_30 + + ret +.size __ab_approximation_30,.-__ab_approximation_30 +___ +} +$code.=<<___; +.type __inner_loop_30,\@abi-omnipotent +.align 32 +__inner_loop_30: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF + mov \$30, $cnt + +.Loop_30: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax # pre-"negate" |L| + mov $fg0, $t2 + mov $fg1, $t3 + mov $L, $t4 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + cmovz $t4, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 + sub $bias, $fg1 + + sub \$1, $cnt + jnz .Loop_30 + + shr \$32, $bias + mov %ebx, %eax # $fg0 -> $f0 + shr \$32, $g0 + mov %ecx, %edx # $fg1 -> $f1 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,\@abi-omnipotent +.align 32 +__inner_loop_48: + mov \$48, $cnt # 48 is 768%30+30 + +.Loop_48: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax + mov $L, $t2 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 + + sub \$1, $cnt + jnz .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/ctq_inverse_mod_384-x86_64.pl b/blst/asm/ctq_inverse_mod_384-x86_64.pl new file mode 100755 index 0000000..2be39d8 --- /dev/null +++ b/blst/asm/ctq_inverse_mod_384-x86_64.pl @@ -0,0 +1,886 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific FLT addition chain... 
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,\@function,4,"unwind" +.align 32 +ct_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr # pointer to source |a|b|1|0| + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<11; $i++) { +my $smul_767x63 = $i>5 ? 
"__smulq_767x63" + : "__smulq_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==5); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# iteration before last + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + mov 8*1($in_ptr), @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + mov 8*7($in_ptr), @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + mov @acc[0], 8*0($out_ptr) + mov @acc[2], 8*6($out_ptr) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*12($out_ptr),$out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call __smulq_767x63 + + ################################# last iteration + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$22, $cnt # 766 % 62 + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulq_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc 
@acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +___ +######################################################################## +# see corresponding commentary in ctx_inverse_mod_384-x86_64... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulq_767x63,\@abi-omnipotent +.align 32 +__smulq_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor $f0, $fx # conditionally negate |f0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] + mov @acc[$i], 8*$i($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + mov @acc[5], 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + mov $f0, $fx # overrides in_ptr + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $fx # conditionally negate |g0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + xor $f0, @acc[8] + xor $f0, @acc[9] + xor $f0, @acc[10] + xor $f0, @acc[11] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulq $fx # |v|*|g0| + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<11; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} 
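
# A minimal Python sketch (not part of the original sources; the 64-bit limb
# width and the helper name are assumed for illustration) of the conditional-
# negation pattern used throughout these smul helpers: each multi-limb operand
# is xor-ed with the multiplier's sign mask and the sign bit is added back
# through the carry chain, so the subsequent widening limb-by-limb multiply
# only ever sees non-negative values.
def cond_negate_limbs(limbs, f, w=64):
    mask = (1 << w) - 1 if f < 0 else 0   # "sar $63"-style sign mask
    carry = mask & 1                      # +1 only when actually negating
    out = []
    for x in limbs:
        t = (x ^ mask) + carry
        out.append(t & ((1 << w) - 1))
        carry = t >> w
    return out, abs(f)                    # then multiply limb by limb with |f|
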
+$code.=<<___; + mov 8*1(%rsp), %rdx # out_ptr + imulq $fx, %rax + mov 8*2(%rsp), $in_ptr # restore original in_ptr + add @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulq_767x63,.-__smulq_767x63 +___ +} +$code.=<<___; +.type __smulq_383x63,\@abi-omnipotent +.align 32 +__smulq_383x63: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |u| (or |v|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| (or |v|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx, %rax + add %rax, @acc[$i] + + lea 8*6($in_ptr), $in_ptr # pointer to |v| + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx, %rax + add %rax, @acc[$i] + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_383x63,.-__smulq_383x63 +___ +{ +$code.=<<___; +.type __smulq_383_n_shift_by_62,\@abi-omnipotent +.align 32 +__smulq_383_n_shift_by_62: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| (or |g0|) + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add 
@acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |a|*|f0| (or |b|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov %rdx, @acc[6] + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$62, @acc[1], @acc[0] + shrd \$62, @acc[2], @acc[1] + shrd \$62, @acc[3], @acc[2] + shrd \$62, @acc[4], @acc[3] + shrd \$62, @acc[5], @acc[4] + shrd \$62, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_62,\@abi-omnipotent +.align 32 +__ab_approximation_62: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*2($in_ptr), @a[0] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*0($in_ptr), @a[0] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + jmp __inner_loop_62 + + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62,\@abi-omnipotent +.align 8 +.long 0 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + mov $in_ptr, 8(%rsp) + +.Loop_62: + xor $t0, $t0 + xor $t1, $t1 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t2 + mov $b_hi, $t3 + cmovnz $b_lo, $t0 + cmovnz $b_hi, $t1 + sub $a_lo, $t2 # |b_|-|a_| + sbb $a_hi, $t3 + mov $a_lo, $t4 + mov $a_hi, $t5 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + sbb $t1, $a_hi + cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t3, $a_hi + cmovc $t4, $b_lo # |b_| = |a_| + cmovc $t5, $b_hi + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shrd \$1, $a_hi, $a_lo + shr \$1, $a_hi + test \$1, $t4 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + mov 8(%rsp), $in_ptr + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/ctx_inverse_mod_384-x86_64.pl b/blst/asm/ctx_inverse_mod_384-x86_64.pl new file mode 100755 index 0000000..d207e2f --- /dev/null +++ b/blst/asm/ctx_inverse_mod_384-x86_64.pl @@ -0,0 +1,995 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >4x better than +# modulus-specific FLT addition chain... 
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulx_383_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulx_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,\@function,4,"unwind" +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<23; $i++) { +my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" + : "__smulx_191_n_shift_by_31"; +my $smul_767x63 = $i>11 ? 
"__smulx_767x63" + : "__smulx_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call $smul_n_shift + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call $smul_n_shift + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulx_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==11); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$53, $cnt # 31 + 766 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulx_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulx_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulx_767x63,\@abi-omnipotent +.align 32 +__smulx_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, %rax + sar \$63, %rax # |f0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor %rax, $f0 # conditionally negate |f0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |u| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |u|*|f0| + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + mov $g0, %rax + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + sar \$63, %rax # |g0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |g0|'s sign as bit + + xor %rax, $f0 # conditionally negate |g0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |v| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor %rax, @acc[5] + xor %rax, @acc[6] + xor %rax, @acc[7] + xor %rax, @acc[8] + xor %rax, @acc[9] + xor %rax, @acc[10] + xor %rax, @acc[11] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulx @acc[0], @acc[0], %rax # |v|*|g0| + mulx @acc[1], @acc[1], $fx + add %rax, @acc[1] +___ +for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { +$code.=<<___; + mulx 
@acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + mulx @acc[11], @acc[11], $fx + mov 8*1(%rsp), %rdx # out_ptr + mov 8*2(%rsp), $in_ptr # restore original in_ptr + adc @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulx_767x63,.-__smulx_767x63 +___ +} +$code.=<<___; +.type __smulx_383x63,\@abi-omnipotent +.align 32 +__smulx_383x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $fx, $f0 # conditionally negate |f0| + add %rax, $f0 + + xor $fx, @acc[0] # conditionally negate |u| (or |v|) + xor $fx, @acc[1] + xor $fx, @acc[2] + xor $fx, @acc[3] + xor $fx, @acc[4] + xor $fx, @acc[5] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) + mulx @acc[1], @acc[1], %rax + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + mulx @acc[$i], @acc[$i], %rax + mov $g0, $f0 + adc $fx, @acc[$i] + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + mulx @acc[$i], @acc[$i], %rax + adc $fx, @acc[$i] + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulx_383x63,.-__smulx_383x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
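
# A minimal Python sketch (not part of the original sources; the function name
# is assumed) of what the shift-by-31 helpers below compute: the signed
# combination a*f + b*g, whose low k bits cancel by construction of f and g,
# is shifted down by k and normalized to its absolute value, with any sign
# flip reported back to the caller by negating f and g.
def smul_n_shift_model(a, b, f, g, k=31):
    t = (a*f + b*g) >> k
    if t < 0:
        t, f, g = -t, -f, -g   # abs(), and let the caller see the corrected f, g
    return t, f, g
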
+{ +$code.=<<___; +.type __smulx_383_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_383_n_shift_by_31: + mov $f0, @acc[8] + xor @acc[6], @acc[6] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc %rdx, @acc[6] + + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) +___ +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), %rax + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, @acc[4], @acc[3] + shrd \$31, %rax, @acc[4] + shrd \$31, @acc[6], %rax + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +___ +} { +$code.=<<___; +.type __smulx_191_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_191_n_shift_by_31: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +my @acc=@acc; + @acc=@acc[3..5] if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor @acc[2], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[2] + add $fx, @acc[1] + adc \$0, @acc[2] + imulq %rdx + add %rax, @acc[2] + adc 
\$0, %rdx +___ +$code.=<<___ if ($j==0); + mov %rdx, @acc[6] + mov $g0, %rdx +___ +} +$code.=<<___; + add @acc[0], @acc[3] + adc @acc[1], @acc[4] + adc @acc[2], @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[4], @acc[3] + shrd \$31, @acc[5], @acc[4] + shrd \$31, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[3] # conditionally negate the result + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[3], 8*0($out_ptr) + mov @acc[4], 8*1($out_ptr) + mov @acc[5], 8*2($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31,\@abi-omnipotent +.align 32 +__ab_approximation_31: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*2($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*1($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + andn @a[2], %rax, @a[2] + andn @b[2], %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31 + + ret +.size __ab_approximation_31,.-__ab_approximation_31 +___ +} +$code.=<<___; +.type __inner_loop_31,\@abi-omnipotent +.align 32 +__inner_loop_31: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,\@abi-omnipotent +.align 32 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + +.Loop_62: + xor $t0, $t0 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test \$1, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/div3w-armv8.pl b/blst/asm/div3w-armv8.pl new file mode 100755 index 0000000..bfa3245 --- /dev/null +++ b/blst/asm/div3w-armv8.pl @@ -0,0 +1,122 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output = shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+$code.=<<___;
+.text
+
+.globl div_3_limbs
+.type div_3_limbs,%function
+.align 5
+div_3_limbs:
+ ldp x4,x5,[x0] // load R
+ eor x0,x0,x0 // Q = 0
+ mov x3,#64 // loop counter
+ nop
+
+.Loop:
+ subs x6,x4,x1 // R - D
+ add x0,x0,x0 // Q <<= 1
+ sbcs x7,x5,x2
+ add x0,x0,#1 // Q + speculative bit
+ csel x4,x4,x6,lo // select between R and R - D
+ extr x1,x2,x1,#1 // D >>= 1
+ csel x5,x5,x7,lo
+ lsr x2,x2,#1
+ sbc x0,x0,xzr // subtract speculative bit
+ sub x3,x3,#1
+ cbnz x3,.Loop
+
+ asr x3,x0,#63 // top bit -> mask
+ add x0,x0,x0 // Q <<= 1
+ subs x6,x4,x1 // R - D
+ add x0,x0,#1 // Q + speculative bit
+ sbcs x7,x5,x2
+ sbc x0,x0,xzr // subtract speculative bit
+
+ orr x0,x0,x3 // all ones if overflow
+
+ ret
+.size div_3_limbs,.-div_3_limbs
+___
+{
+my ($div_rem, $divisor, $quot) = map("x$_",(0..2));
+my @div = map("x$_",(3..4));
+my @acc = map("x$_",(5..7));
+my @t = map("x$_",(8..11));
+
+$code.=<<___;
+.globl quot_rem_128
+.type quot_rem_128,%function
+.align 5
+quot_rem_128:
+ ldp @div[0],@div[1],[$divisor]
+
+ mul @acc[0],@div[0],$quot // divisor[0:1] * quotient
+ umulh @acc[1],@div[0],$quot
+ mul @t[3], @div[1],$quot
+ umulh @acc[2],@div[1],$quot
+
+ ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend
+ ldr @t[2],[$div_rem,#16]
+
+ adds @acc[1],@acc[1],@t[3]
+ adc @acc[2],@acc[2],xzr
+
+ subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient
+ sbcs @t[1],@t[1],@acc[1]
+ sbcs @t[2],@t[2],@acc[2]
+ sbc @acc[0],xzr,xzr // borrow -> mask
+
+ add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ...
+ and @div[0],@div[0],@acc[0]
+ and @div[1],@div[1],@acc[0]
+ adds @t[0],@t[0],@div[0] // ... and add divisor
+ adc @t[1],@t[1],@div[1]
+
+ stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder
+ str $quot,[$div_rem,#16] // and one limb of the quotient
+
+ mov x0,$quot // return adjusted quotient
+
+ ret
+.size quot_rem_128,.-quot_rem_128
+
+.globl quot_rem_64
+.type quot_rem_64,%function
+.align 5
+quot_rem_64:
+ ldr @div[0],[$divisor]
+ ldr @t[0],[$div_rem] // load 1 limb of the dividend
+
+ mul @acc[0],@div[0],$quot // divisor * quotient
+
+ sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient
+
+ stp @t[0],$quot,[$div_rem] // save remainder and quotient
+
+ mov x0,$quot // return quotient
+
+ ret
+.size quot_rem_64,.-quot_rem_64
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/div3w-x86_64.pl b/blst/asm/div3w-x86_64.pl
new file mode 100755
index 0000000..b8192db
--- /dev/null
+++ b/blst/asm/div3w-x86_64.pl
@@ -0,0 +1,184 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$c_ref=<<'___'; +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) +{ + llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; + llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; + limb_t Q = 0, mask; + size_t i; + + for (i = 0; i < LIMB_BITS; i++) { + Q <<= 1; + mask = (R >= D); + Q |= mask; + R -= (D & ((llimb_t)0 - mask)); + D >>= 1; + } + + mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ + + Q <<= 1; + Q |= (R >= D); + + return (Q | mask); +} +___ + +$code.=<<___; +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,\@function,3 +.align 32 +div_3_limbs: + mov (%rdi),%r8 # load R.lo + mov 8(%rdi),%r9 # load R.hi + xor %rax,%rax # Q = 0 + mov \$64,%ecx # loop counter + +.Loop: + mov %r8,%r10 # put aside R + sub %rsi,%r8 # R -= D + mov %r9,%r11 + sbb %rdx,%r9 + lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit + mov %rdx,%rdi + cmovc %r10,%r8 # restore R if R - D borrowed + cmovc %r11,%r9 + sbb \$0,%rax # subtract speculative bit + shl \$63,%rdi + shr \$1,%rsi + shr \$1,%rdx + or %rdi,%rsi # D >>= 1 + sub \$1,%ecx + jnz .Loop + + lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit + sar \$63,%rax # top bit -> mask + + sub %rsi,%r8 # R -= D + sbb %rdx,%r9 + sbb \$0,%rcx # subtract speculative bit + + or %rcx,%rax # all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +######################################################################## +# Calculate remainder and adjust the quotient, which can be off-by-one. +# Then save quotient in limb next to top limb of the remainder. There is +# place, because the remainder/next-iteration-dividend gets shorter by +# one limb. 
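+# For illustration only (not from the upstream sources): a rough C model of
+# the off-by-one fix-up that quot_rem_128 below performs, in the same spirit
+# as the |$c_ref| snippet above. It assumes 64-bit limbs and a compiler with
+# unsigned __int128; the names $c_model and quot_rem_128_model are made up
+# for this sketch.
+$c_model=<<'___';
+#include <stdint.h>
+
+uint64_t quot_rem_128_model(uint64_t div_rem[3], const uint64_t divisor[2],
+                            uint64_t quot)
+{
+    unsigned __int128 p0 = (unsigned __int128)divisor[0] * quot;
+    unsigned __int128 p1 = (unsigned __int128)divisor[1] * quot
+                         + (uint64_t)(p0 >> 64);
+    uint64_t r0, r1, r2, borrow;
+
+    /* dividend - divisor * quotient, three limbs */
+    r0 = div_rem[0] - (uint64_t)p0;
+    borrow = r0 > div_rem[0];
+    r1 = div_rem[1] - (uint64_t)p1 - borrow;
+    borrow = (r1 > div_rem[1]) | ((r1 == div_rem[1]) & borrow);
+    r2 = div_rem[2] - (uint64_t)(p1 >> 64) - borrow;
+    borrow = (r2 > div_rem[2]) | ((r2 == div_rem[2]) & borrow);
+
+    if (borrow) {       /* borrowed, so the quotient was one too large */
+        unsigned __int128 t = (unsigned __int128)r0 + divisor[0];
+        quot--;                         /* ... adjust the quotient ...  */
+        r0 = (uint64_t)t;               /* ... and add the divisor back */
+        r1 += divisor[1] + (uint64_t)(t >> 64);
+    }
+
+    div_rem[0] = r0;    /* two limbs of the remainder (top limb is zero) */
+    div_rem[1] = r1;
+    div_rem[2] = quot;  /* and one limb of the quotient */
+
+    return quot;        /* return adjusted quotient */
+}
+___
+# In the assembly the borrow from the 3-limb subtraction doubles as an
+# all-ones mask, so the quotient adjustment and the conditional add-back of
+# the divisor are performed without branches.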
+{ +my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); +my @acc = ("%r8", "%r9", "%rdx"); +my @tmp = ("%r10", "%r11", "%rax"); + +$code.=<<___; +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,\@function,3 +.align 32 +quot_rem_128: + mov %rdx, %rax + mov %rdx, $quotient + + mulq 0($divisor) # divisor[0:1] * quotient + mov %rax, @acc[0] + mov $quotient, %rax + mov %rdx, @acc[1] + + mulq 8($divisor) + add %rax, @acc[1] + adc \$0, %rdx # %rdx is @acc[2] + + mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend + mov 8($div_rem), @tmp[1] + mov 16($div_rem), @tmp[2] + + sub @acc[0], @tmp[0] # dividend - divisor * quotient + sbb @acc[1], @tmp[1] + sbb @acc[2], @tmp[2] + sbb @acc[0], @acc[0] # borrow -> mask + + add @acc[0], $quotient # if borrowed, adjust the quotient ... + mov @acc[0], @acc[1] + and 0($divisor), @acc[0] + and 8($divisor), @acc[1] + add @acc[0], @tmp[0] # ... and add divisor + adc @acc[1], @tmp[1] + + mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... + mov @tmp[1], 8($div_rem) + mov $quotient, 16($div_rem) # ... and 1 limb of the quotient + + mov $quotient, %rax # return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +######################################################################## +# Unlike 128-bit case above, quotient is exact. As result just one limb +# of the dividend is sufficient to calculate the remainder... + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,\@function,3 +.align 32 +quot_rem_64: + mov %rdx, %rax # return quotient + imulq 0($divisor), %rdx # divisor[0] * quotient + + mov 0($div_rem), @tmp[0] # load 1 limb of the dividend + + sub %rdx, @tmp[0] # dividend - divisor * quotient + + mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... + mov %rax, 8($div_rem) # ... and 1 limb of the quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/mul_mont_256-armv8.pl b/blst/asm/mul_mont_256-armv8.pl new file mode 100755 index 0000000..ba6c2b8 --- /dev/null +++ b/blst/asm/mul_mont_256-armv8.pl @@ -0,0 +1,409 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod=map("x$_",(5..8)); +$bi="x9"; +@a=map("x$_",(10..13)); +@tmp=map("x$_",(14..17)); +@acc=map("x$_",(19..24)); +$m0=$n_ptr; + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + mul @acc[0],@a[0],$bi + ldp @mod[0],@mod[1],[$n_ptr] + mul @acc[1],@a[1],$bi + ldp @mod[2],@mod[3],[$n_ptr,#16] + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],xzr, @tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adc @acc[4],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adc @acc[4],@acc[4],xzr + + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +___ +{ +my @acc = (@a,@acc[0..3]); +my @a = @mod; + +$code.=<<___; +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mov $n0,$n_ptr + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul @acc[1],@a[1],@a[0] // a[1]*a[0] + umulh @tmp[1],@a[1],@a[0] + mul @acc[2],@a[2],@a[0] // a[2]*a[0] + umulh @tmp[2],@a[2],@a[0] + mul @acc[3],@a[3],@a[0] // a[3]*a[0] + umulh @acc[4],@a[3],@a[0] + + adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication + mul @tmp[0],@a[2],@a[1] // a[2]*a[1] + umulh @tmp[1],@a[2],@a[1] + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@a[3],@a[1] // a[3]*a[1] + umulh @tmp[3],@a[3],@a[1] + adc @acc[4],@acc[4],xzr // can't overflow + + mul @acc[5],@a[3],@a[2] // a[3]*a[2] + umulh @acc[6],@a[3],@a[2] + + adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication + mul @acc[0],@a[0],@a[0] // a[0]*a[0] + adc @tmp[2],@tmp[3],xzr // can't overflow + + adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication + umulh @a[0],@a[0],@a[0] + adcs @acc[4],@acc[4],@tmp[1] + mul @tmp[1],@a[1],@a[1] // a[1]*a[1] + adcs @acc[5],@acc[5],@tmp[2] + umulh @a[1],@a[1],@a[1] + adc @acc[6],@acc[6],xzr // can't overflow + + adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 + mul @tmp[2],@a[2],@a[2] // a[2]*a[2] + adcs @acc[2],@acc[2],@acc[2] + umulh @a[2],@a[2],@a[2] + adcs @acc[3],@acc[3],@acc[3] + mul @tmp[3],@a[3],@a[3] // a[3]*a[3] + adcs @acc[4],@acc[4],@acc[4] + umulh @a[3],@a[3],@a[3] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adc @acc[7],xzr,xzr + + adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] + adcs @acc[2],@acc[2],@tmp[1] + adcs @acc[3],@acc[3],@a[1] + adcs @acc[4],@acc[4],@tmp[2] + adcs @acc[5],@acc[5],@a[2] + adcs @acc[6],@acc[6],@tmp[3] + adc @acc[7],@acc[7],@a[3] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds @acc[0],@acc[0],@acc[4] // accumulate upper half + adcs @acc[1],@acc[1],@acc[5] + adcs @acc[2],@acc[2],@acc[6] + adcs @acc[3],@acc[3],@acc[7] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +} +{ +my @a = (@a, $bi); + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + paciasp + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp @tmp[0],@tmp[1],[$a_ptr,#32] + ldp @tmp[2],@tmp[3],[$a_ptr,#48] + + adds @a[0],@a[0],@tmp[0] + adcs @a[1],@a[1],@tmp[1] + adcs @a[2],@a[2],@tmp[2] + adcs @a[3],@a[3],@tmp[3] + adc @a[4],xzr,xzr + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + sbcs xzr, @a[4],xzr + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul $m0,$n0,@a[0] + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + mul $m0,$n0,@a[0] + adc @a[3],@a[4],@tmp[3] +___ +} +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + adc @a[3],@a[4],@tmp[3] + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 +___ +} + +print $code; + +close STDOUT; diff --git a/blst/asm/mul_mont_384-armv8.pl b/blst/asm/mul_mont_384-armv8.pl new file mode 100755 index 0000000..44e12a0 --- /dev/null +++ b/blst/asm/mul_mont_384-armv8.pl @@ -0,0 +1,2015 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod = map("x$_",(5..10)); +@a = map("x$_",(11..16)); +$bi = "x17"; +@acc = map("x$_",(19..25)); +@tmp = map("x$_",(26..28,0,1,3)); + +$code.=<<___; +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + adcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + adcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + adcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + stp @a[0],@a[1],[$r_ptr,#48] + csel @a[4],@a[4],@acc[4],lo + stp @a[2],@a[3],[$r_ptr,#64] + csel @a[5],@a[5],@acc[5],lo + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + sbcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + sbcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + sbcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[5],@a[5],@acc[5],lo + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret 
+.size __sub_mod_384,.-__sub_mod_384
+
+.globl mul_mont_384x
+.hidden mul_mont_384x
+.type mul_mont_384x,%function
+.align 5
+mul_mont_384x:
+ paciasp
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#288 // space for 3 768-bit vectors
+
+ mov @tmp[0],$r_ptr // save r_ptr
+ mov @tmp[1],$a_ptr // save a_ptr
+ mov @tmp[2],$b_ptr // save b_ptr
+
+ sub $r_ptr,sp,#0 // mul_384(t0, a->re, b->re)
+ bl __mul_384
+
+ add $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im)
+ add $b_ptr,$b_ptr,#48
+ add $r_ptr,sp,#96
+ bl __mul_384
+
+ ldp @mod[0],@mod[1],[$n_ptr]
+ ldp @mod[2],@mod[3],[$n_ptr,#16]
+ ldp @mod[4],@mod[5],[$n_ptr,#32]
+
+ sub $b_ptr,$a_ptr,#48
+ add $r_ptr,sp,#240
+ bl __add_mod_384
+
+ add $a_ptr,@tmp[2],#0
+ add $b_ptr,@tmp[2],#48
+ add $r_ptr,sp,#192 // t2
+ bl __add_mod_384
+
+ add $a_ptr,$r_ptr,#0
+ add $b_ptr,$r_ptr,#48
+ bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im)
+
+ ldp @mod[0],@mod[1],[$n_ptr]
+ ldp @mod[2],@mod[3],[$n_ptr,#16]
+ ldp @mod[4],@mod[5],[$n_ptr,#32]
+
+ mov $a_ptr,$r_ptr
+ add $b_ptr,sp,#0
+ bl __sub_mod_384x384
+
+ add $b_ptr,sp,#96
+ bl __sub_mod_384x384 // t2 = t2-t0-t1
+
+ add $a_ptr,sp,#0
+ add $b_ptr,sp,#96
+ add $r_ptr,sp,#0
+ bl __sub_mod_384x384 // t0 = t0-t1
+
+ add $a_ptr,sp,#0 // ret->re = redc(t0)
+ add $r_ptr,@tmp[0],#0
+ bl __mul_by_1_mont_384
+ bl __redc_tail_mont_384
+
+ add $a_ptr,sp,#192 // ret->im = redc(t2)
+ add $r_ptr,$r_ptr,#48
+ bl __mul_by_1_mont_384
+ bl __redc_tail_mont_384
+ ldr x30,[x29,#8]
+
+ add sp,sp,#288
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ autiasp
+ ret
+.size mul_mont_384x,.-mul_mont_384x
+
+.globl sqr_mont_384x
+.hidden sqr_mont_384x
+.type sqr_mont_384x,%function
+.align 5
+sqr_mont_384x:
+ paciasp
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add $r_ptr,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds @a[0],@a[0],@a[0] // add with itself + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @acc[0],@a[0],@acc[0],lo + csel @acc[1],@a[1],@acc[1],lo + csel @acc[2],@a[2],@acc[2],lo + ldp @a[0],@a[1],[sp] + csel @acc[3],@a[3],@acc[3],lo + ldr $bi, [sp,#48] + csel @acc[4],@a[4],@acc[4],lo + ldp @a[2],@a[3],[sp,#16] + csel @acc[5],@a[5],@acc[5],lo + ldp @a[4],@a[5],[sp,#32] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + add $b_ptr,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 + mov $bi,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + adc $n0,$bi,xzr + ldr $bi,[$b_ptr,8*$i] + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],$n0,xzr + ldr $n0,[x29,#96] + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adcs @acc[6],@acc[6],xzr + adc $bi,xzr,xzr + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adcs @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 + adc $bi,$bi,xzr +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + ldp 
$n0,$b_ptr,[x29,#96] // pull r_ptr + adc $bi,$bi,xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adcs @acc[4],@acc[5],@tmp[4] + adcs @acc[5],@acc[6],@tmp[5] + adc @acc[6],$bi,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs @tmp[4],@acc[4],@mod[4] + sbcs @tmp[5],@acc[5],@mod[5] + sbcs xzr, @acc[6],xzr + + csel @a[0],@acc[0],@tmp[0],lo + csel @a[1],@acc[1],@tmp[1],lo + csel @a[2],@acc[2],@tmp[2],lo + csel @a[3],@acc[3],@tmp[3],lo + csel @a[4],@acc[4],@tmp[4],lo + csel @a[5],@acc[5],@tmp[5],lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov $n0,$n_ptr // adjust for missing b_ptr + + mov $n_ptr,$r_ptr // save r_ptr + mov $r_ptr,sp + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + mov $a_ptr,sp + mov $r_ptr,$n_ptr // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov $bi,x5 // save b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + mov $r_ptr,sp +.Loop_sqr_383: + bl __sqr_384 + sub $b_ptr,$b_ptr,#1 // counter + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,sp + bl __mul_by_1_mont_384 + + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // just accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + cbnz $b_ptr,.Loop_sqr_383 + + mov $b_ptr,$bi + ldr $bi,[$bi] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ +my @acc=(@acc,@tmp[0..2]); + +$code.=<<___; +.type __sqr_384,%function +.align 5 +__sqr_384: + mul @acc[0],@a[1],@a[0] + mul @acc[1],@a[2],@a[0] + mul @acc[2],@a[3],@a[0] + mul @acc[3],@a[4],@a[0] + mul @acc[4],@a[5],@a[0] + + umulh @mod[1],@a[1],@a[0] + umulh @mod[2],@a[2],@a[0] + umulh @mod[3],@a[3],@a[0] + umulh @mod[4],@a[4],@a[0] + adds @acc[1],@acc[1],@mod[1] + umulh @mod[5],@a[5],@a[0] + adcs @acc[2],@acc[2],@mod[2] + mul @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + mul @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + mul @mod[4],@a[4],@a[1] + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],@a[1] + + adds @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],@a[1] + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],@a[1] + adc @acc[6],xzr,xzr + + mul @mod[0],@a[0],@a[0] + adds @acc[3],@acc[3],@mod[2] + umulh @a[0], @a[0],@a[0] + adcs @acc[4],@acc[4],@mod[3] + mul @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + mul @mod[4],@a[4],@a[2] + adc @acc[6],@acc[6],@mod[5] + mul @mod[5],@a[5],@a[2] + + adds @acc[4],@acc[4],@mod[3] + umulh @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + umulh @mod[4],@a[4],@a[2] + adcs @acc[6],@acc[6],@mod[5] + umulh @mod[5],@a[5],@a[2] + adc @acc[7],xzr,xzr + + mul @mod[1],@a[1],@a[1] + adds @acc[5],@acc[5],@mod[3] + umulh @a[1], @a[1],@a[1] + adcs @acc[6],@acc[6],@mod[4] + mul @mod[4],@a[4],@a[3] + adc @acc[7],@acc[7],@mod[5] + mul @mod[5],@a[5],@a[3] + + adds @acc[6],@acc[6],@mod[4] + umulh @mod[4],@a[4],@a[3] + adcs @acc[7],@acc[7],@mod[5] + umulh @mod[5],@a[5],@a[3] + adc @acc[8],xzr,xzr + mul @mod[2],@a[2],@a[2] + adds @acc[7],@acc[7],@mod[4] + umulh @a[2], @a[2],@a[2] + adc @acc[8],@acc[8],@mod[5] + mul @mod[3],@a[3],@a[3] + + mul @mod[5],@a[5],@a[4] + umulh @a[3], @a[3],@a[3] + adds @acc[8],@acc[8],@mod[5] + umulh @mod[5],@a[5],@a[4] + mul @mod[4],@a[4],@a[4] + adc @acc[9],@mod[5],xzr + + adds @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + adcs @acc[2],@acc[2],@acc[2] + adcs @acc[3],@acc[3],@acc[3] + adcs @acc[4],@acc[4],@acc[4] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adcs @acc[7],@acc[7],@acc[7] + umulh @a[4], 
@a[4],@a[4] + adcs @acc[8],@acc[8],@acc[8] + mul @mod[5],@a[5],@a[5] + adcs @acc[9],@acc[9],@acc[9] + umulh @a[5], @a[5],@a[5] + adc $a_ptr,xzr,xzr + + adds @acc[0],@acc[0],@a[0] + adcs @acc[1],@acc[1],@mod[1] + adcs @acc[2],@acc[2],@a[1] + adcs @acc[3],@acc[3],@mod[2] + adcs @acc[4],@acc[4],@a[2] + adcs @acc[5],@acc[5],@mod[3] + adcs @acc[6],@acc[6],@a[3] + stp @mod[0],@acc[0],[$r_ptr] + adcs @acc[7],@acc[7],@mod[4] + stp @acc[1],@acc[2],[$r_ptr,#16] + adcs @acc[8],@acc[8],@a[4] + stp @acc[3],@acc[4],[$r_ptr,#32] + adcs @acc[9],@acc[9],@mod[5] + stp @acc[5],@acc[6],[$r_ptr,#48] + adc @a[5],@a[5],$a_ptr + stp @acc[7],@acc[8],[$r_ptr,#64] + stp @acc[9],@a[5],[$r_ptr,#80] + + ret +.size __sqr_384,.-__sqr_384 +___ +} +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel @a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mul @tmp[0],$n0,@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + mul @tmp[0],$n0,@a[0] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +} +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel 
@a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + + umulh @mod[0],@a[0],$bi + umulh @mod[1],@a[1],$bi + umulh @mod[2],@a[2],$bi + umulh @mod[3],@a[3],$bi + umulh @mod[4],@a[4],$bi + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,8*1] + + str @acc[0],[$r_ptr] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],$bi +___ +for ($i=1;$i<5;$i++) { +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,#8*($i+1)] + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],@acc[6],@mod[5] + mul @mod[5],@a[5],$bi +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + adcs @acc[1],@acc[2],@mod[1] + adcs @acc[2],@acc[3],@mod[2] + adcs @acc[3],@acc[4],@mod[3] + adcs @acc[4],@acc[5],@mod[4] + adc @acc[5],@acc[6],@mod[5] + + stp @acc[0],@acc[1],[$r_ptr,#48] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp @a[0],@a[1],[$a_ptr] + mov @tmp[0],$r_ptr // save r_ptr + ldp @acc[0],@acc[1],[$a_ptr,#48] + mov @tmp[1],$a_ptr // save a_ptr + ldp @a[2],@a[3],[$a_ptr,#16] + mov @tmp[2],$b_ptr // save b_ptr + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @a[4],@a[5],[$a_ptr,#32] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + ldp @a[0],@a[1],[$b_ptr] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[0],@acc[1],[$b_ptr,#48] + adcs @mod[3],$a[3],@acc[3] + ldp @a[2],@a[3],[$b_ptr,#16] + adcs @mod[4],$a[4],@acc[4] + ldp @acc[2],@acc[3],[$b_ptr,#64] + adc @mod[5],$a[5],@acc[5] + ldp @a[4],@a[5],[$b_ptr,#32] + + stp @mod[0],@mod[1],[sp] + adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + stp @mod[2],@mod[3],[sp,#16] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + stp @mod[4],@mod[5],[sp,#32] + adcs @mod[4],$a[4],@acc[4] + stp @mod[0],@mod[1],[sp,#48] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[sp,#64] + stp @mod[4],@mod[5],[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) + add $b_ptr,sp,#48 + add $r_ptr,@tmp[0],#96 + bl __mul_384 + + add $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#0 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + add $a_ptr,@tmp[0],#96 // ret->im -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#96 + bl __sub_mod_384x384 + + add $b_ptr,@tmp[0],#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add $a_ptr,@tmp[0],#0 // ret->re -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @a[2],@a[3],[$a_ptr,#16] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[2],@acc[3],[$a_ptr,#64] + adcs @mod[1],$a[1],@acc[1] + ldp @a[4],@a[5],[$a_ptr,#32] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[3],$a[3],@acc[3] + stp @mod[0],@mod[1],[$r_ptr] + adcs @mod[4],$a[4],@acc[4] + ldp @mod[0],@mod[1],[$b_ptr] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[$r_ptr,#16] + + subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im + ldp @mod[2],@mod[3],[$b_ptr,#16] + sbcs @a[1],$a[1],@acc[1] + stp @mod[4],@mod[5],[$r_ptr,#32] + sbcs @a[2],$a[2],@acc[2] + ldp @mod[4],@mod[5],[$b_ptr,#32] + sbcs @a[3],$a[3],@acc[3] + sbcs @a[4],$a[4],@acc[4] + sbcs @a[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],@acc[6] + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],@acc[6] + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],@acc[6] + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],@acc[6] + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + mov $n0,$a_ptr // save a_ptr + add $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) + add $b_ptr,$r_ptr,#48 + bl __mul_384 + + add $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) + add $b_ptr,$n0,#48 + add $r_ptr,$r_ptr,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + adds @a[0],@a[0],@a[0] // add with itself + ldp @a[4],@a[5],[$r_ptr,#32] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adcs @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + stp @a[0],@a[1],[$r_ptr] + adcs @acc[2],@acc[2],@acc[2] + stp @a[2],@a[3],[$r_ptr,#16] + adcs @acc[3],@acc[3],@acc[3] + stp @a[4],@a[5],[$r_ptr,#32] + adcs @acc[4],@acc[4],@acc[4] + stp @acc[0],@acc[1],[$r_ptr,#48] + adc @acc[5],@acc[5],@acc[5] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp $bi,@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @mod[0],$a[0],$bi // t0 = a->re + a->im + adcs @mod[1],$a[1],@acc[1] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + adcs @mod[4],$a[4],@acc[4] + adc @mod[5],$a[5],@acc[5] + + subs @acc[0],$a[0],$bi // t1 = a->re - a->im + sbcs @acc[1],$a[1],@acc[1] + sbcs @acc[2],$a[2],@acc[2] + sbcs @acc[3],$a[3],@acc[3] + sbcs @acc[4],$a[4],@acc[4] + sbcs @acc[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr // borrow flag as mask + + stp @mod[0],@mod[1],[sp] + stp @mod[2],@mod[3],[sp,#16] + stp @mod[4],@mod[5],[sp,#32] + stp @acc[0],@acc[1],[sp,#48] + stp @acc[2],@acc[3],[sp,#64] + stp @acc[4],@acc[5],[sp,#80] + str @acc[6],[sp,#96] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds @acc[0],@a[0],@a[0] // add with itself + adcs @acc[1],@a[1],@a[1] + adcs @acc[2],@a[2],@a[2] + adcs @acc[3],@a[3],@a[3] + adcs @acc[4],@a[4],@a[4] + adc @acc[5],@a[5],@a[5] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + ldp @a[0],@a[1],[sp] + ldr $bi,[sp,#48] + ldp @a[2],@a[3],[sp,#16] + ldp @a[4],@a[5],[sp,#32] + + add $b_ptr,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr @acc[6],[sp,#96] // account for sign from a->re - a->im + ldp @acc[0],@acc[1],[sp] + ldp @acc[2],@acc[3],[sp,#16] + ldp @acc[4],@acc[5],[sp,#32] + + and @acc[0],@acc[0],@acc[6] + and @acc[1],@acc[1],@acc[6] + and @acc[2],@acc[2],@acc[6] + and @acc[3],@acc[3],@acc[6] + and @acc[4],@acc[4],@acc[6] + and @acc[5],@acc[5],@acc[6] + + subs @a[0],@a[0],@acc[0] + sbcs @a[1],@a[1],@acc[1] + sbcs @a[2],@a[2],@acc[2] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + and @acc[2],@mod[2],@acc[6] + and @acc[3],@mod[3],@acc[6] + and @acc[4],@mod[4],@acc[6] + and @acc[5],@mod[5],@acc[6] + + adds @a[0],@a[0],@acc[0] + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 
+ adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + + ldr $n0,[x29,#96] + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adc @acc[6],@acc[6],xzr + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + ldp $n0,$b_ptr,[x29,#96] // pull r_ptr + + adds @a[0],@acc[1],@tmp[0] + adcs @a[1],@acc[2],@tmp[1] + adcs @a[2],@acc[3],@tmp[2] + adcs @a[3],@acc[4],@tmp[3] + adcs @a[4],@acc[5],@tmp[4] + adcs @a[5],@acc[6],@tmp[5] + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + add $a_ptr,$a_ptr,#48 + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $b_ptr,$b_ptr,$bi + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ + +if (0) { +my @b = ($bi, @mod[0..4]); +my @comba = @acc[4..6]; + +$code.=<<___; +.type __mul_384_comba,%function +.align 5 +__mul_384_comba: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @b[4],@b[5],[$b_ptr,#32] + + mul @comba[0],@a[0],@b[0] + umulh @comba[1],@a[0],@b[0] + mul @acc[0],@a[1],@b[0] + umulh @acc[1],@a[1],@b[0] + str @comba[0],[$r_ptr] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[0],@b[1] + umulh @acc[3],@a[0],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],xzr, @acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[2],@b[0] + umulh @acc[1],@a[2],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#8] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[1],@b[1] + umulh @acc[3],@a[1],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[0],@b[2] + umulh @acc[1],@a[0],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[3],@b[0] + umulh @acc[3],@a[3],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#16] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[2],@b[1] + umulh @acc[1],@a[2],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[1],@b[2] + umulh @acc[3],@a[1],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[0],@b[3] + umulh @acc[1],@a[0],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[4],@b[0] + umulh @acc[3],@a[4],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#24] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[3],@b[1] + umulh @acc[1],@a[3],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[2],@b[2] + umulh @acc[3],@a[2],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[3] + umulh @acc[1],@a[1],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[4] + umulh @acc[3],@a[0],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[0] + umulh @acc[1],@a[5],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#32] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[1] + umulh @acc[3],@a[4],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[2] + umulh @acc[1],@a[3],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs 
@comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[3] + umulh @acc[3],@a[2],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[4] + umulh @acc[1],@a[1],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[5] + umulh @acc[3],@a[0],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[1] + umulh @acc[1],@a[5],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#40] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[2] + umulh @acc[3],@a[4],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[3] + umulh @acc[1],@a[3],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[4] + umulh @acc[3],@a[2],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[5] + umulh @acc[1],@a[1],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[2] + umulh @acc[3],@a[5],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#48] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[3] + umulh @acc[1],@a[4],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[4] + umulh @acc[3],@a[3],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[2],@b[5] + umulh @acc[1],@a[2],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[3] + umulh @acc[3],@a[5],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#56] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[4] + umulh @acc[1],@a[4],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[5] + umulh @acc[3],@a[3],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[4] + umulh @acc[1],@a[5],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#64] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[5] + umulh @acc[3],@a[4],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[5],@b[5] + umulh @acc[1],@a[5],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#72] +___ + push(@comba,shift(@comba)); +$code.=<<___; + adds @comba[0],@comba[0],@acc[0] + adc @comba[1],@comba[1],@acc[1] + stp @comba[0],@comba[1],[$r_ptr,#80] + + ret +.size __mul_384_comba,.-__mul_384_comba +___ +} +print $code; + +close STDOUT; diff --git a/blst/asm/mulq_mont_256-x86_64.pl b/blst/asm/mulq_mont_256-x86_64.pl new file mode 100755 index 0000000..12e58bb --- /dev/null +++ 
b/blst/asm/mulq_mont_256-x86_64.pl @@ -0,0 +1,513 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits +my @acc=map("%r$_",(9..15)); + +{ ############################################################## mulq +my ($hi, $a0) = ("%rbp", $r_ptr); + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,\@function,5,"unwind" +.align 32 +mul_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), @acc[3] + mov 8*3($a_ptr), $hi + mov $b_org, $b_ptr # evacuate from %rdx + + mov %rax, @acc[6] + mulq @acc[4] # a[0]*b[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), %rax + mov $n_ptr, $n0 + mov 8*1($a_ptr), @acc[5] + mov $b_org, $n_ptr + mov 8*2($a_ptr), @acc[3] + lea ($a_ptr), $b_ptr + mov 8*3($a_ptr), $hi + + mov %rax, @acc[6] + mulq %rax # a[0]*a[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulq_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulq_mont_sparse_256: + mulq @acc[5] # a[1]*b[0] + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[3] # a[2]*b[0] + add %rax, @acc[2] + mov 
@acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq $hi # a[3]*b[0] + add %rax, @acc[3] + mov 8($b_ptr), %rax + adc \$0, %rdx + xor @acc[5], @acc[5] + mov %rdx, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], $a0 + imulq $n0, @acc[0] + + ################################# Multiply by b[$i] + mov %rax, @acc[6] + mulq 8*0($a_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*1($a_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($a_ptr) + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($a_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc %rdx, @acc[5] # can't overflow + xor @acc[6], @acc[6] + + ################################# reduction + mulq 8*0($n_ptr) + add %rax, $a0 # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $a0 + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $a0, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rax + mov 8(%rsp), $a_ptr # restore $r_ptr + + ################################# last reduction + mov %rax, @acc[6] + mulq 8*0($n_ptr) + add %rax, @acc[0] # guaranteed to be zero + mov @acc[6], %rax + adc %rdx, @acc[0] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + add @acc[0], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + mov @acc[2], $b_ptr + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional subtraction of modulus + + mov @acc[3], @acc[0] + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rax, @acc[1] + cmovc $b_ptr, @acc[2] + cmovc @acc[0], @acc[3] + mov @acc[1], 8*0($a_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($a_ptr) + mov @acc[3], 8*2($a_ptr) + mov @acc[4], 8*3($a_ptr) + + ret +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,\@function,4,"unwind" +.align 32 +from_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], @acc[1] + mov @acc[6], @acc[2] + mov @acc[0], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + sbb 8*3($n_ptr), @acc[0] + + cmovnc 
@acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,\@function,4,"unwind" +.align 32 +redc_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[6] + mov @acc[5], @acc[1] + adc 8*7($a_ptr), @acc[0] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[6], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + mov @acc[0], @acc[3] + sbb 8*3($n_ptr), @acc[0] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulq_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if ($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/blst/asm/mulq_mont_384-x86_64.pl b/blst/asm/mulq_mont_384-x86_64.pl new file mode 100755 index 0000000..3812319 --- /dev/null +++ b/blst/asm/mulq_mont_384-x86_64.pl @@ -0,0 +1,2675 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, 
$b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__sub_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... 
+{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,\@function,5,"unwind" +.align 32 +mul_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulq_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 48($a_ptr), $a_ptr # a->im + lea 40+96(%rsp), $r_ptr # t1 + call __mulq_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea -48($a_ptr), $b_org + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*2(%rsp), $a_ptr + lea 48($a_ptr), $b_org + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulq_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2=t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2=t2-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +$code.=<<___; +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,\@function,4,"unwind" +.align 32 +sqr_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $a_ptr, 8*2(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, 
a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + call __mulq_mont_384 +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($r_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,\@function,4,"unwind" +.align 32 +mul_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), 
@acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulq_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulq_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulq_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_382x,.-mul_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,\@function,3,"unwind" +.align 32 +sqr_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulq_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulq_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 
8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_382x,.-sqr_382x +___ +} +{ ########################################################## 384-bit mul +my @acc=map("%r$_",("cx",8..12)); +my $bi = "%rbp"; + +$code.=<<___; +.globl mul_384 +.hidden mul_384 +.type mul_384,\@function,3,"unwind" +.align 32 +mul_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org, $b_ptr + call __mulq_384 + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,\@abi-omnipotent +.align 32 +__mulq_384: + mov 8*0($b_ptr), %rax + + mov %rax, $bi + mulq 8*0($a_ptr) + mov %rax, 8*0($r_ptr) + mov $bi, %rax + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[4] + mov 8*1($b_ptr), %rax + adc \$0, %rdx + mov %rdx, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov @acc[0], 8*$i($r_ptr) + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[1], @acc[0] + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[2], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[3], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[4], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[5], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulq_384,.-__mulq_384 +___ +} +if (0) { ############################################################## +my @b=map("%r$_",(10..15)); +my @a=reverse(@b); + @b[5]=$b_ptr; +my $bi = "%rbp"; +my @comba=map("%r$_",("cx",8,9)); +# a[0]*b[0] +# a[1]*b[0] +# a[0]*b[1] +# a[2]*b[0] +# a[1]*b[1] +# a[0]*b[2] +# a[3]*b[0] +# a[2]*b[1] +# a[1]*b[2] +# a[0]*b[3] +# a[4]*b[0] +# a[3]*b[1] +# a[2]*b[2] +# a[1]*b[3] +# a[0]*b[4] +# a[5]*b[0] +# a[4]*b[1] +# a[3]*b[2] +# a[2]*b[3] +# a[1]*b[4] +# a[0]*b[5] +# a[5]*b[1] +# a[4]*b[2] +# a[3]*b[3] +# a[2]*b[4] +# a[1]*b[5] +# a[5]*b[2] +# a[4]*b[3] +# a[3]*b[4] +# a[2]*b[5] +# a[5]*b[3] +# a[4]*b[4] +# a[3]*b[5] +# a[5]*b[4] +# a[4]*b[5] +# a[5]*b[5] +# +# 13% less instructions give +15% on Core2, +10% on Goldmont, +# -0% on Sandy Bridge, but -16% on Haswell:-( +# [for reference +5% on Skylake, +11% on Ryzen] + +$code.=<<___; +.type __mulq_comba_384,\@abi-omnipotent +.align 32 +__mulq_comba_384: + mov 8*0($b_ptr), %rax + mov 8*0($a_ptr), @a[0] + mov 8*1($a_ptr), @a[1] + mov 8*1($b_ptr), @b[1] + + mov %rax, @b[0] + mulq @a[0] # a[0]*b[0] + mov %rax, 8*0($r_ptr) + mov @b[0], %rax + mov %rdx, @comba[0] + + ################################# + mov 8*2($a_ptr), @a[2] + xor @comba[2], @comba[2] + mulq @a[1] # a[1]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc \$0, %rdx + mov 8*2($b_ptr), @b[2] + mov %rdx, @comba[1] + + mulq @a[0] # a[0]*b[1] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*1($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[2] # a[2]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[0] # a[0]*b[2] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*2($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*3($a_ptr) # a[3]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[2] # a[2]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[2] + add %rax, @comba[0] + mov 8*3($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[3] + mulq @a[0] # a[0]*b[3] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 
8*3($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*4($a_ptr) # a[4]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[3] + add %rax, @comba[0] + mov 8*4($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[4] + mulq @a[0] # a[0]*b[4] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + mov 8*5($a_ptr), @a[5] + adc \$0, @comba[2] + mov @comba[0], 8*4($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*4($a_ptr) # a[4]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[4] + add %rax, @comba[0] + mov 8*5($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[5] + mulq @a[0] # a[0]*b[5] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + mov 8*4($a_ptr), @a[4] + adc \$0, @comba[2] + mov @comba[0], 8*5($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[5] + add %rax, @comba[0] + mov $b[2], %rax + adc %rdx, @comba[1] + mov 8*3($a_ptr), @a[3] + adc \$0, @comba[2] + mov @comba[0], 8*6($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[5] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*7($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[5] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*8($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[5] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov 
@comba[0], 8*9($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + adc %rdx, @comba[1] + + mov @comba[0], 8*10($r_ptr) + mov @comba[1], 8*11($r_ptr) + + ret +.size __mulq_comba_384,.-__mulq_comba_384 +___ +} +{ ########################################################## 384-bit sqr +my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); +my $hi; + +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,\@function,2,"unwind" +.align 32 +sqr_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrq_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,\@abi-omnipotent +.align 32 +__sqrq_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + + ######################################### + mov %rax, @acc[6] + mulq @acc[7] # a[1]*a[0] + mov %rax, @acc[1] + mov @acc[6], %rax + mov 8*4($a_ptr), @acc[10] + mov %rdx, @acc[2] + + mulq @acc[8] # a[2]*a[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov 8*5($a_ptr), @acc[11] + mov %rdx, @acc[3] + + mulq @acc[9] # a[3]*a[0] + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq @acc[10] # a[4]*a[0] + add %rax, @acc[4] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq @acc[11] # a[5]*a[0] + add %rax, @acc[5] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq %rax # a[0]*a[0] + xor @acc[0], @acc[0] + mov %rax, 8*0($r_ptr) + mov @acc[7], %rax + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[0] + add %rdx, @acc[1] # accumulate a[0]*a[0] + adc \$0, @acc[0] # carries to a[1]*a[1] + mov @acc[1], 8*1($r_ptr) +___ +$hi=@acc[1]; +$code.=<<___; + ######################################### + mulq @acc[8] # a[2]*a[1] + add %rax, @acc[3] + mov @acc[7], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[9] # a[3]*a[1] + add %rax, @acc[4] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[1] + add %rax, @acc[5] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[1] + add %rax, @acc[6] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq %rax # a[1]*a[1] + xor @acc[1], @acc[1] + add %rax, @acc[0] # can't carry + mov @acc[8], %rax + add @acc[2], @acc[2] # double acc[2:3] + adc @acc[3], @acc[3] + adc \$0, @acc[1] + add @acc[0], @acc[2] # accumulate a[1]*a[1] + adc %rdx, @acc[3] + adc \$0, @acc[1] # carries to a[2]*a[2] + mov @acc[2], 8*2($r_ptr) +___ +$hi=@acc[0]; +$code.=<<___; + ######################################### + mulq @acc[9] # a[3]*a[2] + add %rax, @acc[5] + mov @acc[8], %rax + adc \$0, %rdx + mov @acc[3], 8*3($r_ptr) + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[2] + add %rax, @acc[6] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[2] + add %rax, @acc[7] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[7] + adc \$0, %rdx + mov %rdx, 
@acc[8] + + mulq %rax # a[2]*a[2] + xor @acc[3], @acc[3] + add %rax, @acc[1] # can't carry + mov @acc[9], %rax + add @acc[4], @acc[4] # double acc[4:5] + adc @acc[5], @acc[5] + adc \$0, @acc[3] + add @acc[1], @acc[4] # accumulate a[2]*a[2] + adc %rdx, @acc[5] + adc \$0, @acc[3] # carries to a[3]*a[3] + mov @acc[4], 8*4($r_ptr) + + ######################################### + mulq @acc[10] # a[4]*a[3] + add %rax, @acc[7] + mov @acc[9], %rax + adc \$0, %rdx + mov @acc[5], 8*5($r_ptr) + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[3] + add %rax, @acc[8] + mov @acc[9], %rax + adc \$0, %rdx + add $hi, @acc[8] + adc \$0, %rdx + mov %rdx, @acc[9] + + mulq %rax # a[3]*a[3] + xor @acc[4], @acc[4] + add %rax, @acc[3] # can't carry + mov @acc[10], %rax + add @acc[6], @acc[6] # double acc[6:7] + adc @acc[7], @acc[7] + adc \$0, @acc[4] + add @acc[3], @acc[6] # accumulate a[3]*a[3] + adc %rdx, @acc[7] + mov @acc[6], 8*6($r_ptr) + adc \$0, @acc[4] # carries to a[4]*a[4] + mov @acc[7], 8*7($r_ptr) + + ######################################### + mulq @acc[11] # a[5]*a[4] + add %rax, @acc[9] + mov @acc[10], %rax + adc \$0, %rdx + mov %rdx, @acc[10] + + mulq %rax # a[4]*a[4] + xor @acc[5], @acc[5] + add %rax, @acc[4] # can't carry + mov @acc[11], %rax + add @acc[8], @acc[8] # double acc[8:9] + adc @acc[9], @acc[9] + adc \$0, @acc[5] + add @acc[4], @acc[8] # accumulate a[4]*a[4] + adc %rdx, @acc[9] + mov @acc[8], 8*8($r_ptr) + adc \$0, @acc[5] # carries to a[5]*a[5] + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulq %rax # a[5]*a[5] + add @acc[5], %rax # can't carry + add @acc[10], @acc[10] # double acc[10] + adc \$0, %rdx + add @acc[10], %rax # accumulate a[5]*a[5] + adc \$0, %rdx + mov %rax, 8*10($r_ptr) + mov %rdx, 8*11($r_ptr) + + ret +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,\@function,4,"unwind" +.align 32 +sqr_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*15, %rsp +.cfi_adjust_cfa_offset 8*15 +.cfi_end_prologue + + mov $n_ptr, 8*12(%rsp) # n0 + mov $b_org, 8*13(%rsp) # n_ptr + mov $r_ptr, 8*14(%rsp) + + mov %rsp, $r_ptr + call __sqrq_384 + + lea 0(%rsp), $a_ptr + mov 8*12(%rsp), %rcx # n0 for mul_by_1 + mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 + mov 8*14(%rsp), $r_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea 8*15(%rsp), %r8 # size optimization + mov 8*15(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*21 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 +___ +} +{ ########################################################## 384-bit redc_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +######################################################################## +# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,\@function,4,"unwind" +.align 32 +redc_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp 
+.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + +######################################################################## +# void from_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,\@function,4,"unwind" +.align 32 +from_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[6], %rax # __mulq_by_1_mont_384 does it + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_384,.-from_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[6] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<6; $i++) { +my $hi = @acc[6]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[6] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[6] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx +___ +$code.=<<___ if ($i<5); + mov @acc[1], @acc[7] + imulq $n0, @acc[1] +___ +$code.=<<___; + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + 
+.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, 
$r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ +} } + +{ ########################################################## mulq_mont +my ($bi, $hi) = ("%rdi", "%rbp"); + +$code.=<<___; +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,\@function,5,"unwind" +.align 32 +mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*3, %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + mov $b_org, $b_ptr # evacuate from %rdx + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + + call __mulq_mont_384 + + mov 24(%rsp),%r15 +.cfi_restore %r15 + mov 32(%rsp),%r14 +.cfi_restore %r14 + mov 40(%rsp),%r13 +.cfi_restore %r13 + mov 48(%rsp),%r12 +.cfi_restore %r12 + mov 56(%rsp),%rbx +.cfi_restore %rbx + mov 64(%rsp),%rbp +.cfi_restore %rbp + lea 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_mont_384,\@abi-omnipotent +.align 32 +__mulq_mont_384: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + xor @acc[7], @acc[7] + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, $hi # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $hi + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[5] + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($a_ptr) + add @acc[7], @acc[5] + adc \$0, %rdx + xor @acc[7], @acc[7] + add %rax, @acc[5] + mov @acc[0], %rax + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ +} +$code.=<<___; + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[0], %rax + mov 8*2(%rsp), $r_ptr # restore $r_ptr + sub 8*0($n_ptr), @acc[0] + mov @acc[1], %rdx + sbb 8*1($n_ptr), @acc[1] + mov @acc[2], $b_ptr + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*3($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[7] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rdx, @acc[1] + cmovc $b_ptr, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[7], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __mulq_mont_384,.-__mulq_mont_384 +___ +} } +$code.=<<___; +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_384: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1, %edx + lea 0($r_ptr), $a_ptr + dec %edx + jnz .Loop_sqr_384 + + movq %xmm2, %rax # 
b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #mov 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 + + lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_383: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + + movd %xmm1, %edx # loop counter + add 8*6($a_ptr), @acc[6] # just accumulate upper half + adc 8*7($a_ptr), @acc[7] + adc 8*8($a_ptr), @acc[0] + adc 8*9($a_ptr), @acc[1] + adc 8*10($a_ptr), @acc[2] + adc 8*11($a_ptr), @acc[3] + lea 0($r_ptr), $a_ptr + + mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% + mov @acc[7], 8*1($r_ptr) # in addition-chains + mov @acc[0], 8*2($r_ptr) + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + dec %edx + jnz .Loop_sqr_383 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #movq 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 # formally one can omit full reduction + # even after multiplication... 
+ lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + my $bi = "%rbp"; + +$code.=<<___; +.type __mulq_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulq_mont_383_nonred: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[7] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[7] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*3($n_ptr) + add @acc[7], @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[7], @acc[5] + adc %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[6] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*5($a_ptr) + add @acc[6], @acc[5] + adc \$0, %rdx + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +} +$code.=<<___; + ret +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives 8-11% better performance in add-chains +$code.=<<___; +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,\@function,4,"unwind" +.align 32 +sqr_mont_382x: +.cfi_startproc + push 
%rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + mov 8*3(%rsp), $r_ptr + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($r_ptr) # ret->im + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + mov 32+8*0(%rsp), @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[6] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[7] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[8] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), 
@acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/mulx_mont_256-x86_64.pl b/blst/asm/mulx_mont_256-x86_64.pl new file mode 100755 index 0000000..0d6bf2e --- /dev/null +++ b/blst/asm/mulx_mont_256-x86_64.pl @@ -0,0 +1,486 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# "Sparse" in subroutine names refers to most significant limb of the +# modulus. Though "sparse" is a bit of misnomer, because limitation is +# just not-all-ones. Or in other words not larger than 2^256-2^192-1. +# In general Montgomery multiplication algorithm can handle one of the +# inputs being non-reduced and capped by 1<<radix_width, 1<<256 in this +# case, rather than the modulus. Whether or not mul_mont_sparse_256, a +# *taylored* implementation of the algorithm, can handle such input can +# be circumstantial. For example, in most general case it depends on +# similar "bit sparsity" of individual limbs of the second, fully reduced +# multiplicand. If you can't make such assumption about the limbs, then +# non-reduced value shouldn't be larger than "same old" 2^256-2^192-1. +# This requirement can be met by conditionally subtracting "bitwise +# left-aligned" modulus. For example, if modulus is 200 bits wide, you +# would need to conditionally subtract the value of modulus<<56. Common +# source of non-reduced values is redc_mont_256 treating 512-bit inputs. +# Well, more specifically ones with upper half not smaller than modulus. +# Just in case, why limitation at all and not general-purpose 256-bit +# subroutines? Unlike the 384-bit case, accounting for additional carry +# has disproportionate impact on performance, especially in adcx/adox +# implementation. 
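To make the "bitwise left-aligned" conditional subtraction described in the comment above concrete, here is a minimal C sketch under stated assumptions: 64-bit limbs, unsigned __int128 for borrow tracking, and a mod_shifted argument that the caller has pre-shifted so the modulus' most significant bit sits at bit 255 (modulus<<56 in the 200-bit example). The helper name and signature are illustrative, not part of the library.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Illustrative sketch: conditionally subtract the left-aligned modulus from
 * a possibly non-reduced 256-bit value x.  One subtraction suffices because
 * mod_shifted >= 2^255 and x < 2^256 <= 2*mod_shifted.                     */
static void cap_non_reduced_256(uint64_t x[4], const uint64_t mod_shifted[4])
{
    uint64_t r[4];
    uint64_t borrow = 0;

    for (int i = 0; i < 4; i++) {
        u128 d = (u128)x[i] - mod_shifted[i] - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;        /* 1 if this limb borrowed */
    }
    /* keep the difference only if there was no final borrow,
     * i.e. x >= mod_shifted                                                */
    for (int i = 0; i < 4; i++)
        x[i] = borrow ? x[i] : r[i];
}

After the conditional subtraction the value sits below the left-aligned modulus, which is the bound the comment above relies on when feeding non-reduced values (e.g. outputs of redc_mont_256) back into the sparse multiplication.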
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 255 bits +my @acc=map("%r$_",(10..15)); + +{ ############################################################## mulq +my ($lo,$hi)=("%rbp","%r9"); + +$code.=<<___; +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,\@function,5,"unwind" +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), $lo + mov 8*3($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[4], %rax, @acc[1] # a[0]*b[0] + call __mulx_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $b_ptr + mov $n_ptr, $n0 + mov $b_org, $n_ptr + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), $lo + mov 8*3($a_ptr), $hi + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, %rax, @acc[1] # a[0]*a[0] + call __mulx_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulx_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulx_mont_sparse_256: + mulx @acc[5], @acc[5], @acc[2] + mulx $lo, $lo, @acc[3] + add @acc[5], @acc[1] + mulx $hi, $hi, @acc[4] + mov 8($b_ptr), %rdx + adc $lo, @acc[2] + adc $hi, @acc[3] + adc \$0, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : "%rax"; +my $a5 = $i==1 ? 
@acc[5] : $lo; +$code.=<<___; + mov %rax, @acc[0] + imulq $n0, %rax + + ################################# Multiply by b[$i] + xor $a5, $a5 # [@acc[5]=0,] cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + mov %rax, %rdx + adox $lo, @acc[4] + adcx @acc[5], $hi # cf=0 + adox $hi, @acc[5] # of=0 + + ################################# reduction + mulx 8*0+128($n_ptr), $lo, %rax + adcx $lo, @acc[0] # guaranteed to be zero + adox @acc[1], %rax + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, %rax # @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[3] + adox $hi, @acc[4] + adcx @acc[0], @acc[4] + adox @acc[0], @acc[5] + adcx @acc[0], @acc[5] + adox @acc[0], @acc[0] # acc[5] in next iteration + adc \$0, @acc[0] # cf=0, of=0 +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rdx + + ################################# last reduction + xor $lo, $lo # cf=0, of=0 + mulx 8*0+128($n_ptr), @acc[0], $hi + adcx %rax, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + lea 128($n_ptr), $n_ptr + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], %rax + adcx @acc[0], @acc[4] + adox @acc[0], @acc[5] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional acc[1:5] - modulus + + mov @acc[3], $lo + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rdx, @acc[1] + cmovc %rax, @acc[2] + cmovc $lo, @acc[3] + mov @acc[1], 8*0($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($r_ptr) + mov @acc[3], 8*2($r_ptr) + mov @acc[4], 8*3($r_ptr) + + ret +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,\@function,4,"unwind" +.align 32 +fromx_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], %rdx + mov @acc[0], @acc[2] + mov @acc[1], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + + cmovnc @acc[4], %rax + cmovnc @acc[5], %rdx + cmovnc @acc[0], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[1], @acc[3] + mov %rdx, 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size 
fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,\@function,4,"unwind" +.align 32 +redcx_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[0] + mov @acc[5], %rdx + adc 8*7($a_ptr), @acc[1] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[0], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[0] + mov @acc[1], @acc[3] + sbb 8*3($n_ptr), @acc[1] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], %rdx + cmovnc @acc[0], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[1], @acc[3] + mov %rdx, 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulx_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if ($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/blst/asm/mulx_mont_384-x86_64.pl b/blst/asm/mulx_mont_384-x86_64.pl new file mode 100755 index 0000000..a762807 --- /dev/null +++ b/blst/asm/mulx_mont_384-x86_64.pl @@ -0,0 +1,2384 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, 
$b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__sub_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... For single multiplication disjoint +# reduction is bad for performance for given vector length, yet overall +# it's a win here, because it's one reduction less. 
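The flow that mulx_mont_384x implements below, three plain 384-bit multiplications plus double-width add/sub and a single Montgomery reduction per output, can be summarized in C. The prototypes here are assumptions written for illustration: the names follow this file's own comments, but the exact signatures are not copied from the library.

#include <stdint.h>

typedef uint64_t vec384[6];
typedef uint64_t vec768[12];

/* Assumed prototypes for the primitives named in this file's comments;
 * signatures are illustrative, not copied from the library.             */
extern void mul_384(vec768 ret, const vec384 a, const vec384 b);
extern void add_mod_384(vec384 ret, const vec384 a, const vec384 b,
                        const vec384 p);
extern void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
                            const vec384 p);
extern void redc_mont_384(vec384 ret, const vec768 a, const vec384 p,
                          uint64_t n0);

/* Fp2 multiplication with folded reduction, mirroring mulx_mont_384x:
 * three plain multiplications, double-width subtractions, and only two
 * Montgomery reductions at the very end.                                 */
static void mul_fp2_sketch(vec384 ret_re, vec384 ret_im,
                           const vec384 a_re, const vec384 a_im,
                           const vec384 b_re, const vec384 b_im,
                           const vec384 p, uint64_t n0)
{
    vec768 t0, t1, t2;
    vec384 sa, sb;

    mul_384(t0, a_re, b_re);            /* t0 = a.re * b.re                */
    mul_384(t1, a_im, b_im);            /* t1 = a.im * b.im                */

    add_mod_384(sa, a_re, a_im, p);     /* sa = a.re + a.im                */
    add_mod_384(sb, b_re, b_im, p);     /* sb = b.re + b.im                */
    mul_384(t2, sa, sb);                /* t2 = sa * sb                    */

    sub_mod_384x384(t2, t2, t0, p);     /* im = t2 - t0 - t1 (double-width) */
    sub_mod_384x384(t2, t2, t1, p);
    sub_mod_384x384(t0, t0, t1, p);     /* re = t0 - t1      (double-width) */

    redc_mont_384(ret_re, t0, p, n0);   /* one reduction per output        */
    redc_mont_384(ret_im, t2, p, n0);
}

Compared with a straightforward version built from three mul_mont calls (one reduction each), only two reductions remain, which is the "one reduction less" noted in the comment above.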
+{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,\@function,5,"unwind" +.align 32 +mulx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulx_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 128+48($a_ptr), $a_ptr # a->im + lea 96($r_ptr), $r_ptr # t1 + call __mulx_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea ($b_ptr), $a_ptr # b->re + lea -48($b_ptr), $b_org # b->im + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulx_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2-t0-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # alignment +$code.=<<___; +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,\@function,4,"unwind" +.align 32 +sqrx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + # gap for __mulx_mont_384 + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + 
call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $a_ptr, $a_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $a_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($b_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($b_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($b_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,\@function,4,"unwind" +.align 32 +mulx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 
32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulx_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulx_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48+128($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulx_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_382x,.-mulx_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,\@function,3,"unwind" +.align 32 +sqrx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulx_384 + + 
################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulx_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +___ +} +{ ########################################################## 384-bit mulx +my ($a0, $a1) = @acc[6..7]; +my @acc = @acc[0..5]; +my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); + +$code.=<<___; +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,\@function,3,"unwind" +.align 32 +mulx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + call __mulx_384 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,\@abi-omnipotent +.align 32 +__mulx_384: + mov 8*0($b_ptr), %rdx + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea -128($a_ptr), $a_ptr + + mulx $a0, @acc[1], $hi + xor $zr, $zr + + mulx $a1, @acc[0], $lo + adcx $hi, @acc[0] + mov @acc[1], 8*0($r_ptr) + + mulx @acc[2], @acc[1], $hi + adcx $lo, @acc[1] + + mulx @acc[3], @acc[2], $lo + adcx $hi, @acc[2] + + mulx @acc[4], @acc[3], $hi + adcx $lo, @acc[3] + + mulx @acc[5], @acc[4], @acc[5] + mov 8*1($b_ptr), %rdx + adcx $hi, @acc[4] + adcx $zr, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mulx $a0, $lo, $hi + adcx @acc[0], $lo + adox $hi, @acc[1] + mov $lo, 8*$i($r_ptr) + + mulx $a1, @acc[0], $hi + adcx @acc[1], $acc[0] + adox $hi, @acc[2] + + mulx 128+8*2($a_ptr), @acc[1], $lo + adcx @acc[2], @acc[1] + adox $lo, @acc[3] + + mulx 128+8*3($a_ptr), @acc[2], $hi + adcx @acc[3], @acc[2] + adox $hi, @acc[4] + + mulx 128+8*4($a_ptr), @acc[3], $lo + adcx @acc[4], @acc[3] + adox @acc[5], $lo + + mulx 128+8*5($a_ptr), @acc[4], @acc[5] + mov $b_next, %rdx + adcx $lo, @acc[4] + adox $zr, @acc[5] + adcx $zr, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulx_384,.-__mulx_384 +___ +} +{ ########################################################## 384-bit sqrx +$code.=<<___; +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,\@function,2,"unwind" +.align 32 +sqrx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrx_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_384,.-sqrx_384 +___ +if (0) { +# up to 5% slower than below variant +my @acc=map("%r$_",("no",8..15,"cx","bx")); + push(@acc, $a_ptr); +my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + adc $hi, @acc[5] + adc \$0, @acc[6] + + mulx %rdx, $lo, $hi # a[0]*a[0] + mov @acc[7], %rdx + xor @acc[7], @acc[7] + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[7] + add $hi, @acc[1] + adc \$0, @acc[7] + mov $lo, 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) +___ +($carry, @acc[7]) = (@acc[7], @acc[1]); +$code.=<<___; + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + mulx %rdx, $lo, $hi # a[1]*a[1] + mov @acc[8], %rdx + xor @acc[8], @acc[8] + adox @acc[2], @acc[2] # double acc[2:3] + adcx $carry, $lo # can't carry + adox @acc[3], @acc[3] + adcx $lo, @acc[2] + adox @acc[8], @acc[8] + adcx $hi, @acc[3] + adc \$0, @acc[8] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) +___ +($carry,@acc[8])=(@acc[8],$carry); +$code.=<<___; + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, 
@acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + mulx %rdx, $lo, $hi # a[2]*a[2] + mov @acc[9], %rdx + xor @acc[9], @acc[9] + adox @acc[4], @acc[4] # double acc[4:5] + adcx $carry, $lo # can't carry + adox @acc[5], @acc[5] + adcx $lo, @acc[4] + adox @acc[9], @acc[9] + adcx $hi, @acc[5] + adc \$0, $acc[9] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +($carry,@acc[9])=(@acc[9],$carry); +$code.=<<___; + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + mulx %rdx, $lo, $hi + mov @acc[10], %rdx + xor @acc[10], @acc[10] + adox @acc[6], @acc[6] # double acc[6:7] + adcx $carry, $lo # can't carry + adox @acc[7], @acc[7] + adcx $lo, @acc[6] + adox @acc[10], @acc[10] + adcx $hi, @acc[7] + adc \$0, $acc[10] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) +___ +($carry,@acc[10])=(@acc[10],$carry); +$code.=<<___; + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + add $lo, @acc[9] + adc \$0, @acc[10] + + mulx %rdx, $lo, $hi # a[4]*a[4] + mov @acc[11], %rdx + xor @acc[11], @acc[11] + adox @acc[8], @acc[8] # double acc[8:10] + adcx $carry, $lo # can't carry + adox @acc[9], @acc[9] + adcx $lo, @acc[8] + adox @acc[10], @acc[10] + adcx $hi, @acc[9] + adox @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulx %rdx, $lo, $hi # a[5]*a[5] + adcx $lo, @acc[10] + adcx $hi, @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} else { +my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); +my ($lo, $hi)=($r_ptr, "%rax"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + mov @acc[7], %rdx + adc $hi, @acc[5] + adc \$0, @acc[6] + + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + mov @acc[8], %rdx + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + mov @acc[9], %rdx + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + mov @acc[10], %rdx + adcx $lo, @acc[8] + 
adox @acc[9], $hi + adcx $hi, @acc[9] + + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + mov 8*0($a_ptr), %rdx + add $lo, @acc[9] + mov 8(%rsp), $r_ptr # restore $r_ptr + adc \$0, @acc[10] + + ######################################### double acc[1:10] + xor @acc[11], @acc[11] + adcx @acc[1], @acc[1] + adcx @acc[2], @acc[2] + adcx @acc[3], @acc[3] + adcx @acc[4], @acc[4] + adcx @acc[5], @acc[5] + + ######################################### accumulate a[i]*a[i] + mulx %rdx, %rdx, $hi # a[0]*a[0] + mov %rdx, 8*0($r_ptr) + mov 8*1($a_ptr), %rdx + adox $hi, @acc[1] + mov @acc[1], 8*1($r_ptr) + + mulx %rdx, @acc[1], $hi # a[1]*a[1] + mov 8*2($a_ptr), %rdx + adox @acc[1], @acc[2] + adox $hi, @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] + mov 8*3($a_ptr), %rdx + adox @acc[1], @acc[4] + adox @acc[2], @acc[5] + adcx @acc[6], @acc[6] + adcx @acc[7], @acc[7] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] + mov 8*4($a_ptr), %rdx + adox @acc[1], @acc[6] + adox @acc[2], @acc[7] + adcx @acc[8], @acc[8] + adcx @acc[9], @acc[9] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] + mov 8*5($a_ptr), %rdx + adox @acc[1], @acc[8] + adox @acc[2], @acc[9] + adcx @acc[10], @acc[10] + adcx @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] + adox @acc[1], @acc[10] + adox @acc[2], @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} + +{ ########################################################## 384-bit redcx_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" +my ($lo, $hi) = ("%rax", "%rbp"); + +$code.=<<___; +######################################################################## +# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,\@function,4,"unwind" +.align 32 +redcx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + +######################################################################## +# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,\@function,4,"unwind" +.align 32 +fromx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[6], %rax + mov @acc[7], %rcx + mov @acc[0], %rdx + 
mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_384: + mov 8*0($a_ptr), @acc[0] + mov $n0, %rdx + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] +___ +for (my $i=0; $i<6; $i++) { +$code.=<<___; + imulq @acc[0], %rdx + + ################################# reduction $i + xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 + mulx 8*0($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5($n_ptr), $lo, $hi + mov $n0, %rdx + adcx $lo, @acc[5] + adox @acc[6], $hi + adcx $hi, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + 
adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +___ +} } + +{ ########################################################## mulx/sqrx_mont +my @acc = (@acc, "%rax"); +my ($lo,$hi)=("%rdi","%rbp"); + +$code.=<<___; +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,\@function,5,"unwind" +.align 32 +mulx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + mov $n0, (%rsp) + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_384,\@abi-omnipotent +.align 32 +__mulx_mont_384: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] + xor @acc[7], @acc[7] + +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], 16(%rsp) + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx $hi, @acc[7] # cf=0 + adox @acc[8], @acc[7] + adox @acc[8], @acc[8] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx 16(%rsp), $lo # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[0], @acc[6] + adox @acc[0], @acc[7] + adcx @acc[0], @acc[7] + adox @acc[0], @acc[8] + adcx @acc[0], @acc[8] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], @acc[0] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + mov @acc[3], $a_ptr + + mulx 8*5+128($n_ptr), $lo, $hi + adcx $lo, @acc[5] + adox $hi, @acc[6] + mov @acc[1], %rdx + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + lea 128($n_ptr), $n_ptr + mov @acc[4], @acc[8] + adc \$0, @acc[7] + + ################################# + # Branch-less conditional acc[1:7] - modulus + + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + mov @acc[5], $lo + sbb 8*2($n_ptr), @acc[3] + sbb 8*3($n_ptr), @acc[4] + sbb 8*4($n_ptr), @acc[5] + mov @acc[6], $hi + sbb 8*5($n_ptr), @acc[6] + sbb \$0, @acc[7] + + cmovnc @acc[1], %rdx + cmovc @acc[0], @acc[2] + cmovc $a_ptr, @acc[3] + cmovnc @acc[4], @acc[8] + mov %rdx, 8*0($b_ptr) + cmovnc @acc[5], $lo + mov @acc[2], 8*1($b_ptr) + cmovnc @acc[6], $hi + mov @acc[3], 8*2($b_ptr) + mov @acc[8], 8*3($b_ptr) + mov $lo, 8*4($b_ptr) + mov $hi, 8*5($b_ptr) + + ret +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +___ +} +$code.=<<___; +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,\@function,4,"unwind" +.align 32 +sqrx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $n_ptr, $n0 # n0 + lea -128($b_org), $n_ptr # control u-op density + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + 
mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + lea ($a_ptr), $b_ptr + mov $n0, (%rsp) # n0 + lea -128($a_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 # as fast as dedicated squaring + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + +.Loop_sqrx_384: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 + + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_384 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + lea -128($n_ptr), $n_ptr # control u-op density + +.Loop_sqrx_383: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_383_nonred # omitting full reduction gives ~15% + # in addition-chains + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_383 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + mulx @acc[6], 
@acc[0], @acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], @acc[8] + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx @acc[7], $hi + adox $hi, @acc[7] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[8] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + adcx @acc[8], @acc[7] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adc \$0, @acc[6] + mov @acc[4], @acc[8] + + mov @acc[1], 8*0($b_ptr) + mov @acc[2], 8*1($b_ptr) + mov @acc[3], 8*2($b_ptr) + mov @acc[5], $lo + mov @acc[4], 8*3($b_ptr) + mov @acc[5], 8*4($b_ptr) + mov @acc[6], 8*5($b_ptr) + mov @acc[6], $hi + + ret +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +___ +} } } +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions 
gives ~10% better performance in add-chains +$code.=<<___; +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,\@function,4,"unwind" +.align 32 +sqrx_mont_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($b_ptr) # ret->im + mov @acc[1], 8*7($b_ptr) + mov @acc[2], 8*8($b_ptr) + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32-128(%rsp), $a_ptr # t0 [+u-op density] + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + #lea -128($a_ptr), $a_ptr # control u-op density + #lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + lea 128($n_ptr), $n_ptr + mov 32+8*0(%rsp), @acc[6] + and @acc[11], @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[7] + mov 
32+8*2(%rsp), @acc[8] + and @acc[11], @acc[8] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[9] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($b_ptr) # ret->re + mov @acc[1], 8*1($b_ptr) + mov @acc[2], 8*2($b_ptr) + mov @acc[3], 8*3($b_ptr) + mov @acc[4], 8*4($b_ptr) + mov @acc[5], 8*5($b_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/sha256-armv8.pl b/blst/asm/sha256-armv8.pl new file mode 100755 index 0000000..1de27c7 --- /dev/null +++ b/blst/asm/sha256-armv8.pl @@ -0,0 +1,541 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for ARMv8. +# +# This module is stripped of scalar code paths, with raionale that all +# known processors are NEON-capable. +# +# See original module at CRYPTOGAMS for further details. 
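+#
+# For reference, the round these code paths implement is the standard
+# FIPS 180-4 SHA-256 compression step (a..h, K[i], W[i] in the usual
+# notation; the rotation counts are the @Sigma0/@Sigma1/@sigma0/@sigma1
+# constants defined below):
+#
+#	Sigma0(a) = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
+#	Sigma1(e) = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
+#	Ch(e,f,g) = (e & f) ^ (~e & g)
+#	Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
+#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
+#	T2 = Sigma0(a) + Maj(a,b,c)
+#	(a,b,c,d,e,f,g,h) = (T1+T2, a, b, c, d+T1, e, f, g)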
+ +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$BITS=256; +$SZ=4; +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; +$reg_t="w"; +$pre="blst_"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +$code.=<<___; +.text + +.align 6 +.type .LK$BITS,%object +.LK$BITS: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.globl ${pre}sha256_block_armv8 +.type ${pre}sha256_block_armv8,%function +.align 6 +${pre}sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. 
sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) 
{ eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +.globl ${pre}sha256_block_data_order +.type ${pre}sha256_block_data_order,%function +.align 4 +${pre}sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
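+	// the 16*4 bytes reserved just below hold a 16-entry ring of pre-added
+	// W[i]+K[i] words: the NEON schedule code stores them through $Xfer,
+	// and the scalar rounds reload them from the stack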
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order +___ +} + +{ +my ($out,$inp,$len) = map("x$_",(0..2)); + +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,%function +.align 4 +${pre}sha256_emit: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[$out,#4] + lsr x4,x4,#32 + str w5,[$out,#12] + lsr x5,x5,#32 + str w6,[$out,#20] + lsr x6,x6,#32 + str w7,[$out,#28] + lsr x7,x7,#32 + str w4,[$out,#0] + str w5,[$out,#8] + str w6,[$out,#16] + str w7,[$out,#24] + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,%function +.align 4 +${pre}sha256_bcopy: +.Loop_bcopy: + ldrb w3,[$inp],#1 + sub $len,$len,#1 + strb w3,[$out],#1 + cbnz $len,.Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,%function +.align 4 +${pre}sha256_hcopy: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] + stp x4,x5,[$out] + stp x6,x7,[$out,#16] + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close 
SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/blst/asm/sha256-portable-x86_64.pl b/blst/asm/sha256-portable-x86_64.pl new file mode 100755 index 0000000..eca0564 --- /dev/null +++ b/blst/asm/sha256-portable-x86_64.pl @@ -0,0 +1,337 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# Scalar-only version with minor twist minimizing 'lea' instructions. + +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add `$SZ*$i`($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 +___ +$code.=<<___ if ($i==31); + lea `16*$SZ`($Tbl),$Tbl # round+=16 +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + + ror 
\$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.globl $func +.type $func,\@function,3,"unwind" +.align 16 +$func: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg +.cfi_end_prologue + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0x19,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + lea $framesz+6*8(%rsp),%r11 +.cfi_def_cfa %r11,8 + mov $framesz(%rsp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbp +.cfi_restore %rbp + mov -8(%r11),%rbx +.cfi_restore %rbx +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size $func,.-$func + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + print $_,"\n"; +} +close STDOUT; diff --git a/blst/asm/sha256-x86_64.pl b/blst/asm/sha256-x86_64.pl new file mode 100755 index 0000000..22b3763 --- /dev/null +++ b/blst/asm/sha256-x86_64.pl @@ -0,0 +1,789 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# This module is stripped of AVX and even scalar code paths, with +# raionale that +# +# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* +# processor, venerable Sandy Bridge; +# b) AVX2 incurs costly power transitions, which would be justifiable +# if AVX2 code was executing most of the time, which is not the +# case in the context; +# c) all comtemporary processors support SSSE3, so that nobody would +# actually use scalar code path anyway; +# +# See original module at CRYPTOGAMS for further details. 
+ +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +$code=<<___; +.text + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ + +###################################################################### +# SIMD code paths +# +{{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
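+# Each sha256rnds2 below performs two rounds on the state kept in the
+# ABEF/CDGH register form, taking the corresponding pre-added W+K pair
+# implicitly from %xmm0 ($Wi); the pshufd with immediate 0x0e moves the
+# upper pair of words down for the second issue. sha256msg1/sha256msg2,
+# together with the palignr/paddd that folds in the W[i-7] term, evaluate
+# the message-schedule recurrence noted in the header above.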
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl ${pre}sha256_block_data_order_shaext +.hidden ${pre}sha256_block_data_order_shaext +.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" +.align 64 +${pre}sha256_block_data_order_shaext: +.cfi_startproc +___ +$code.=<<___ if ($win64); + sub \$0x58,%rsp +.cfi_adjust_cfa_offset 0x58 + movaps %xmm6,-0x58(%r11) +.cfi_offset %xmm6,-0x60 + movaps %xmm7,-0x48(%r11) +.cfi_offset %xmm7,-0x50 + movaps %xmm8,-0x38(%r11) +.cfi_offset %xmm8,-0x40 + movaps %xmm9,-0x28(%r11) +.cfi_offset %xmm9,-0x30 + movaps %xmm10,-0x18(%r11) +.cfi_offset %xmm10,-0x20 +.cfi_end_prologue +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x100-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*16-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -0x58(%r11),%xmm6 + movaps -0x48(%r11),%xmm7 + movaps -0x38(%r11),%xmm8 + movaps -0x28(%r11),%xmm9 + movaps -0x18(%r11),%xmm10 + mov %r11,%rsp +.cfi_def_cfa %r11,8 +.cfi_epilogue +___ +$code.=<<___; + ret +.cfi_endproc +.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +{ +my $Tbl = $inp; +my $_ctx="0(%rbp)"; +my $_inp="8(%rbp)"; +my $_end="16(%rbp)"; +my $framesz=4*8+$win64*16*4+8; + +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func} +.hidden ${func} +.type ${func},\@function,3,"unwind" +.align 64 +${func}: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,0(%rsp) # save ctx, 1st arg + #mov $inp,8(%rsp) # save inp, 2nd arg + mov %rdx,16(%rsp) # save end pointer, "3rd" arg +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x20(%rsp) +.cfi_offset %xmm6,-0x78 + movaps %xmm7,0x30(%rsp) +.cfi_offset %xmm7,-0x68 + movaps %xmm8,0x40(%rsp) +.cfi_offset %xmm8,-0x58 + movaps %xmm9,0x50(%rsp) +.cfi_offset %xmm9,-0x48 +___ +$code.=<<___; + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +.cfi_end_prologue + + lea -16*$SZ(%rsp),%rsp + mov $SZ*0($ctx),$A + and \$-64,%rsp # align stack + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + mov $inp,$_inp # offload $inp + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x10($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x20($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x30($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + mov $_inp,$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + lea 16*$SZ($inp),$inp + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + xorps %xmm0, %xmm0 + lea $framesz+6*8(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0, 0x00(%rsp) # scrub the stack + movaps %xmm0, 0x10(%rsp) + movaps %xmm0, 0x20(%rsp) + movaps %xmm0, 0x30(%rsp) +___ +$code.=<<___ if ($win64); + movaps 0x20(%rbp),%xmm6 + movaps 0x30(%rbp),%xmm7 + movaps 0x40(%rbp),%xmm8 + movaps 0x50(%rbp),%xmm9 +___ +$code.=<<___; + mov $framesz(%rbp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbx +.cfi_restore %rbx + mov -8(%r11),%rbp +.cfi_restore %rbp +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size ${func},.-${func} +___ +} +}}} +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/blst/asm/x86_64-xlate.pl b/blst/asm/x86_64-xlate.pl new file mode 100755 index 0000000..62be619 --- /dev/null +++ b/blst/asm/x86_64-xlate.pl @@ -0,0 +1,1781 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. 
Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# But on the pros, it's then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. [Unless function is +# tagged with additional .type tag.] For further details see SEH +# paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $dwarf=$elf; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} + +$dwarf=0 if($win64); + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { + # pass through + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . ".byte 0xf3,0xc3"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". + "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... 
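+	    # fold numeric literals first, then constant arithmetic,
+	    # e.g. an operand written as "$0x10*4" is emitted as "$64",
+	    # so that Solaris /usr/ccs/bin/as never sees the multiplication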
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) { + $self->{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + my %sifmap = ( ss=>"d", sd=>"q", # broadcast only + i32x2=>"q", f32x2=>"q", + i32x4=>"x", i64x2=>"x", i128=>"x", + f32x4=>"x", f64x2=>"x", f128=>"x", + i32x8=>"y", i64x4=>"y", + f32x8=>"y", f64x4=>"y" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... 
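+	# pack as unsigned 32-bit and unpack as signed to reinterpret the
+	# value, e.g. 4294967288 (0xfffffff8) comes back as -8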
+ $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig; + $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) + && ($sz=$sifmap{$1}); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . 
":"; + if ($current_function->{name} eq $self->{value}) { + $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); + $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch + if ($win64 && $current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ? "%r11" : "%rax"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,$fp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + my $fp = $current_function->{unwind} ? "r11" : "rax"; + $func .= " DB 243,15,30,250\n"; # endbranch + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov $fp,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"). + "\n DB 243,15,30,250"; # endbranch + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +my @xdata_seg = (".section .xdata", ".align 8"); +my @pdata_seg = (".section .pdata", ".align 4"); + +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. 
This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + # + # In addition the .cfi directives are re-purposed even for Win64 + # stack unwinding. Two more synthetic directives were added: + # + # - .cfi_end_prologue to denote point when all non-volatile + # registers are saved and stack or [chosen] frame pointer is + # stable; + # - .cfi_epilogue to denote point when all non-volatile registers + # are restored [and it even adds missing .cfi_restore-s]; + # + # Though it's not universal "miracle cure," it has its limitations. + # Most notably .cfi_cfa_expression won't start working... For more + # information see the end of this file. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... 
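+	    # e.g. uleb128(300) takes two iterations: 300&0x7f=0x2c gets
+	    # its continuation bit set (0xac), then 300>>7=2 is emitted as
+	    # the final digit, giving (0xac, 0x02)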
+ last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + + # Following constants are defined in "x64 exception handling" at + # https://docs.microsoft.com/ and match the register sequence in + # CONTEXT structure defined in winnt.h. + my %WIN64_reg_idx = ( + "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, + "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + sub xdata { + our @dat = (); + our $len = 0; + + sub allocstack { + my $offset = shift; + + if ($offset) { + if ($offset <= 128) { + $offset = ($offset - 8) >> 3; + push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL + } elsif ($offset < 0x80000) { + push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; + } + $len += $#{@dat[-1]}+1; + } + } + + # allocate stack frame + if (my $offset = -8 - $cfa_rsp) { + # but see if frame pointer is among saved registers + if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { + $fp_off = -8 - $fp_off; + allocstack($fp_off-8); + $offset -= $fp_off; + push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL + $len += $#{@dat[-1]}+1; + } + allocstack($offset); + } + # set up frame pointer + my $fp_info = 0; + if ($cfa_reg ne "%rsp") { + my $offset = $cfa_off - $cfa_rsp; + ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; + $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; + push @dat, [0,3]; # UWOP_SET_FPREG + $len += $#{@dat[-1]}+1; + } + # save registers + foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } + keys(%saved_regs)) { + next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); + my $offset = $saved_regs{$key} - $cfa_rsp; + if ($key =~ /%xmm([0-9]+)/) { + if ($offset < 0x100000) { + push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; + } else { + push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; + } + } else { + if ($offset < 0x80000) { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, + unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, + 
unpack("C4",pack("V",$offset))]; + } + } + $len += $#{@dat[-1]}+1; + } + + my @ret; + # generate 4-byte descriptor + push @ret, ".byte 1,0,".($len/2).",$fp_info"; + $len += 4; + # pad to 8*n + unshift @dat, [(0)x((-$len)&7)] if ($len&7); + # emit data + while(defined(my $row = pop @dat)) { + push @ret, ".byte ". join(",", + map { sprintf "0x%02x",$_ } @{$row}); + } + + return @ret; + } + sub startproc { + return if ($cfa_rsp == -8); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); + %saved_regs = (); + return "startproc"; + } + sub endproc { + return if ($cfa_rsp == 0); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); + %saved_regs = (); + return "endproc"; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { $dir = startproc(); last; }; + /endproc/ && do { $dir = endproc(); + # .cfi_remember_state directives that are not + # matched with .cfi_restore_state are + # unnecessary. + die "unpaired .cfi_remember_state" if (@cfa_stack); + last; + }; + /def_cfa_register/ + && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); + $cfa_reg = $$line; + last; + }; + /def_cfa_offset/ + && do { $cfa_off = -1*eval($$line); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { my $val = 1*eval($$line); + $cfa_off -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_rsp -= $val; + } + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_off = -1*eval($2); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $saved_regs{$$line} = $cfa_rsp; + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + delete $saved_regs{$$line}; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, + [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; + last; + }; + /restore_state/ + && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) + = @{pop @cfa_stack}; + last; + }; + /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { + $saved_regs{$1} = 1*eval($2); + $dir = undef if ($1 =~ /%xmm/); + } + last; + }; + /restore/ && do { delete $saved_regs{$$line}; last; }; + /end_prologue/ + && do { $dir = undef; + $self->{win64} = ".endprolog"; + last; + }; + /epilogue/ && do { $dir = undef; + $self->{win64} = ".epilogue"; + $self->{value} = join("\n", + map { ".cfi_restore\t$_" } + sort keys(%saved_regs)); + %saved_regs = (); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return $self->{value} if ($dwarf); + + if ($win64 and $current_function->{unwind} + and my $ret = $self->{win64}) { + my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) + : ("rsp", $cfa_rsp); + my $fname = $current_function->{name}; + + if ($ret eq ".endprolog") { + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + + push @pdata_seg, + ".rva .LSEH_begin_${fname}", + ".rva .LSEH_body_${fname}", + ".rva .LSEH_info_${fname}_prologue",""; + push @xdata_seg, + ".LSEH_info_${fname}_prologue:", + ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP + ".byte 0,0x74,1,0", # %rdi at 8(%rsp) + ".byte 0,0x64,2,0", # %rsi at 16(%rsp) + ".byte 0,0x03", # set frame pointer + ".byte 0,0" # padding + ; + push @pdata_seg, + ".rva .LSEH_body_${fname}", + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_info_${fname}_body",""; + push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); + $ret = "${decor}SEH_body_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + } elsif ($ret eq ".epilogue") { + %saved_regs = (); + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + $cfa_rsp = $cfa_off; + + push @pdata_seg, + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_end_${fname}", + ".rva .LSEH_info_${fname}_epilogue",""; + push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; + $ret = "${decor}SEH_epilogue_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + if ($gas) { + $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; + $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; + } else { + $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; + $ret .= " ;WIN64 epilogue\n"; + $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; + } + } + return $ret; + } + return; + } +} +{ package directive; # pick up directives, which start with . + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . 
+ "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif ($dir =~ /\.size/) { + $self->{value} = "" if (!$elf); + if ($dwarf and my $endproc = cfi_directive::endproc()) { + $self->{value} = ".cfi_$endproc\n$self->{value}"; + } elsif (!$elf && defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([px])data/) { + $v.=" rdata align="; + $v.=$1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([px])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... +# However, it should not be used in functions' prologues explicitly, as +# it's added automatically [and in the right spot]. Which leaves only +# non-function indirect branch targets, such as in a case-like dispatch +# table, as application area. + +my $endbr64 = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +if ($nasm) { + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +sub process { + my $line = shift; + + $line =~ s|\R$||; # Better chomp + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(<>) { process($_); } + +map { process($_) } @pdata_seg if ($win64); +map { process($_) } @xdata_seg if ($win64); + +# platform-specific epilogue +if ($masm) { + print "\n$current_segment\tENDS\n" if ($current_segment); + print "END\n"; +} elsif ($elf) { + # -fcf-protection segment, snatched from compiler -S output + my $align = ($flavour =~ /elf32/) ? 4 : 8; + print <<___; + +.section .note.GNU-stack,"",\@progbits +.section .note.gnu.property,"a",\@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align $align +2: +___ +} + +close STDOUT; + +################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... 
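+# That 32-byte scratch area is also what lets the prologue shown below
+# (and the translator's auto-generated Win64 prologue) spill %rdi and
+# %rsi to 8(%rsp) and 16(%rsp) without adjusting the stack pointer.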
+# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# +################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. 
In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. 
Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). +# +######################################################################## +# As of May 2020 an alternative approach that works with both exceptions +# and debugging/profiling was implemented by re-purposing DWARF .cfi +# annotations even for Win64 unwind tables' generation. Unfortunately, +# but not really unexpectedly, it imposes additional limitations on +# coding style. Probably most significant limitation is that frame +# pointer has to be at 16*n distance from stack pointer at the exit +# from prologue. But first things first. There are two additional +# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, +# that need to be added to all functions marked with additional .type +# tag (see example below). There are "do's and don'ts" for prologue +# and epilogue. It shouldn't come as surprise that in prologue one may +# not modify non-volatile registers, but one may not modify %r11 either. +# This is because it's used as temporary frame pointer(*). There is one +# exception to this rule, and it's setting up frame pointer that is +# non-volatile or %r11. But it must be last instruction in the prologue. +# Constraints for epilogue, or rather on its boundary, depend on whether +# the frame is fixed- or variable-length. In fixed-frame subroutine +# stack pointer has to be restored in the last instruction prior the +# .cfi_epilogue directive. If it's variable-frame subroutine, and a +# non-volatile register was used as frame pointer, then last instruction +# prior the directive has to restore its original value. This means that +# final stack pointer adjustment would have to be pushed past the +# directive. Normally this would render the epilogue non-unwindable, so +# special care has to be taken. To resolve the dilemma, copy frame +# pointer to a volatile register in advance. To give an example: +# +# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! +# rbp_as_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# mov %rsp,%rbp # last instruction in prologue +# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 +# .cfi_end_prologue +# sub \$40,%rsp +# and \$-64,%rsp +# ... +# mov %rbp,%r11 +# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 +# mov 0(%rbp),%rbx +# mov 8(%rbp),%rbp # last instruction prior epilogue +# .cfi_epilogue # may not change %r11 in epilogue +# lea 16(%r11),%rsp +# ret +# .cfi_endproc +# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer +# +# To give an example of fixed-frame subroutine for reference: +# +# .type fixed_frame,\@function,3,"unwind" # mind extra tag! +# fixed_frame: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_adjust_cfa_offset 40 +# .cfi_end_prologue +# ... +# mov 40(%rsp),%rbx +# mov 48(%rsp),%rbp +# lea 56(%rsp),%rsp +# .cfi_adjust_cfa_offset -56 +# .cfi_epilogue +# ret +# .cfi_endproc +# .size fixed_frame,.-fixed_frame +# +# As for epilogue itself, one can only work on non-volatile registers. +# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. +# +# On a final note, mixing old-style and modernized subroutines in the +# same file takes some trickery. Ones of the new kind have to appear +# after old-style ones. 
This has everything to do with the fact that +# entries in the .pdata segment have to appear in strictly same order +# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION +# structures get mechanically appended to whatever existing .pdata. +# +# (*) Just in case, why %r11 and not %rax. This has everything to do +# with the way UNWIND_INFO is, one just can't designate %rax as +# frame pointer. diff --git a/blst/assembly.S b/blst/assembly.S new file mode 100644 index 0000000..a1a7c54 --- /dev/null +++ b/blst/assembly.S @@ -0,0 +1,123 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# else +# include "elf/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# else +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# else +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# if defined(__BLST_PORTABLE__) +# include "coff/sha256-portable-x86_64.s" +# else +# include "coff/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# else +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# else +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# else +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# else +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include "mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 
__sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif diff --git a/blst/blst.h b/blst/blst.h new file mode 100644 index 0000000..aaee107 --- /dev/null +++ b/blst/blst.h @@ -0,0 +1,480 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include <stddef.h> +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include <stdint.h> +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); 
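+/*
+ * Editor's note: a minimal usage sketch for the conversion and check
+ * helpers declared here; it is not part of the upstream header, the
+ * |be| buffer is hypothetical and error handling is deliberately crude.
+ *
+ *     byte be[32] = {...};               // 32 big-endian candidate bytes
+ *     blst_scalar s;
+ *
+ *     blst_scalar_from_bendian(&s, be);  // import, performs no range check
+ *     if (!blst_sk_check(&s))            // rejects 0 and out-of-range values
+ *         abort();
+ *     blst_bendian_from_scalar(be, &s);  // export back to big-endian bytes
+ */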
+void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specifc Fr operations. + */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specifc Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specifc Fp2 operations. 
+ */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specifc Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(); +#endif // SWIG + +/* + * BLS12-381-specifc point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const 
blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(); + +/* + * Multi-scalar multiplications and other multi-point operations. + */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. 
+ */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. + */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. 
+ */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. 
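+ *
+ * Editor's sketch of that flow for the minimal-pubkey-size variant
+ * (public keys in G1, signatures in G2). |ctx| is assumed to have been
+ * set up with blst_pairing_init, |agg_sig_wire| is the aggregated
+ * signature as received from the wire, and pk[], msg[], msg_len[] stand
+ * for the application's own arrays; error handling is omitted:
+ *
+ *     blst_p2_affine agg_sig;
+ *     blst_fp12 gtsig;
+ *
+ *     blst_p2_uncompress(&agg_sig, agg_sig_wire);
+ *     blst_aggregated_in_g2(&gtsig, &agg_sig);
+ *
+ *     for (i = 0; i < n; i++)            // note the NULL |signature|
+ *         blst_pairing_aggregate_pk_in_g1(ctx, pk[i], NULL,
+ *                                         msg[i], msg_len[i], NULL, 0);
+ *     blst_pairing_commit(ctx);
+ *     ok = blst_pairing_finalverify(ctx, &gtsig);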
+ */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif diff --git a/blst/blst_aux.h b/blst/blst_aux.h new file mode 100644 index 0000000..41c2901 --- /dev/null +++ b/blst/blst_aux.h @@ -0,0 +1,79 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. + */ + +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_<point>| can be NULL. 
+ */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +typedef struct {} blst_uniq; + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); + +#endif diff --git a/blst/bulk_addition.c b/blst/bulk_addition.c new file mode 100644 index 0000000..81afc53 --- /dev/null +++ b/blst/bulk_addition.c @@ -0,0 +1,168 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * This implementation uses explicit addition formula: + * + * λ = (Y₂-Y₁)/(X₂-X₁) + * X₃ = λ²-(X₁+X₂) + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * But since we don't know if we'll have to add point to itself, we need + * to eventually resort to corresponding doubling formula: + * + * λ = 3X₁²/2Y₁ + * X₃ = λ²-2X₁ + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * The formulae use prohibitively expensive inversion, but whenever we + * have a lot of affine points to accumulate, we can amortize the cost + * by applying Montgomery's batch inversion approach. As a result, + * asymptotic[!] per-point cost for addition is as small as 5M+1S. For + * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things + * considered, the improvement coefficient varies from 60% to 85% + * depending on platform and curve. + * + * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an + * application that requires constant time-ness, speak up!] + */ + +/* + * Calculate λ's numerator and denominator. 
+ * + * input: A x1 y1 - + * B x2 y2 - + * output: + * if A!=B: A x1 y1 (x2-x1)*mul_acc + * B x2+x1 y2-y1 (x2-x1) + * + * if A==B: A x y 2y*mul_acc + * B 2x 3*x^2 2y + * + * if A==-B: A 0 0 1*mul_acc + * B 0 3*x^2 0 + */ +#define HEAD(ptype, bits, field, one) \ +static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ +{ \ + ptype *A = AB, *B = AB+1; \ + limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ + vec_is_zero(B, sizeof(ptype##_affine)); \ + static const vec##bits zero = { 0 }; \ +\ + sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ + add_##field(B->X, B->X, A->X); /* X2+X1 */ \ + add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ + sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ + if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ + inf = vec_is_zero(A->Z, sizeof(A->Z)); \ + vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ + sqr_##field(B->Y, A->X); \ + mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ + vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ + } /* B->Y is numenator */ \ + /* B->Z is denominator */ \ + vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ + vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ + vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ + vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ + if (mul_acc != NULL) \ + mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ +} + +/* + * Calculate λ and resulting coordinates. + * + * input: A x1 y1 - + * B x2+x1 nominator - + * lambda 1/denominator + * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 + */ +#define TAIL(ptype, bits, field, one) \ +static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ +{ \ + ptype *A = AB, *B = AB+1; \ + vec##bits llambda; \ + limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ +\ + mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ + /* alt. 3*X1^2/2*Y1 */ \ + sqr_##field(llambda, lambda); \ + sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ +\ + sub_##field(D->Y, A->X, D->X); \ + mul_##field(D->Y, D->Y, lambda); \ + sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ +\ + vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ + vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ +} + +/* + * |points[]| is volatile buffer with |X|s and |Y|s initially holding + * input affine coordinates, and with |Z|s being used as additional + * temporary storage [unrelated to Jacobian coordinates]. |sum| is + * in-/output, initialize to infinity accordingly. 
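+ *
+ * Editor's note, based on the pointer-walking loop in the generated
+ * prefix##s_add wrapper below: the public |points[]| argument is a
+ * vector of pointers in which a NULL entry means "subsequent points
+ * follow the previous one contiguously", so a plain contiguous array
+ * |affine_points| of |npoints| elements (names hypothetical) can be
+ * summed as:
+ *
+ *     blst_p1 sum;
+ *     const blst_p1_affine *ptrs[2] = { affine_points, NULL };
+ *
+ *     blst_p1s_add(&sum, ptrs, npoints);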
+ */ +#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ +HEAD(ptype, bits, field, one) \ +TAIL(ptype, bits, field, one) \ +static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ +{ \ + ptype *dst; \ + void *mul_acc; \ + size_t i; \ +\ + while (n >= 16) { \ + if (n & 1) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ + n /= 2; \ + for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ + ptype##_head(points, mul_acc); \ +\ + reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ +\ + for (dst = points, i = n; --i;) { \ + dst--; points -= 2; \ + mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ + ptype##_tail(dst, points, points[-2].Z); \ + mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ + } \ + dst--; points -= 2; \ + ptype##_tail(dst, points, points[0].Z); \ + points = dst; \ + } \ + while (n--) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ +} \ +\ +void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + /* Performance with 288K scratch is within 1-2-3% from optimal */ \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \ + ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ + sizeof(ptype)); \ + const ptype##_affine *point = NULL; \ +\ + vec_zero(sum, sizeof(*sum)); \ + while (npoints) { \ + size_t i, j = npoints > stride ? stride : npoints; \ + for (i=0; i<j; i++) { \ + point = *points ? *points++ : point+1; \ + vec_copy(&scratch[i], point, sizeof(*point)); \ + } \ + ptype##s_accumulate(sum, scratch, j); \ + npoints -= j; \ + } \ +} + +ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2) + +ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) diff --git a/blst/client_min_pk.c b/blst/client_min_pk.c new file mode 100644 index 0000000..0fcf563 --- /dev/null +++ b/blst/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/blst/client_min_sig.c b/blst/client_min_sig.c new file mode 100644 index 0000000..8e4663d --- /dev/null +++ b/blst/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/blst/consts.c b/blst/consts.c new file mode 100644 index 0000000..021c878 --- /dev/null +++ b/blst/consts.c @@ -0,0 +1,36 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" + +/* z = -0xd201000000010000 */ +const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ + TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) +}; +const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ + +const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ + { { ONE_MONT_P }, + { 0 } } +}; + +const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), + TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), + TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) +}; + +const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ + TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), + TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) +}; + +const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), + TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) +}; diff --git a/blst/consts.h b/blst/consts.h new file mode 100644 index 0000000..cb391b8 --- /dev/null +++ b/blst/consts.h @@ -0,0 +1,30 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_CONST_H__ +#define __BLS12_381_ASM_CONST_H__ +#include "vect.h" + +extern const vec384 BLS12_381_P; +extern const limb_t BLS12_381_p0; +static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ +typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; +extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ +extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ + +#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ + TO_LIMB_T(0xebf4000bc40c0002), \ + TO_LIMB_T(0x5f48985753c758ba), \ + TO_LIMB_T(0x77ce585370525745), \ + TO_LIMB_T(0x5c071a97a256ec6d), \ + TO_LIMB_T(0x15f65ec3fa80e493) + +#define ZERO_384 (BLS12_381_Rx.p2[1]) + +extern const vec256 BLS12_381_r; /* order */ +static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ +extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ + +#endif diff --git a/blst/e1.c b/blst/e1.c new file mode 100644 index 0000000..47fca14 --- /dev/null +++ b/blst/e1.c @@ -0,0 +1,558 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384 B_E1 = { /* (4 << 384) % P */ + TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) +}; + +const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 + * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ + { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), + TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), + TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, + { ONE_MONT_P } +}; + +const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 + * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ + { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), + TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), + TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, + { ONE_MONT_P } +}; + +static inline void mul_by_b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 2); } + +static inline void mul_by_4b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 4); } + +static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) +{ cneg_fp(p->Y, p->Y, cbit); } + +void blst_p1_cneg(POINTonE1 *a, int cbit) +{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p1_on_curve(const POINTonE1 *p) +{ return (int)POINTonE1_on_curve(p); } + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + mul_fp(ret.X, ret.X, BLS12_381_RR); + + sqr_fp(ret.Y, ret.X); + mul_fp(ret.Y, ret.Y, ret.X); + add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp(out->Y, out->Y, sgn0_pty); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y, sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X, ret.X, BLS12_381_RR); + mul_fp(ret.Y, ret.Y, BLS12_381_RR); + + if (!POINTonE1_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? 
BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE1_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE1_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ return POINTonE1_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) +POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) + +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) +{ return (int)POINTonE1_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) +#endif + +static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) +}; + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + vec_copy(out->X, in->X, 2*sizeof(out->X)); + mul_fp(out->Z, in->Z, beta); +} + +/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ +static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* SK/z^2 [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s+16, val.s }; + POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ + size_t i; + + POINTonE1_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + mul_fp(table[1][i].X, table[0][i].X, beta); + cneg_fp(table[1][i].Y, table[0][i].Y, 1); + vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); + } + + POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); + POINTonE1_cneg(out, 1); + mul_fp(out->Z, out->Z, beta); + mul_fp(out->Z, out->Z, beta); + } + + vec_zero(val.l, sizeof(val)); /* 
scrub the copy of SK */ +} + +static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) +{ + vec384 Z, ZZ; + limb_t inf; + + POINTonE1_mult_glv(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp(ZZ, Z); + mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), + inf); /* Z = inf ? 0 : 1 */ +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) +{ POINTonE1_sign(out, &BLS12_381_G1, SK); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) +{ POINTonE1_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, &BLS12_381_G1, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 176) { + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) { + val.s[i++] = scalar[j] & mask; + mask = 0 - ((i - top) >> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE1_mult_glv(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p1_is_inf(const POINTonE1 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE1 *blst_p1_generator(void) +{ return &BLS12_381_G1; } + +int blst_p1_affine_is_inf(const POINTonE1_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE1_affine *blst_p1_affine_generator(void) +{ return (const POINTonE1_affine *)&BLS12_381_G1; } diff --git a/blst/e2.c b/blst/e2.c new file mode 100644 index 0000000..eafc486 --- /dev/null +++ b/blst/e2.c @@ -0,0 +1,632 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see 
LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 2); + 
lshift_fp(out[1], out[1], 2); +} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 4); + lshift_fp(out[1], out[1], 4); +} + +static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) +{ cneg_fp2(p->Y, p->Y, cbit); } + +void blst_p2_cneg(POINTonE2 *a, int cbit) +{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p2_on_curve(const POINTonE2 *p) +{ return (int)POINTonE2_on_curve(p); } + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void 
POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE2_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE2_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus?
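(adding ZERO_384 applies the field's conditional reduction, so the result differs from the loaded value exactly when the encoding is not below the modulus)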
*/ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + + sqr_fp2(ret.Y, ret.X); + mul_fp2(ret.Y, ret.Y, ret.X); + add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); + limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); + mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); + + if (!POINTonE2_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return BLST_SUCCESS; +} + +static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, + const unsigned char in[192]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE2_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE2_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, + const unsigned char in[192]) +{ return POINTonE2_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) +POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) +{ return (int)POINTonE2_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) +#endif + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } + }; + static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + }; + + vec_copy(out, in, sizeof(*out)); + cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); + cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); + cneg_fp(out->Z[1], out->Z[1], 1); +} + +/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ +static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* break down SK to "digits" with |z| as radix [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + div_by_z(val.l); + div_by_z(val.l + NLIMBS(256)/2); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s, NULL }; + POINTonE2 table[4][1<<(5-1)]; /* 18KB */ + size_t i; + + POINTonE2_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + psi(&table[1][i], &table[0][i]); + psi(&table[2][i], &table[1][i]); + psi(&table[3][i], &table[2][i]); + POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ + POINTonE2_cneg(&table[3][i], 1); + } + + POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); + } + + vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ +} + +static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) +{ + vec384x Z, ZZ; + limb_t inf; + + POINTonE2_mult_gls(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp2(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), + inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK) +{ POINTonE2_sign(out, &BLS12_381_G2, SK); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK) +{ POINTonE2_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, &BLS12_381_G2, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 144) { + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) { + val.s[i++] = scalar[j] & mask; + mask = 0 - ((i - top) >> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE2_mult_gls(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p2_is_inf(const POINTonE2 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE2 *blst_p2_generator(void) +{ return &BLS12_381_G2; } + +int blst_p2_affine_is_inf(const POINTonE2_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE2_affine *blst_p2_affine_generator(void) +{ return (const POINTonE2_affine *)&BLS12_381_G2; } diff --git a/blst/ec_mult.h b/blst/ec_mult.h new file mode 100644 index 0000000..192f733 --- /dev/null +++ b/blst/ec_mult.h @@ -0,0 +1,289 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +/* Works up to 9 bits */ +static limb_t get_wval(const byte *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + + ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; + + return ret >> (off%8); +} + +/* Works up to 25 bits. 
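It assembles at most four consecutive bytes (32 bits) and then shifts out up to 7 bits of in-byte offset, leaving 32-7=25 guaranteed bits; get_wval above reads two bytes and is limited to 16-7=9 bits for the same reason.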
*/ +static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) +{ + size_t i, top = (off + bits - 1)/8; + limb_t ret, mask = (limb_t)0 - 1; + + d += off/8; + top -= off/8-1; + + /* this is not about constant-time-ness, but branch optimization */ + for (ret=0, i=0; i<4;) { + ret |= (*d & mask) << (8*i); + mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1)); + d += 1 & mask; + } + + return ret >> (off%8); +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode(limb_t wval, size_t sz) +{ + limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + + wval = (wval + 1) >> 1; + wval = (wval & ~mask) | ((0-wval) & mask); + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling agorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ +static void ptype##_gather_booth_w##SZ(ptype *restrict p, \ + const ptype table[1<<(SZ-1)], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + bool_t booth_sign = (booth_idx >> SZ) & 1; \ +\ + booth_idx &= (1<<SZ) - 1; \ + vec_zero(p, sizeof(ptype)); /* implicit infinity at table[-1] */\ + /* ~6% with -Os, ~2% with -O3 ... */\ + for (i = 1; i <= 1<<(SZ-1); i++) \ + ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \ +\ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \ +{ \ + size_t i, j; \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(ptype)); /* row[0]=p*1 */\ + ptype##_double(&row[1], point); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \ + ptype##_add(&row[i], &row[j], &row[j-1]), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_mult_w##SZ(ptype *ret, \ + const ptype *points[], size_t npoints, \ + const byte *scalars[], size_t bits, \ + ptype table[][1<<(SZ-1)]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, window, nbytes; \ + const byte *scalar, **scalar_s = scalars; \ + ptype temp[1]; \ +\ + if (table == NULL) \ + table = (ptype (*)[1<<(SZ-1)])alloca((1<<(SZ-1)) * sizeof(ptype) * \ + npoints); \ +\ + if (points != NULL) { \ + const ptype *point = NULL; \ + for (i = 0; i < npoints; i++) \ + point = *points ? 
*points++ : point+1, \ + ptype##_precompute_w##SZ(table[i], point); \ + } \ +\ + nbytes = (bits + 7)/8; /* convert |bits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + if (bits > 0) \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + else \ + wval = (scalar[0] << 1) & wmask; \ +\ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +\ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; scalar_s = scalars; \ + } \ +\ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (scalar[0] << 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} \ +\ +static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ + const byte *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t j, window; \ + ptype temp[1]; \ + ptype table[1<<(SZ-1)]; \ +\ + ptype##_precompute_w##SZ(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table, wval); \ +\ + while (bits > 0) { \ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table, wval); \ + if (bits > 0) ptype##_add(ret, ret, temp); \ + else ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... 
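The variant below carries only the X and Z coordinates through the ladder and recovers Y once at the end via ptype##xz_ladder_post, so each ladder step is cheaper than the full Jacobian add-plus-double above.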
*/ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + bool_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... + */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, \ + const ptype##_affine *p_affine, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/blst/ec_ops.h b/blst/ec_ops.h new file mode 100644 index 0000000..0d531f8 --- /dev/null +++ b/blst/ec_ops.h @@ -0,0 +1,787 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? 
Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. + */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? 
Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. + */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
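+ * For example, e2.c above instantiates this template as
+ *   POINT_ADD_IMPL(POINTonE2, 384x, fp2)
+ * which emits POINTonE2_add(), the routine behind the public blst_p2_add().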
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ + mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ +\ + sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(S1, S1, J); /* S1*J */\ +\ + mul_##field(p3.Y, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle either input at infinity, with |p1| encoded as Z==0, + * and |p2| as X==Y==0. 
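+ * The saving relative to the full Jacobian addition above comes from the
+ * affine operand's implicit Z==1; this is also why the pre-defined generator
+ * points are customarily kept in affine form (see the comment in ec_mult.h).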
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, H, HH, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(I, J, p1->Y); /* Y1*J */\ +\ + sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, I); \ + sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3.Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(B, B, p1->X); /* X1+B */\ + sqr_##field(B, B); /* (X1+B)^2 */\ + sub_##field(B, B, A); /* (X1+B)^2-A */\ + sub_##field(B, B, C); /* (X1+B)^2-A-C */\ + add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, B); \ + sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, B, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
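+ * The mul_by_4b_##suffix4b hook supplies the curve-specific multiplication
+ * by 4*B; e2.c instantiates this with suffix4b=onE2 (under
+ * __BLST_PRIVATE_TESTMODE__), so the steps below end up calling mul_by_4b_onE2.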
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + bool_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from <openssl>/.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + bool_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} + +#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ +static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, Z2Z2; \ + ptype##_affine a1, a2; \ + bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ +\ + mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ + mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle + * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| + * and replacing few first references to |X3| in the formula, up to step + * 21, with it. 12M[+27A], doubling and infinity are handled by the + * formula itself. Infinity is to be encoded as [0, !0, 0]. + */ +#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ + const ptype##proj *p2) \ +{ \ + vec##bits t0, t1, t2, t3, t4, t5; \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ + add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ + add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ + add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ + mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ + add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ + sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ + add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ + add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ + mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ + add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ + sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ + mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ + mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ + mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ + add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ + mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ + mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ + mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle + * |p2| being infinity encoded as [0, 0]. 11M[+21A]. + */ +#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype##proj p3[1]; \ + vec##bits t0, t1, t2, t3, t4; \ + limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ + mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ + add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ + mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ + add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ + mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ + mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ + mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ + add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ + mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ + mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ + mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ +\ + vec_select(out, p1, p3, sizeof(*out), p2inf); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle + * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y + * and reordering operations to bring references to |p1| forward. + * 6M+2S[+13A]. 
+ */ +#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ +{ \ + vec##bits t0, t1, t2, t3; \ +\ + sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ + mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ + sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ + mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ + lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ + mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ + mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ + mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ + add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ + mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ + mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ + sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ + mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ + add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ + mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ + add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ +} + +#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ +{ \ + vec##bits ZZ; \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + mul_##field(out->Y, in->Y, ZZ); \ + vec_copy(out->Z, in->Z, sizeof(out->Z)); \ +} + +#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ +static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ +{ \ + vec##bits ZZ; \ + limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ + mul_##field(out->Z, ZZ, in->Z); \ +} + +/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 + * with twist to handle either input at infinity. Addition costs 12M+2S, + * while conditional doubling - 4M+6M+3S. 
+ */ +#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ +static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##xyzz *p2) \ +{ \ + vec##bits U, S, P, R; \ +\ + if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3, p2, sizeof(*p3)); \ + return; \ + } \ +\ + mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ + mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + sub_##field(P, P, U); /* P = U2-U1 */\ + sub_##field(R, R, S); /* R = S2-S1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, U, PP); /* Q = U1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, S, PPP); /* S1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ + mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ + mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ + mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits V, W, M; /* double |p1| */\ +\ + add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ + sqr_##field(V, U); /* V = U^2 */\ + mul_##field(W, V, U); /* W = U*V */\ + mul_##field(S, p1->X, V); /* S = X1*V */\ + sqr_##field(M, p1->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ + mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 + * with twists to handle even subtractions and either input at infinity. + * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
+ */ +#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##_affine *p2, \ + bool_t subtract) \ +{ \ + vec##bits P, R; \ +\ + if (vec_is_zero(p2, sizeof(*p2))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ + cneg_##field(p3->ZZZ, one, subtract); \ + vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ + return; \ + } \ +\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + cneg_##field(R, R, subtract); \ + sub_##field(P, P, p1->X); /* P = U2-X1 */\ + sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ + mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits U, S, M; /* double |p2| */\ +\ + add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ + sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ + mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ + mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ + sqr_##field(M, p2->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ +{ \ + mul_##field(out->X, in->X, in->ZZ); \ + mul_##field(out->Y, in->Y, in->ZZZ); \ + vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ +} + +#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ +static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ +{ \ + vec_copy(out->X, in->X, 2*sizeof(out->X)); \ + sqr_##field(out->ZZ, in->Z); \ + mul_##field(out->ZZZ, out->ZZ, in->Z); \ +} + +#endif diff --git a/blst/elf/add_mod_256-armv8.S b/blst/elf/add_mod_256-armv8.S new file mode 100644 index 0000000..57476aa --- /dev/null +++ b/blst/elf/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size 
add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel 
x0,x0,xzr,ne + and x0,x0,x1 + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/blst/elf/add_mod_256-x86_64.s b/blst/elf/add_mod_256-x86_64.s new file mode 100644 index 0000000..2f41781 --- /dev/null +++ b/blst/elf/add_mod_256-x86_64.s @@ -0,0 +1,572 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 
8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 
16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,@function +.align 32 +check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size check_mod_256,.-check_mod_256 + + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,@function +.align 32 +add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,@function +.align 32 +sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/add_mod_384-armv8.S b/blst/elf/add_mod_384-armv8.S new file mode 100644 index 0000000..55e0888 --- /dev/null +++ b/blst/elf/add_mod_384-armv8.S @@ -0,0 +1,931 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,%function +.align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,%function +.align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,%function +.align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,%function +.align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,%function +.align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b 
+ st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret +.size vec_prefetch,.-vec_prefetch diff --git a/blst/elf/add_mod_384-x86_64.s b/blst/elf/add_mod_384-x86_64.s new file mode 100644 index 0000000..df61986 --- /dev/null +++ b/blst/elf/add_mod_384-x86_64.s @@ -0,0 +1,1809 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + 
movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,@function +.align 32 +rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,@function +.align 32 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type 
div_by_2_mod_384,@function +.align 32 +div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function +.align 32 +lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
__lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function +.align 32 +mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 
8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 
+.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 
72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,@function +.align 32 +vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,@function +.align 32 +vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,@function +.align 32 +vec_select_192: 
+.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,@function +.align 32 +vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por 
%xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,@function +.align 32 +vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,@function +.align 32 +vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + 
prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_prefetch,.-vec_prefetch + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/add_mod_384x384-x86_64.s b/blst/elf/add_mod_384x384-x86_64.s new file mode 100644 index 0000000..084f3d8 --- /dev/null +++ b/blst/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: 
+.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ct_inverse_mod_256-armv8.S b/blst/elf/ct_inverse_mod_256-armv8.S new file mode 100644 index 0000000..347eb31 --- /dev/null +++ b/blst/elf/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + .inst 0xd503233f + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + .inst 0xd50323bf + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, 
[x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 diff --git a/blst/elf/ct_inverse_mod_256-x86_64.s b/blst/elf/ct_inverse_mod_256-x86_64.s new file mode 100644 index 0000000..c4d8d6d --- /dev/null +++ b/blst/elf/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1185 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,@function +.align 32 +ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call 
__ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 
+ adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +.type __smulq_512x63,@function +.align 32 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,@function +.align 32 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + 
andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256x63,.-__smulq_256x63 +.type __smulq_256_n_shift_by_31,@function +.align 32 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +.type __ab_approximation_31_256,@function +.align 32 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + 
cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +.type __inner_loop_31_256,@function +.align 32 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,@function +.align 32 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62_256,.-__inner_loop_62_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ct_inverse_mod_384-armv8.S b/blst/elf/ct_inverse_mod_384-armv8.S new file mode 100644 index 0000000..d7eca17 --- /dev/null +++ b/blst/elf/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
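+//
+// Descriptive note (summarizing the routine below): __smul_383x63
+// multiplies the six-limb value loaded at offset 96 from the input
+// pointer x1 (|u| or |v|) by the signed factor |f_| in x20, and the
+// six-limb value at offset 144 by |g_| in x21, conditionally negating
+// each operand by the factor's sign mask, then adds the two products;
+// the low six limbs are stored at the output pointer x0, while the
+// top-limb carries are left in x19/x28 for __smul_767x63_tail.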
+.type __smul_383x63, %function +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, %function +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + 
eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 diff --git a/blst/elf/ct_is_square_mod_384-armv8.S b/blst/elf/ct_is_square_mod_384-armv8.S new file mode 100644 index 0000000..ce670b7 --- /dev/null +++ b/blst/elf/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // just load + mov x14, x9 // ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr 
x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret +.size __inner_loop_30,.-__inner_loop_30 +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x15, x15, #1 + and x21, x14, x24 + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x14, #2 + lsr x8, x8, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret +.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/blst/elf/ct_is_square_mod_384-x86_64.s b/blst/elf/ct_is_square_mod_384-x86_64.s new file mode 100644 index 0000000..fec1493 --- /dev/null +++ b/blst/elf/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,479 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,@function +.align 32 +ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,@function +.align 32 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 
+ movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +.type __ab_approximation_30,@function +.align 32 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_30,.-__ab_approximation_30 +.type __inner_loop_30,@function +.align 32 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl 
%ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,@function +.align 32 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_48,.-__inner_loop_48 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ctq_inverse_mod_384-x86_64.s b/blst/elf/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..b702262 --- /dev/null +++ b/blst/elf/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1195 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,@function +.align 32 +ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq 
%r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq 
$256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +.type __smulq_767x63,@function +.align 32 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq 
%rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_767x63,.-__smulq_767x63 +.type __smulq_383x63,@function +.align 32 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + 
addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383x63,.-__smulq_383x63 +.type __smulq_383_n_shift_by_62,@function +.align 32 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + 
mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +.type __ab_approximation_62,@function +.align 32 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62,@function +.align 8 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ctx_inverse_mod_384-x86_64.s b/blst/elf/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..25a5fa5 --- /dev/null +++ b/blst/elf/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1574 @@ +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,@function +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq 
%rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq 
%rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 
48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +.type __smulx_767x63,@function +.align 32 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + 
adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_767x63,.-__smulx_767x63 +.type __smulx_383x63,@function +.align 32 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383x63,.-__smulx_383x63 +.type __smulx_383_n_shift_by_31,@function +.align 32 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq 
%r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +.type __smulx_191_n_shift_by_31,@function +.align 32 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +.type __ab_approximation_31,@function +.align 32 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq 
%r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31,.-__ab_approximation_31 +.type __inner_loop_31,@function +.align 32 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,@function +.align 32 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/div3w-armv8.S b/blst/elf/div3w-armv8.S new file mode 100644 index 0000000..a2b1d67 --- /dev/null +++ b/blst/elf/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... 
and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 diff --git a/blst/elf/div3w-x86_64.s b/blst/elf/div3w-x86_64.s new file mode 100644 index 0000000..00ae569 --- /dev/null +++ b/blst/elf/div3w-x86_64.s @@ -0,0 +1,123 @@ +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,@function +.align 32 +div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,@function +.align 32 +quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + + + + + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,@function +.align 32 +quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mul_mont_256-armv8.S b/blst/elf/mul_mont_256-armv8.S new file mode 100644 index 0000000..8bb1197 --- /dev/null +++ b/blst/elf/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/blst/elf/mul_mont_384-armv8.S b/blst/elf/mul_mont_384-armv8.S new file mode 100644 index 0000000..c048e81 --- /dev/null +++ b/blst/elf/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + 
ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh 
x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc 
x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __sqr_384,%function +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret +.size __sqr_384,.-__sqr_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc 
x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + .inst 0xd503233f + stp 
x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 
+ umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul 
x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/blst/elf/mulq_mont_256-x86_64.s b/blst/elf/mulq_mont_256-x86_64.s new file mode 100644 index 0000000..37abd43 --- /dev/null +++ b/blst/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,714 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + 
addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,@function +.align 32 +from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + 
+ + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq 
%rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mulq_mont_384-x86_64.s b/blst/elf/mulq_mont_384-x86_64.s new file mode 100644 index 0000000..fa9dd35 --- /dev/null +++ b/blst/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3620 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq 
%r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + 
movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 
+.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq 
%rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 
32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq 
%r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 + + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 
40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq 
%r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp 
+.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq 
$0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + 
mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq 
%r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq 
(%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 
16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + 
movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 
24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mulx_mont_256-x86_64.s b/blst/elf/mulx_mont_256-x86_64.s new file mode 100644 index 0000000..20a0207 --- /dev/null +++ b/blst/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,627 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 
24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax 
+ adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mulx_mont_384-x86_64.s b/blst/elf/mulx_mont_384-x86_64.s new file mode 100644 index 0000000..9f9f740 --- /dev/null +++ b/blst/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2968 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx 
+.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq 
%r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 
48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 
128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 
0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + 
adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + 
movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 
40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 
16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + 
movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + 
movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 
24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type 
sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff 
--git a/blst/elf/sha256-armv8.S b/blst/elf/sha256-armv8.S new file mode 100644 index 0000000..7341dec --- /dev/null +++ b/blst/elf/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with the rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl blst_sha256_block_armv8 +.type blst_sha256_block_armv8,%function +.align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,%function +.align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + 
eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor 
w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + 
eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + 
eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,%function +.align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + 
lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,%function +.align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,%function +.align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret +.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/blst/elf/sha256-portable-x86_64.s b/blst/elf/sha256-portable-x86_64.s new file mode 100644 index 0000000..20b5c41 --- /dev/null +++ b/blst/elf/sha256-portable-x86_64.s @@ -0,0 +1,1754 @@ +.text + +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 16 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl 
%r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + 
movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + 
rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl 
%r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl 
%r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl 
%r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + 
xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 
0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/sha256-x86_64.s b/blst/elf/sha256-x86_64.s new file mode 100644 index 0000000..47fdc5b --- /dev/null +++ b/blst/elf/sha256-x86_64.s @@ -0,0 +1,1446 @@ +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext +.hidden blst_sha256_block_data_order_shaext +.type blst_sha256_block_data_order_shaext,@function +.align 64 +blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + 
nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext +.globl blst_sha256_block_data_order +.hidden blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 64 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 
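+# Note (added annotation, not part of the generated file): the .byte sequences
+# 102,15,56,0,199 and 102,15,56,0,207 above encode pshufb %xmm7,%xmm0 and
+# pshufb %xmm7,%xmm1, shuffling with the byte-swap mask loaded from K256+256
+# so the big-endian SHA-256 message words land in native little-endian order.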
+ movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + 
andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d 
+ addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + 
addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + 
andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl 
%r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/errors.h b/blst/errors.h new file mode 100644 index 0000000..425daeb --- /dev/null +++ b/blst/errors.h @@ -0,0 +1,19 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_ERRORS_H__ +#define __BLS12_381_ASM_ERRORS_H__ + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +#endif diff --git a/blst/exp.c b/blst/exp.c new file mode 100644 index 0000000..55c5c5a --- /dev/null +++ b/blst/exp.c @@ -0,0 +1,55 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} diff --git a/blst/exports.c b/blst/exports.c new file mode 100644 index 0000000..833c18a --- /dev/null +++ b/blst/exports.c @@ -0,0 +1,584 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" + +/* + * BLS12-381-specifc Fr shortcuts to assembly. 
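+ *
+ * Added usage sketch (not part of the upstream source; illustrative only):
+ * these wrappers act on vec256 scalars held in Montgomery form modulo the
+ * group order r, e.g.
+ *
+ *   vec256 x, y, xm, ym, zm, z;
+ *   blst_fr_to(xm, x);        map x into Montgomery form
+ *   blst_fr_to(ym, y);        map y into Montgomery form
+ *   blst_fr_mul(zm, xm, ym);  zm = x*y mod r, still in Montgomery form
+ *   blst_fr_from(z, zm);      map the product back to canonical form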
+ */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, int flag) +{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_from_scalar(vec256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); + vec_zero(out, sizeof(out)); + } +} + +void blst_scalar_from_fr(pow256 ret, const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + from_mont_256(out, a, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_fr_check(const pow256 a) +{ return (int)(check_mod_256(a, BLS12_381_r) | + bytes_are_zero(a, sizeof(pow256))); +} + +int blst_sk_check(const pow256 a) +{ return (int)check_mod_256(a, BLS12_381_r); } + +int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) +{ + vec256 a_fr, b_fr; + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { + limbs_from_le_bytes(a_fr, a, sizeof(a_fr)); + limbs_from_le_bytes(b_fr, b, sizeof(a_fr)); + a = (const byte *)a_fr; + b = (const byte *)b_fr; + } + mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0); + from_mont_256(a_fr, a_fr, BLS12_381_r, r0); + le_bytes_from_limbs(ret, a_fr, sizeof(a_fr)); + + return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1); +} + +void blst_sk_inverse(pow256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { + limb_t *out = (limb_t *)ret; + mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 
32); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, int flag) +{ cneg_fp(ret, a, is_zero(flag) ^ 1); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. + */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. 
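+ *
+ * Added usage sketch (not part of the upstream source; illustrative only):
+ * a vec384x holds an Fp2 element as two Fp coordinates c0 + c1*u (u^2 = -1),
+ * each kept in Montgomery form, e.g.
+ *
+ *   vec384x a, b, c, d;
+ *   blst_fp2_mul(c, a, b);     c = a*b in Fp2
+ *   blst_fp2_sqr(d, c);        d = c^2 in Fp2
+ *   blst_fp2_cneg(d, d, 1);    nonzero flag negates d in place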
+ */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) +{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } + +/* + * Scalar serialization/deseriazation + */ +void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = (unsigned int)(*a++); + w |= (unsigned int)(*a++) << 8; + w |= (unsigned int)(*a++) << 16; + w |= (unsigned int)(*a++) << 24; + ret[i] = w; + } +} + +void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + *ret++ = (byte)(w >> 32); + *ret++ = (byte)(w >> 40); + *ret++ = (byte)(w >> 48); + *ret++ = (byte)(w >> 56); + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = (unsigned long long)(*a++); + w |= (unsigned long long)(*a++) << 8; + w |= (unsigned long long)(*a++) << 16; + w |= (unsigned long long)(*a++) << 24; + w |= (unsigned long long)(*a++) << 32; + w |= (unsigned long long)(*a++) << 40; + w |= (unsigned long long)(*a++) << 48; + w |= (unsigned long long)(*a++) << 56; + ret[i] = w; + } +} + +void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + vec256 out; + limbs_from_le_bytes(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char 
little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + int i; + + from_mont_256(out, a, BLS12_381_r, r0); + for (i = 0; i < 4; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + while (n > 32) { + limbs_from_le_bytes(t.digit, bytes, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + bytes += 32; + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_le_bytes(t.digit, bytes, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + bytes += n; + while (n > 32) { + limbs_from_be_bytes(t.digit, bytes -= 32, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_be_bytes(t.digit, bytes -= n, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +/* + * Test facilitator + */ +static unsigned char nibble(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'a' && c <= 'f') + return 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') + return 10 + c - 'A'; + else + return 16; +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +void blst_scalar_from_hexascii(vec256 ret, const char *hex) +{ limbs_from_hexascii(ret, sizeof(vec256), hex); } + +void blst_fp_from_hexascii(vec384 ret, 
const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/blst/fields.h b/blst/fields.h new file mode 100644 index 0000000..3e451c4 --- /dev/null +++ b/blst/fields.h @@ -0,0 +1,211 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +#ifndef __CUDA_ARCH__ +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) +{ rshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void div_by_2_fp(vec384 ret, const vec384 a) +{ div_by_2_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void redc_fp(vec384 ret, const vec768 a) +{ redc_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. 
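Fp2 here is Fp[u]/(u^2 + 1) (spelled out in fp12_tower.c below), so these shortcuts behave like complex arithmetic over Fp, and a product needs only three Fp multiplications via the usual Karatsuba-style rearrangement:

    (a0 + a1*u)*(b0 + b1*u) = (a0*b0 - a1*b1) + ((a0 + a1)*(b0 + b1) - a0*b0 - a1*b1)*u

The host build hands this to the mul_mont_384x assembly, while the __CUDA_ARCH__ branch further down writes the same formula out with mul_fp/add_fp/sub_fp.
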
+ */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define vec_load_global vec_copy + +static void reciprocal_fp(vec384 out, const vec384 inp); +static void flt_reciprocal_fp(vec384 out, const vec384 inp); +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); +static bool_t sqrt_fp(vec384 out, const vec384 inp); + +static void reciprocal_fp2(vec384x out, const vec384x inp); +static void flt_reciprocal_fp2(vec384x out, const vec384x inp); +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, const vec384x magic_ZZZ); +static bool_t sqrt_fp2(vec384x out, const vec384x inp); +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! 
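The tower typedefs above flatten to plain limb arrays: an Fp2 element is two vec384s, an Fp6 element three Fp2s, and an Fp12 element two Fp6s, i.e. 12 Fp coefficients of 48 bytes each (576 bytes in total). A compile-time sketch of that identity (the typedef name is illustrative; only the vect.h/fields.h types are assumed):

    /* sketch: fails to compile if the Fp12 tower is not exactly 12 Fp elements wide */
    typedef char fp12_is_twelve_fp[sizeof(vec384fp12) == 12 * sizeof(vec384) ? 1 : -1];
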
*/ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#else + +extern "C" { +__device__ void mul_fp(vec384 ret, const vec384 a, const vec384 b); +__device__ void sqr_fp(vec384 ret, const vec384 a); +__device__ void add_fp(vec384 ret, const vec384 a, const vec384 b); +__device__ void sub_fp(vec384 ret, const vec384 a, const vec384 b); +__device__ void cneg_fp(vec384 ret, const vec384 ap, unsigned int flag); +__device__ void rshift_fp(vec384 ret, const vec384 a, unsigned int cnt); +__device__ void lshift_fp(vec384 ret, const vec384 a, unsigned int cnt); +__device__ void mul_by_3_fp(vec384 ret, const vec384 a); +__device__ void from_fp(vec384 ret, const vec384 a); + +#pragma diag_suppress 3151 +__device__ void mul_384(vec768 ret, const vec384 a, const vec384 b); +__device__ void sqr_384(vec768 ret, const vec384 a); +#pragma diag_default 3151 +__device__ void redc_fp(vec384 ret, const vec768 a); +__device__ void add_fpx2(vec768 ret, const vec768 a, const vec768 b); +__device__ void sub_fpx2(vec768 ret, const vec768 a, const vec768 b); + +__device__ void vec_load_global(limb_t *ret, const limb_t *a, + unsigned int sz = 48); +} + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ lshift_fp(ret, a, 3); } + +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ + add_fp(ret[0], a[0], b[0]); + add_fp(ret[1], a[1], b[1]); +} + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ + sub_fp(ret[0], a[0], b[0]); + sub_fp(ret[1], a[1], b[1]); +} + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ + mul_by_3_fp(ret[0], a[0]); + mul_by_3_fp(ret[1], a[1]); +} + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ + lshift_fp(ret[0], a[0], 3); + lshift_fp(ret[1], a[1], 3); +} + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_fp(ret[0], a[0], count); + lshift_fp(ret[1], a[1], count); +} + +static inline void cneg_fp2(vec384x ret, const vec384x a, limb_t flag) +{ + cneg_fp(ret[0], a[0], flag); + cneg_fp(ret[1], a[1], flag); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ + vec384 aa, bb, cc; + + add_fp(aa, a[0], a[1]); + add_fp(bb, b[0], b[1]); + mul_fp(bb, bb, aa); + + mul_fp(aa, a[0], b[0]); + mul_fp(cc, a[1], b[1]); + + sub_fp(ret[0], aa, cc); + sub_fp(ret[1], bb, aa); + sub_fp(ret[1], ret[1], cc); +} + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ + vec384 t0, t1; + + add_fp(t0, a[0], a[1]); + sub_fp(t1, a[0], a[1]); + + mul_fp(ret[1], a[0], a[1]); + add_fp(ret[1], ret[1], ret[1]); + + mul_fp(ret[0], t0, t1); +} +#endif + +#define neg_fp(r,a) cneg_fp((r),(a),1) +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/blst/fp12_tower.c b/blst/fp12_tower.c new file mode 100644 index 0000000..037b7db --- /dev/null +++ b/blst/fp12_tower.c @@ -0,0 +1,771 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 && !defined(__BLST_NO_ASM__) +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. 
In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! |ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void 
redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... 
moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, 
a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... 
moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + reciprocal_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], 
t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! + */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + 
TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specifc Fp12 shortcuts. + */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } + +int blst_fp12_is_one(const vec384fp12 a) +{ + return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); +} + +const vec384fp12 *blst_fp12_one(void) +{ return (const vec384fp12 *)BLS12_381_Rx.p12; } diff --git a/blst/hash_to_field.c b/blst/hash_to_field.c new file mode 100644 index 0000000..42733b1 --- /dev/null +++ b/blst/hash_to_field.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + if (DST_len > 255) { + sha256_init(&ctx); + sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); + sha256_update(&ctx, DST, DST_len); + sha256_final(b_0.c, &ctx); + DST = b_0.c, DST_len = 32; + } + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ + len_in_bytes /= 32; /* caller being responsible for accordingly large + * buffer. hash_to_field passes one with length + * divisible by 64, remember? which works... 
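Concretely: hash_to_field below uses L = sizeof(vec384) + 128/8 = 64 bytes per field element, so a two-element request such as the one in Hash_to_G1 asks for len_in_bytes = 128, and the arithmetic above gives (128 + 31)/32 = 4 SHA-256 blocks in total, b_1 emitted above and b_2..b_4 produced by the loop that follows.
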
*/ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} +#endif + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} + +void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); + unsigned char *buf_ptr = bytes; + + if (buf_len > 255*32) + return; + + if (buf_len != len_in_bytes) + buf_ptr = alloca(buf_len); + + expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, + DST, DST_len); + if (buf_ptr != bytes) { + unsigned char *ptr = buf_ptr; + while (len_in_bytes--) + *bytes++ = *ptr++; + vec_zero(buf_ptr, buf_len); + } +} diff --git a/blst/keygen.c b/blst/keygen.c new file mode 100644 index 0000000..de749ac --- /dev/null +++ b/blst/keygen.c @@ -0,0 +1,182 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef __BLST_HKDF_TESTMODE__ + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef __BLST_HKDF_TESTMODE__ + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef __BLST_HKDF_TESTMODE__ +void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + unsigned char salt[32] = "BLS-SIG-KEYGEN-SALT-"; + size_t salt_len = 20; + + if (IKM_len < 32) { + vec_zero(SK, sizeof(pow256)); + return; + } + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. + * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + do { + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt, &scratch.ctx.ctx); + salt_len = sizeof(salt); + + /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, salt, salt_len, + IKM, IKM_len, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); + /* + * Given that mul_mont_sparse_256 has special boundary conditions + * it's appropriate to mention that redc_mont_256 output is fully + * reduced at this point. Because we started with 384-bit input, + * one with most significant half smaller than the modulus. + */ + mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, + BLS12_381_r, r0); + } while (vec_is_zero(scratch.key, sizeof(vec256))); + + le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} +#endif diff --git a/blst/map_to_g1.c b/blst/map_to_g1.c new file mode 100644 index 0000000..6613d68 --- /dev/null +++ b/blst/map_to_g1.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... + */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + 
TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... + k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
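These constant tables are consumed by the two helpers defined earlier in this file: map_fp_times_Zz() pre-multiplies each k_(i,j) by the matching power of Z^2 so the rational map can be evaluated directly on Jacobian coordinates without inverting Z, and map_fp() then finishes the polynomial by Horner's rule. A self-contained sketch of that evaluation pattern (the function name is illustrative; the callers below seed acc with the leading term themselves before calling map_fp):

    /* sketch: Horner evaluation of c[n]*x^n + ... + c[0] with the Fp helpers above */
    static void horner_fp(vec384 acc, const vec384 x, const vec384 c[], size_t n)
    {
        vec_copy(acc, c[n], sizeof(vec384));  /* start from the leading coefficient */
        while (n--) {
            mul_fp(acc, acc, x);              /* acc *= x    */
            add_fp(acc, acc, c[n]);           /* acc += c[n] */
        }
    }
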
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
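+    /*
+     * In the notation of the draft's simplified SWU procedure, with
+     * t = Z^2*u^4 + Z*u^2 (tv2 above) the candidate x-coordinates are
+     * kept as fractions over a common denominator:
+     *   x1 = B*(t + 1) / (-A*t),  or B/(Z*A) in the exceptional case
+     *                             t == 0 handled just below,
+     *   x2 = Z*u^2 * x1
+     * and y is then obtained as sqrt(g(x1)) or sqrt(g(x2)) without ever
+     * inverting the denominator, see the recip_sqrt_fp() call below.
+     */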
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) +{ + POINTonE1_dadd(out, out, p, NULL); + while(n--) + POINTonE1_double(out, out); +} + +static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) +{ + POINTonE1_double(out, in); /* 1: 0x2 */ + POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_times_minus_z(out, &p); + POINTonE1_dadd(out, out, &p, NULL); +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, 
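+                  /*
+                   * |aug|, if any, is hashed in ahead of |msg|, as in the
+                   * message-augmentation scheme; and unlike Encode_to_G1
+                   * above, two field elements are derived here, which is
+                   * what makes the result a uniform hash to the curve
+                   * rather than a mere encoding.
+                   */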
aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void sigma(POINTonE1 *out, const POINTonE1 *in); + +#if 0 +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const byte zz_minus_1_div_by_3[] = { + TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_dadd(out, out, in, NULL); + } +} +#else +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_dadd(out, out, p, NULL); +} + +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ + POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ + POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, P); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} +#else +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + POINTonE1_times_minus_z(&t0, P); + POINTonE1_times_minus_z(&t1, &t0); + POINTonE1_cneg(&t1, 1); /* [-z²]P */ + + sigma(&t0, P); /* σ(P) */ + sigma(&t0, &t0); /* σ²(P) */ + + return POINTonE1_is_equal(&t0, &t1); +} +#endif + +int blst_p1_in_g1(const POINTonE1 *p) +{ return (int)POINTonE1_in_G1(p); } + +int blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ + POINTonE1 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + 
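+    /*
+     * X and Y are copied verbatim (they are already in Montgomery form);
+     * Z is set to one, i.e. BLS12_381_Rx.p, unless the affine input is the
+     * all-zero encoding of the point at infinity, in which case Z stays
+     * zero and the check below operates on the point at infinity.
+     */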
return (int)POINTonE1_in_G1(&P); +} diff --git a/blst/map_to_g2.c b/blst/map_to_g2.c new file mode 100644 index 0000000..90fd86e --- /dev/null +++ b/blst/map_to_g2.c @@ -0,0 +1,444 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... 
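+     * (the 3-isogeny used on the G2 side keeps these polynomials short;
+     *  the G1 map above goes through an 11-isogeny and correspondingly
+     *  longer tables)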
+ * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... + */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... 
+ * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ + /* a^2 + b^2 */ + { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), + TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), + TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, + /* (a^2 + b^2)^((P-3)/4) */ + { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), + TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), + TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... + */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + recip_ZZZ, magic_ZZZ); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const byte h_eff[] = { + TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), + TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), + TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), + TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), + TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_dadd(out, out, p, NULL); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void psi(POINTonE2 *out, const POINTonE2 *in); + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ + POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t 
aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static bool_t POINTonE2_in_G2(const POINTonE2 *P) +{ +#if 0 + POINTonE2 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, P); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_dadd(&t0, &t0, &t2, NULL); + POINTonE2_cneg(&t0, 1); + POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +#else + POINTonE2 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + psi(&t0, P); /* Ψ(P) */ + + POINTonE2_times_minus_z(&t1, P); + POINTonE2_cneg(&t1, 1); /* [z]P */ + + return POINTonE2_is_equal(&t0, &t1); +#endif +} + +int blst_p2_in_g2(const POINTonE2 *p) +{ return (int)POINTonE2_in_G2(p); } + +int blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ + POINTonE2 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + return (int)POINTonE2_in_G2(&P); +} diff --git a/blst/multi_scalar.c b/blst/multi_scalar.c new file mode 100644 index 0000000..d0b3dee --- /dev/null +++ b/blst/multi_scalar.c @@ -0,0 +1,414 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * Infinite point among inputs would be devastating. Shall we change it? + */ +#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \ +static void ptype##s_to_affine(ptype##_affine dst[], \ + const ptype *const points[], size_t npoints) \ +{ \ + size_t i; \ + vec##bits *acc, ZZ, ZZZ; \ + const ptype *point = NULL; \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \ +\ + while (npoints) { \ + const ptype *p, *const *walkback; \ + size_t delta = stride<npoints ? stride : npoints; \ +\ + point = *points ? *points++ : point+1; \ + acc = (vec##bits *)dst; \ + vec_copy(acc++, point->Z, sizeof(vec##bits)); \ + for (i = 1; i < delta; i++, acc++) \ + point = *points ? *points++ : point+1, \ + mul_##field(acc[0], acc[-1], point->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + walkback = points-1, p = point, --delta, dst += delta; \ + for (i = 0; i < delta; i++, acc--, dst--) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], p->Z, acc[0]); \ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + p = (p == *walkback) ? 
*--walkback : p-1; \ + } \ + sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + ++delta, dst += delta, npoints -= delta; \ + } \ +} \ +\ +void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ + size_t npoints) \ +{ ptype##s_to_affine(dst, points, npoints); } + +POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) +POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) + +/* + * This is two-step multi-scalar multiplication procedure. First, given + * a set of points you pre-compute a table for chosen windowing factor + * [expressed in bits with value between 2 and 14], and then you pass + * this table to the actual multiplication procedure along with scalars. + * Idea is that the pre-computed table will be reused multiple times. In + * which case multiplication runs faster than below Pippenger algorithm + * implementation for up to ~16K points for wbits=8, naturally at the + * expense of multi-megabyte table. One can trade even more memory for + * performance, but each wbits increment doubles the memory requirement, + * so at some point it gets prohibively large... For reference, without + * reusing the table it's faster than Pippenger algorithm for up ~32 + * points [with wbits=5]... + */ + +#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) + +#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ + const ptype##_affine *point) \ +{ \ + size_t i, j, n = (size_t)1 << (wbits-1); \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ + vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ + ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < n; i += 2, j++) \ + ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ + size_t wbits, size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t i, j; \ + vec##bits *acc, ZZ, ZZZ; \ +\ + src += total; \ + acc = (vec##bits *)src; \ + vec_copy(acc++, one, sizeof(vec##bits)); \ + for (i = 0; i < npoints; i++) \ + for (j = nwin; --src, --j; acc++) \ + mul_##field(acc[0], acc[-1], src->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + for (i = 0; i < npoints; i++) { \ + vec_copy(dst++, src++, sizeof(ptype##_affine)); \ + for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], src->Z, acc[0]); \ + mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ + } \ + } \ +} \ +\ +/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ +static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t nmin = wbits>9 ? 
(size_t)1: (size_t)1 << (9-wbits); \ + size_t i, top = 0; \ + ptype *rows, *row; \ + const ptype##_affine *point = NULL; \ + size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ + if (stride == 0) stride = 1; \ +\ + while (npoints >= nmin) { \ + size_t limit = total - npoints; \ +\ + if (top + (stride << wbits) > limit) { \ + stride = (limit - top) >> wbits; \ + if (stride == 0) break; \ + } \ + rows = row = (ptype *)(&table[top]); \ + for (i = 0; i < stride; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ + top += stride << (wbits-1); \ + npoints -= stride; \ + } \ + rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ + for (i = 0; i < npoints; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ +} \ +\ +size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ +{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ +void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ ptype##s_precompute_wbits(table, wbits, points, npoints); } + +#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ + size_t wbits, limb_t booth_idx) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ + bool_t idx_is_zero; \ + static const ptype##_affine infinity = { 0 }; \ +\ + booth_idx &= ((limb_t)1 << wbits) - 1; \ + idx_is_zero = is_zero(booth_idx); \ + booth_idx -= 1 ^ idx_is_zero; \ + vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ + const byte *scalar, *const *scalar_s = scalars; \ + const ptype##_affine *row = table; \ +\ + size_t scratch_sz = SCRATCH_SZ(ptype); \ + if (scratch == NULL) { \ + scratch_sz /= 4; /* limit to 288K */ \ + scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ + scratch = alloca(sizeof(ptype) * scratch_sz); \ + } \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = nbits % wbits; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + nbits -= window; \ + z = is_zero(nbits); \ + wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ + row += nwin; \ +\ + i = 1; vec_zero(ret, sizeof(*ret)); \ + while (nbits > 0) { \ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? 
*scalar_s++ : scalar+nbytes; \ + wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +\ + for (j = 0; j < wbits; j++) \ + ptype##_double(ret, ret); \ +\ + window = wbits; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + nbits -= window; \ + i = 0; row = table; scalar_s = scalars; \ + } \ +\ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +} \ +\ +size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ +{ \ + const size_t scratch_sz = SCRATCH_SZ(ptype); \ + return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ +} \ +void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } + +PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) + +PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) + +/* + * Pippenger algorithm implementation, fastest option for larger amount + * of points... + */ + +static size_t pippenger_window_size(size_t npoints) +{ + size_t wbits; + + for (wbits=0; npoints>>=1; wbits++) ; + + return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); +} + +#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ +typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; + +#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ +static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ + size_t wbits) \ +{ \ + ptype##xyzz ret[1], acc[1]; \ + size_t n = (size_t)1 << wbits; \ +\ + /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
*/\ + vec_copy(acc, &buckets[--n], sizeof(acc)); \ + vec_copy(ret, &buckets[n], sizeof(ret)); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + while (n--) { \ + ptype##xyzz_dadd(acc, acc, &buckets[n]); \ + ptype##xyzz_dadd(ret, ret, acc); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + } \ + ptype##xyzz_to_Jacobian(out, ret); \ +} \ +\ +static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits, const ptype##_affine *p) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ +\ + booth_idx &= (1<<wbits) - 1; \ + if (booth_idx--) \ + ptype##xyzz_dadd_affine(&buckets[booth_idx], &buckets[booth_idx], \ + p, booth_sign); \ +} \ +\ +static void ptype##_prefetch(const ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits) \ +{ \ + booth_idx &= (1<<wbits) - 1; \ + if (booth_idx--) \ + vec_prefetch(&buckets[booth_idx], sizeof(buckets[booth_idx])); \ +} \ +\ +static void ptype##s_tile_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz buckets[], \ + size_t bit0, size_t wbits, size_t cbits) \ +{ \ + limb_t wmask, wval, wnxt; \ + size_t i, z, nbytes; \ + const byte *scalar = *scalars++; \ + const ptype##_affine *point = *points++; \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + wmask = ((limb_t)1 << (wbits+1)) - 1; \ + z = is_zero(bit0); \ + bit0 -= z^1; wbits += z^1; \ + wval = (get_wval_limb(scalar, bit0, wbits) << z) & wmask; \ + wval = booth_encode(wval, cbits); \ + scalar = *scalars ? *scalars++ : scalar+nbytes; \ + wnxt = (get_wval_limb(scalar, bit0, wbits) << z) & wmask; \ + wnxt = booth_encode(wnxt, cbits); \ + npoints--; /* account for prefetch */ \ +\ + ptype##_bucket(buckets, wval, cbits, point); \ + for (i = 1; i < npoints; i++) { \ + wval = wnxt; \ + scalar = *scalars ? *scalars++ : scalar+nbytes; \ + wnxt = (get_wval_limb(scalar, bit0, wbits) << z) & wmask; \ + wnxt = booth_encode(wnxt, cbits); \ + ptype##_prefetch(buckets, wnxt, cbits); \ + point = *points ? *points++ : point+1; \ + ptype##_bucket(buckets, wval, cbits, point); \ + } \ + point = *points ? *points++ : point+1; \ + ptype##_bucket(buckets, wnxt, cbits, point); \ + ptype##_integrate_buckets(ret, buckets, cbits - 1); \ +} \ +\ +static void ptype##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz buckets[], size_t window) \ +{ \ + size_t i, wbits, cbits, bit0 = nbits; \ + ptype tile[1]; \ +\ + window = window ? 
window : pippenger_window_size(npoints); \ + vec_zero(buckets, sizeof(buckets[0]) << (window-1)); \ + vec_zero(ret, sizeof(*ret)); \ +\ + /* top excess bits modulo target window size */ \ + wbits = nbits % window; /* yes, it may be zero */ \ + cbits = wbits + 1; \ + while (bit0 -= wbits) { \ + ptype##s_tile_pippenger(tile, points, npoints, scalars, nbits, \ + buckets, bit0, wbits, cbits); \ + ptype##_dadd(ret, ret, tile, NULL); \ + for (i = 0; i < window; i++) \ + ptype##_double(ret, ret); \ + cbits = wbits = window; \ + } \ + ptype##s_tile_pippenger(tile, points, npoints, scalars, nbits, \ + buckets, 0, wbits, cbits); \ + ptype##_dadd(ret, ret, tile, NULL); \ +} \ +\ +size_t prefix##s_mult_pippenger_scratch_sizeof(size_t npoints) \ +{ return sizeof(ptype##xyzz) << (pippenger_window_size(npoints)-1); } \ +void prefix##s_tile_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[], \ + size_t bit0, size_t window) \ +{ \ + size_t wbits, cbits; \ +\ + if (bit0 + window > nbits) wbits = nbits - bit0, cbits = wbits + 1; \ + else wbits = cbits = window; \ + ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ + bit0, wbits, cbits); \ +} \ +void prefix##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[]) \ +{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); } + +DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) + +DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/blst/no_asm.h b/blst/no_asm.h new file mode 100644 index 0000000..4f12f53 --- /dev/null +++ b/blst/no_asm.h @@ -0,0 +1,1287 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#if LIMB_T_BITS==32 +typedef unsigned long long llimb_t; +#endif + +#if defined(__clang__) +# pragma GCC diagnostic ignored "-Wstatic-in-inline" +#endif + +static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n+1], carry; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i<n; i++) { + limbx = (mx * (llimb_t)a[i]) + hi; + tmp[i] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (carry=0, j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i<n; i++) { + limbx = (mx * (llimb_t)p[i] + hi) + tmp[i]; + tmp[i-1] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + limbx = tmp[i] + (hi + (llimb_t)carry); + tmp[i-1] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i<n; i++) { + limbx = (mx * (llimb_t)a[i] + hi) + tmp[i]; + tmp[i] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + mx = n0*tmp[0]; + limbx = hi + (llimb_t)carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i<n; i++) + ret[i] = (ret[i] & ~mask) | (tmp[i] & mask); +} + +#define MUL_MONT_IMPL(bits) \ +inline void mul_mont_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p, limb_t n0) \ +{ mul_mont_n(ret, a, b, p, n0, NLIMBS(bits)); } \ +\ +inline void sqr_mont_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p, limb_t n0) \ +{ mul_mont_n(ret, a, a, p, n0, NLIMBS(bits)); } + +/* + * 256-bit subroutines can handle arbitrary modulus, even non-"sparse", + * but we have to harmonize the naming with assembly. 
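+ * (Here "sparse" apparently refers to a modulus with spare bits in its
+ * top limb, as is the case for the 255-bit BLS12-381 group order, which
+ * the assembly exploits; the generic mul_mont_n() above makes no such
+ * assumption.)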
+ */ +#define mul_mont_256 mul_mont_sparse_256 +#define sqr_mont_256 sqr_mont_sparse_256 +MUL_MONT_IMPL(256) +#undef mul_mont_256 +#undef sqr_mont_256 +MUL_MONT_IMPL(384) + +static void add_mod_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n]; + size_t i; + + for (carry=0, i=0; i<n; i++) { + limbx = a[i] + (b[i] + (llimb_t)carry); + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i<n; i++) + ret[i] = (ret[i] & ~mask) | (tmp[i] & mask); +} + +#define ADD_MOD_IMPL(bits) \ +inline void add_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ add_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_MOD_IMPL(256) +ADD_MOD_IMPL(384) + +static void sub_mod_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + + for (borrow=0, i=0; i<n; i++) { + limbx = a[i] - (b[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + + for (carry=0, i=0; i<n; i++) { + limbx = ret[i] + ((p[i] & mask) + (llimb_t)carry); + ret[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } +} + +#define SUB_MOD_IMPL(bits) \ +inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_MOD_IMPL(256) +SUB_MOD_IMPL(384) + +static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], + size_t n) +{ + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n], two_a[n]; + size_t i; + + for (carry=0, i=0; i<n; i++) { + limb_t a_i = a[i]; + tmp[i] = a_i<<1 | carry; + carry = a_i>>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + two_a[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i<n; i++) + two_a[i] = (two_a[i] & ~mask) | (tmp[i] & mask); + + for (carry=0, i=0; i<n; i++) { + limbx = a[i] + (two_a[i] + (llimb_t)carry); + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i<n; i++) + ret[i] = (ret[i] & ~mask) | (tmp[i] & mask); +} + +#define MUL_BY_3_MOD_IMPL(bits) \ +inline void mul_by_3_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ mul_by_3_mod_n(ret, a, p, NLIMBS(bits)); } + +MUL_BY_3_MOD_IMPL(256) +MUL_BY_3_MOD_IMPL(384) + +static void lshift_mod_n(limb_t ret[], const limb_t a[], size_t count, + const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n]; + size_t i; + + while (count--) { + for (carry=0, i=0; i<n; i++) { + limb_t a_i = a[i]; + tmp[i] = a_i<<1 | carry; + carry = a_i>>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i<n; i++) + ret[i] = (ret[i] & ~mask) | (tmp[i] & mask); + + a = ret; + } +} + +#define LSHIFT_MOD_IMPL(bits) \ +inline void 
lshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ lshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +LSHIFT_MOD_IMPL(256) +LSHIFT_MOD_IMPL(384) + +static void cneg_mod_n(limb_t ret[], const limb_t a[], bool_t flag, + const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t borrow, mask, tmp[n]; + size_t i; + + for (borrow=0, i=0; i<n; i++) { + limbx = p[i] - (a[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; + mask = (limb_t)0 - flag; + + for(i=0; i<n; i++) + ret[i] = (a[i] & ~mask) | (tmp[i] & mask); +} + +#define CNEG_MOD_IMPL(bits) \ +inline void cneg_mod_##bits(vec##bits ret, const vec##bits a, bool_t flag, \ + const vec##bits p) \ +{ cneg_mod_n(ret, a, flag, p, NLIMBS(bits)); } + +CNEG_MOD_IMPL(256) +CNEG_MOD_IMPL(384) + +static limb_t check_mod_n(const byte a[], const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t borrow, ai, acc; + size_t i, j; + + for (acc=borrow=0, i=0; i<n; i++) { + for (ai=0, j=0; j<8*sizeof(limb_t); j+=8) + ai |= (limb_t)(*a++) << j; + acc |= ai; + limbx = ai - (p[i] + (llimb_t)borrow); + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + return borrow & (is_zero(acc) ^ 1); +} + +#define CHECK_MOD_IMPL(bits) \ +inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ +{ return check_mod_n(a, p, NLIMBS(bits)); } + +CHECK_MOD_IMPL(256) + +static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + add_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define ADD_N_CHECK_MOD_IMPL(bits) \ +inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_N_CHECK_MOD_IMPL(256) + +static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + sub_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define SUB_N_CHECK_MOD_IMPL(bits) \ +inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_N_CHECK_MOD_IMPL(256) + +static void from_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n]; + size_t i, j; + + for (j=0; j<n; j++) { + mx = n0*a[0]; + limbx = (mx * (llimb_t)p[0]) + a[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i<n; i++) { + limbx = (mx * (llimb_t)p[i] + hi) + a[i]; + tmp[i-1] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + tmp[i-1] = hi; + a = tmp; + } + + /* this is needed only if input can be non-fully-reduced */ + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + + for(i=0; i<n; i++) + ret[i] = (ret[i] & ~mask) | (tmp[i] & mask); +} + +#define FROM_MONT_IMPL(bits) \ +inline void 
from_mont_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p, limb_t n0) \ +{ from_mont_n(ret, a, p, n0, NLIMBS(bits)); } + +FROM_MONT_IMPL(256) +FROM_MONT_IMPL(384) + +static void redc_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mask, carry, borrow, mx, hi, tmp[n]; + const limb_t *b = a; + size_t i, j; + + for (j=0; j<n; j++) { + mx = n0*b[0]; + limbx = (mx * (llimb_t)p[0]) + b[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i<n; i++) { + limbx = (mx * (llimb_t)p[i] + hi) + b[i]; + tmp[i-1] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + tmp[i-1] = hi; + b = tmp; + } + + for (carry=0, i=0; i<n; i++) { + limbx = a[n+i] + (tmp[i] + (llimb_t)carry); + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + ret[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i<n; i++) + ret[i] = (ret[i] & ~mask) | (tmp[i] & mask); +} + +#define REDC_MONT_IMPL(bits, bits2) \ +inline void redc_mont_##bits(vec##bits ret, const vec##bits2 a, \ + const vec##bits p, limb_t n0) \ +{ redc_mont_n(ret, a, p, n0, NLIMBS(bits)); } + +REDC_MONT_IMPL(256, 512) +REDC_MONT_IMPL(384, 768) + +static void rshift_mod_n(limb_t ret[], const limb_t a[], size_t count, + const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t mask, carry, limb, next; + size_t i; + + while (count--) { + mask = 0 - (a[0] & 1); + for (carry=0, i=0; i<n; i++) { + limbx = a[i] + ((p[i]&mask) + (llimb_t)carry); + ret[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (next=ret[0], i=0; i<n-1; i++) { + limb = next >> 1; + next = ret[i+1]; + ret[i] = limb | next << (LIMB_T_BITS-1); + } + ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); + + a = ret; + } +} + +#define RSHIFT_MOD_IMPL(bits) \ +inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +RSHIFT_MOD_IMPL(256) +RSHIFT_MOD_IMPL(384) + +#define DIV_BY_2_MOD_IMPL(bits) \ +inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } + +DIV_BY_2_MOD_IMPL(384) + +static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t carry, borrow, ret, tmp[n]; + size_t i; + + ret = a[0] & 1; /* parity */ + + for (carry=0, i=0; i<n; i++) { + limb_t a_i = a[i]; + tmp[i] = a_i<<1 | carry; + carry = a_i>>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i<n; i++) { + limbx = tmp[i] - (p[i] + (llimb_t)borrow); + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + ret |= ((carry - borrow) & 2) ^ 2; + + return ret; +} + +inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) +{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } + +inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) +{ + vec384 tmp; + + from_mont_n(tmp, a, p, n0, NLIMBS(384)); + + return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); +} + +inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) +{ + limb_t re, im, sign, prty; + + re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); + im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); + + /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ + sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); + sign = (re & sign) | (im & ~sign); + + /* a->re==0 ? 
prty(a->im) : prty(a->re) */ + prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); + prty = (im & prty) | (re & ~prty); + + return (sign & 2) | (prty & 1); +} + +inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) +{ + vec384x tmp; + + from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); + from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); + + return sgn0_pty_mod_384x(tmp, p); +} + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0) +{ + vec384 aa, bb, cc; + + add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); + add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); + mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); + mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); + mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); + sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); + sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); + sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); +} + +/* + * mul_mont_n without final conditional subtraction, which implies + * that modulus is one bit short, which in turn means that there are + * no carries to handle between iterations... + */ +static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mx, hi, tmp[n+1]; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i<n; i++) { + limbx = (mx * (llimb_t)a[i]) + hi; + tmp[i] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i<n; i++) { + limbx = (mx * (llimb_t)p[i] + hi) + tmp[i]; + tmp[i-1] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + tmp[i-1] = tmp[i] + hi; + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i<n; i++) { + limbx = (mx * (llimb_t)a[i] + hi) + tmp[i]; + tmp[i] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + } + + vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); +} + +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b) +{ + while(count--) { + mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); + a = ret; + } + mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); +} + +void sqr_mont_382x(vec384x ret, const vec384x a, + const vec384 p, limb_t n0) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + vec384 t0, t1; + + /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i<NLIMBS(384); i++) { + limbx = a[0][i] + (a[1][i] + (llimb_t)carry); + t0[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ + for (borrow=0, i=0; i<NLIMBS(384); i++) { + limbx = a[0][i] - (a[1][i] + (llimb_t)borrow); + t1[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + + /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); + + /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i<NLIMBS(384); i++) { + limb_t a_i = ret[1][i]; + ret[1][i] = a_i<<1 | carry; + carry = a_i>>(LIMB_T_BITS-1); + } + + /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); + + /* account for t1's sign... 
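+     * if a[0] < a[1], t1 above wrapped around to a[0] - a[1] + 2^384, so
+     * the non-reduced Montgomery product picked up an extra t0*2^384,
+     * which the Montgomery division by 2^384 turns into +t0; subtract it
+     * back and, should that borrow, add p once below...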
*/ + for (borrow=0, i=0; i<NLIMBS(384); i++) { + limbx = ret[0][i] - ((t0[i] & mask) + (llimb_t)borrow); + ret[0][i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + for (carry=0, i=0; i<NLIMBS(384); i++) { + limbx = ret[0][i] + ((p[i] & mask) + (llimb_t)carry); + ret[0][i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } +} + +#define MSB(x) ((x) >> (LIMB_T_BITS-1)) + +static size_t num_bits(limb_t l) +{ + limb_t x, mask; + size_t bits = is_zero(l) ^ 1; + + if (sizeof(limb_t) == 8) { + x = l >> (32 & (8*sizeof(limb_t)-1)); + mask = 0 - MSB(0 - x); + bits += 32 & mask; + l ^= (x ^ l) & mask; + } + + x = l >> 16; + mask = 0 - MSB(0 - x); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0 - MSB(0 - x); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0 - MSB(0 - x); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0 - MSB(0 - x); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + bits += l >> 1; + + return bits; +} + +#if defined(__clang_major__) && __clang_major__>7 +__attribute__((optnone)) +#endif +static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) +{ + size_t r = LIMB_T_BITS - l; + limb_t mask = 0 - (is_zero(l)^1); + return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); +} + +/* + * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. + */ +static void ab_approximation_n(limb_t a_[2], const limb_t a[], + limb_t b_[2], const limb_t b[], size_t n) +{ + limb_t a_hi, a_lo, b_hi, b_lo, mask; + size_t i; + + i = n-1; + a_hi = a[i], a_lo = a[i-1]; + b_hi = b[i], b_lo = b[i-1]; + for (i--; --i;) { + mask = 0 - is_zero(a_hi | b_hi); + a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; + b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; + a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; + b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; + } + i = LIMB_T_BITS - num_bits(a_hi | b_hi); + /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] 
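num_bits above computes the bit length of a limb without data-dependent branches: it halves the search window (32, 16, 8, 4, 2, 1) and folds each decision in with a mask derived from whether the upper half is non-zero. A plain variable-time reference that should return the same values, useful only for cross-checking (bitlen_ref is a hypothetical helper, not part of the library):

#include <assert.h>
#include <stdint.h>

static size_t bitlen_ref(uint64_t l)
{
    size_t bits = 0;
    while (l) {        /* shift right until nothing is left */
        bits++;
        l >>= 1;
    }
    return bits;       /* 0 for l == 0, up to 64 for a full limb */
}

int main(void)
{
    assert(bitlen_ref(0) == 0);
    assert(bitlen_ref(1) == 1);
    assert(bitlen_ref(0x100) == 9);
    assert(bitlen_ref(UINT64_MAX) == 64);
    return 0;
}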
were zeros */ + + a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); + b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); +} + +typedef struct { limb_t f0, g0, f1, g1; } factors; + +static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], + size_t n) +{ + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; +} + +static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) +{ + llimb_t limbx = 0; + limb_t carry; + size_t i; + + for (carry=neg&1, i=0; i<n; i++) { + limbx = (llimb_t)(a[i] ^ neg) + carry; + ret[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + return 0 - MSB((limb_t)limbx); +} + +static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) +{ + llimb_t limbx; + limb_t carry; + size_t i; + + for (carry=0, i=0; i<n; i++) { + limbx = a[i] + (b[i] + (llimb_t)carry); + ret[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + return carry; +} + +static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) +{ + llimb_t limbx; + limb_t hi; + size_t i; + + for (hi=0, i=0; i<n; i++) { + limbx = (b * (llimb_t)a[i]) + hi; + ret[i] = (limb_t)limbx; + hi = (limb_t)(limbx >> LIMB_T_BITS); + } + + return hi; +} + +static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, + const limb_t b[], limb_t *g_, + size_t n) +{ + limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; + size_t i; + + /* |a|*|f_| */ + f = *f_; + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + (void)cneg_n(a_, a, neg, n); + hi = umul_n(a_, a_, f, n); + a_[n] = hi - (f & neg); + + /* |b|*|g_| */ + g = *g_; + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + (void)cneg_n(b_, b, neg, n); + hi = umul_n(b_, b_, g, n); + b_[n] = hi - (g & neg); + + /* |a|*|f_| + |b|*|g_| */ + (void)add_n(a_, a_, b_, n+1); + + /* (|a|*|f_| + |b|*|g_|) >> k */ + for (carry=a_[0], i=0; i<n; i++) { + hi = carry >> (LIMB_T_BITS-2); + carry = a_[i+1]; + ret[i] = hi | (carry << 2); + } + + /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ + neg = 0 - MSB(carry); + *f_ = (*f_ ^ neg) - neg; + *g_ = (*g_ ^ neg) - neg; + (void)cneg_n(ret, ret, neg, n); + + return neg; +} + +static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, + const limb_t v[], limb_t g, size_t n) +{ + limb_t u_[n], v_[n], neg, hi; + + /* |u|*|f_| */ + neg = 
0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + neg = cneg_n(u_, u, neg, n); + hi = umul_n(u_, u_, f, n) - (f&neg); + + /* |v|*|g_| */ + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + neg = cneg_n(v_, v, neg, n); + hi += umul_n(v_, v_, g, n) - (g&neg); + + /* |u|*|f_| + |v|*|g_| */ + hi += add_n(ret, u_, v_, n); + + return hi; +} + +static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], + const limb_t mod[], const limb_t modx[], size_t n) +{ + llimb_t limbx; + limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; + limb_t a_[2], b_[2], sign, carry, top; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + vec_zero(u, sizeof(u)); u[0] = 1; + vec_zero(v, sizeof(v)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); + (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + smul_2n(t, u, fg.f0, v, fg.g0, 2*n); + smul_2n(v, u, fg.f1, v, fg.g1, 2*n); + vec_copy(u, t, sizeof(u)); + } + + inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); + + sign = 0 - MSB(top); /* top is 1, 0 or -1 */ + for (carry=0, i=0; i<n; i++) { + limbx = ret[n+i] + ((modx[i] & sign) + (llimb_t)carry); + ret[n+i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + top += carry; + sign = 0 - top; /* top is 1, 0 or -1 */ + top |= sign; + for (i=0; i<n; i++) + a[i] = modx[i] & top; + (void)cneg_n(a, a, 0 - MSB(sign), n); + add_n(ret+n, ret+n, a, n); +} + +#define CT_INVERSE_MOD_IMPL(bits) \ +inline void ct_inverse_mod_##bits(vec##bits ret, const vec##bits inp, \ + const vec##bits mod, const vec##bits modx) \ +{ ct_inverse_mod_n(ret, inp, mod, modx, NLIMBS(bits)); } + +CT_INVERSE_MOD_IMPL(256) +CT_INVERSE_MOD_IMPL(384) + +/* + * Copy of inner_loop_n above, but with |L| updates. 
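ct_inverse_mod_n implements the constant-time inversion of https://eprint.iacr.org/2020/972: each outer iteration approximates a and b by their top and bottom limbs (ab_approximation_n), runs LIMB_T_BITS-2 divstep-like iterations in inner_loop_n, and applies the resulting transition factors (f0, g0, f1, g1) both to the operands and to the Bezout accumulators u and v; the tail conditionally folds modx back in to fix up the sign. Up to the scaling its callers handle, the quantity being computed is the modular inverse of the input. For intuition, a plain variable-time extended-Euclid inverse over small integers (inverse_mod is illustrative only, nothing like the library's constant-time code):

#include <stdio.h>
#include <stdint.h>

/* returns x with (a*x) % m == 1, assuming 0 < a < m and gcd(a, m) == 1 */
static int64_t inverse_mod(int64_t a, int64_t m)
{
    int64_t r0 = m, r1 = a, t0 = 0, t1 = 1;

    while (r1 != 0) {
        int64_t q = r0 / r1, tmp;
        tmp = r0 - q * r1; r0 = r1; r1 = tmp;   /* gcd step      */
        tmp = t0 - q * t1; t0 = t1; t1 = tmp;   /* Bezout update */
    }
    return t0 < 0 ? t0 + m : t0;
}

int main(void)
{
    printf("%lld\n", (long long)inverse_mod(7, 97));  /* 14, since 7*14 = 98 ≡ 1 (mod 97) */
    return 0;
}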
+ */ +static limb_t legendre_loop_n(limb_t L, factors *fg, const limb_t a_[2], + const limb_t b_[2], size_t n) +{ + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + L += ((t_lo & b_lo) >> 1) & borrow; + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + + L += (b_lo + 2) >> 2; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; + + return L; +} + +static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) +{ + limb_t a[n], b[n], t[n]; + limb_t a_[2], b_[2], neg, L = 0; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); + neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + L += (b[0] >> 1) & neg; + } + + L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + + return (L & 1) ^ 1; +} + +#define CT_IS_SQR_MOD_IMPL(bits) \ +inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ + const vec##bits mod) \ +{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } + +CT_IS_SQR_MOD_IMPL(384) + +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... 
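ct_is_sqr_mod_n decides quadratic residuosity with the same divstep machinery, except that the extra accumulator L carries the sign bookkeeping of a binary Jacobi/Legendre-symbol computation; the function returns 1 when the input is a square modulo |mod|. For an odd prime modulus the same answer comes from Euler's criterion: a^((p-1)/2) ≡ 1 (mod p) exactly when a is a non-zero square. A small-number check (powmod and is_square_mod_p are illustrative helpers; products must fit in 64 bits):

#include <stdio.h>
#include <stdint.h>

static uint64_t powmod(uint64_t a, uint64_t e, uint64_t m)
{
    uint64_t r = 1 % m;
    a %= m;
    while (e) {                 /* square-and-multiply */
        if (e & 1)
            r = (r * a) % m;
        a = (a * a) % m;
        e >>= 1;
    }
    return r;
}

/* Euler's criterion for an odd prime p and a not divisible by p */
static int is_square_mod_p(uint64_t a, uint64_t p)
{
    return powmod(a, (p - 1) / 2, p) == 1;
}

int main(void)
{
    printf("%d %d\n", is_square_mod_p(2, 7),    /* 1: 3*3 = 9 ≡ 2 (mod 7)      */
                      is_square_mod_p(3, 7));   /* 0: 3 is not a square mod 7  */
    return 0;
}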
Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) +{ + llimb_t Rx; + limb_t r_lo = div_top[0], r_hi = div_top[1]; + limb_t Q = 0, mask, borrow, rx; + size_t i; + + for (i = 0; i < LIMB_T_BITS; i++) { + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS); + + /* "if (R >= D) R -= D" */ + r_lo = ((r_lo ^ rx) & borrow) ^ rx; + rx = (limb_t)Rx; + r_hi = ((r_hi ^ rx) & borrow) ^ rx; + + Q <<= 1; + Q |= ~borrow & 1; + + /* "D >>= 1" */ + d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); + d_hi >>= 1; + } + + mask = 0 - MSB(Q); /* does it overflow? */ + + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + + Q <<= 1; + Q |= borrow ^ 1; + + return (Q | mask); +} + +static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, + limb_t quotient, size_t n) +{ + llimb_t limbx; + limb_t tmp[n+1], carry, mask, borrow; + size_t i; + + /* divisor*quotient */ + for (carry=0, i=0; i<n; i++) { + limbx = (quotient * (llimb_t)divisor[i]) + carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + tmp[i] = carry; + + /* remainder = dividend - divisor*quotient */ + for (borrow=0, i=0; i<=n; i++) { + limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + + /* if quotient was off by one, add divisor to the remainder */ + for (carry=0, i=0; i<n; i++) { + limbx = tmp[i] + ((divisor[i] & mask) + (llimb_t)carry); + div_rem[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + return (div_rem[i] = quotient + mask); +} + +inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } + +inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } + +/* + * Unlock reference implementations in vect.c + */ +#define mul_by_8_mod_384 mul_by_8_mod_384 +#define mul_by_8_mod_384x mul_by_8_mod_384x +#define mul_by_3_mod_384x mul_by_3_mod_384x +#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x +#define add_mod_384x add_mod_384x +#define sub_mod_384x sub_mod_384x +#define lshift_mod_384x lshift_mod_384x +#define sqr_mont_384x sqr_mont_384x + +inline void vec_prefetch(const void *ptr, size_t len) +{ (void)ptr; (void)len; } + +/* + * SHA-256 + */ +#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) +#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) +#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) +#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) +#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) +#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +void blst_sha256_block_data_order(unsigned int *v, const void *inp, + size_t blocks) +{ + static const unsigned int K256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 
0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; + const unsigned char *data = inp; + size_t round; + + a = v[0]; + b = v[1]; + c = v[2]; + d = v[3]; + e = v[4]; + f = v[5]; + g = v[6]; + h = v[7]; + + while (blocks--) { + for (round = 0; round < 16; round++) { + l = (unsigned int)data[0] << 24; + l |= (unsigned int)data[1] << 16; + l |= (unsigned int)data[2] << 8; + l |= (unsigned int)data[3]; + data += 4; + T1 = X[round] = l; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; round < 64; round++) { + s0 = X[(round + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(round + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + a += v[0]; v[0] = a; + b += v[1]; v[1] = b; + c += v[2]; v[2] = c; + d += v[3]; v[3] = d; + e += v[4]; v[4] = e; + f += v[5]; v[5] = f; + g += v[6]; v[6] = g; + h += v[7]; v[7] = h; + } +} +#undef ROTR +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj + +void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) +{ + size_t i; + + for (i=0; i<8; i++) + dst[i] = src[i]; +} + +void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + size_t i; + + for (i=0; i<8; i++, md+=4) { + unsigned int h_i = h[i]; + md[0] = (unsigned char)(h_i >> 24); + md[1] = (unsigned char)(h_i >> 16); + md[2] = (unsigned char)(h_i >> 8); + md[3] = (unsigned char)h_i; + } +} + +void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) +{ + unsigned char *dst = dst_; + const unsigned char *src = src_; + size_t i; + + for (i=0; i<len; i++) + dst[i] = src[i]; +} diff --git a/blst/pairing.c b/blst/pairing.c new file mode 100644 index 0000000..8d19b98 --- /dev/null +++ b/blst/pairing.c @@ -0,0 +1,443 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * Line evaluations from https://eprint.iacr.org/2010/354.pdf + * with a twist moving common expression to line_by_Px2. + */ +static void line_add(vec384fp6 line, POINTonE2 *T, const POINTonE2 *R, + const POINTonE2_affine *Q) +{ + vec384x Z1Z1, U2, S2, H, HH, I, J, V; +#if 1 +# define r line[1] +#else + vec384x r; +#endif + + /* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with XYZ3 being |T|, XYZ1 - |R|, XY2 - |Q|, i.e. 
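The SHA-256 helpers above (blst_sha256_block_data_order, blst_sha256_hcopy, blst_sha256_emit, blst_sha256_bcopy) appear to be the portable counterparts of the assembly routines. A minimal driver hashing the 3-byte message "abc" in one padded block, assuming this file is compiled into the program; the initial-state constants and the expected digest ba7816bf...f20015ad are the standard SHA-256 test vector:

#include <stdio.h>
#include <string.h>

void blst_sha256_block_data_order(unsigned int *v, const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]);

int main(void)
{
    unsigned int h[8] = {                     /* standard SHA-256 initial state */
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    unsigned char block[64] = {0}, md[32];
    size_t i;

    memcpy(block, "abc", 3);                  /* message                  */
    block[3] = 0x80;                          /* padding bit              */
    block[63] = 24;                           /* bit length, big-endian   */

    blst_sha256_block_data_order(h, block, 1);
    blst_sha256_emit(md, h);

    for (i = 0; i < 32; i++)                  /* expect ba7816bf...f20015ad */
        printf("%02x", md[i]);
    printf("\n");
    return 0;
}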
Q is affine + */ + sqr_fp2(Z1Z1, R->Z); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t i; + + if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | + vec_is_zero(&P[0], sizeof(P[0]))) ) { + /* + * Special case of infinite aggregated signature, pair the additive + * group's identity with the multiplicative group's identity. + */ + vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); + return; + } + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ + pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
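The schedule in miller_loop_n — one start_dbl_n, then add_n_dbl_n with 2, 3, 9, 32 and 16 doublings — walks the bits of 0xd201000000010000, the absolute value of the BLS12-381 parameter z; the closing conjugate_fp12 accounts for z being negative. Modelling one add_n_dbl_n(..., k) call as x <- (x+1)*2^k, a quick check that the schedule indeed reconstructs that constant (illustrative only):

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

int main(void)
{
    const unsigned k[] = {2, 3, 9, 32, 16};   /* doubling counts used above      */
    uint64_t x = 2;                           /* after the initial doubling: 0x2 */
    size_t i;

    for (i = 0; i < sizeof(k)/sizeof(k[0]); i++)
        x = (x + 1) << k[i];                  /* one addition, then k[i] doublings */

    assert(x == 0xd201000000010000ULL);       /* |z| for BLS12-381 */
    return 0;
}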
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from <zkcrypto>/pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, + P ? 
P : (const POINTonE1_affine *)&BLS12_381_G1, 1); +} + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } + +static bool_t is_cyclotomic(const vec384fp12 f) +{ + vec384fp12 a, b; + + frobenius_map_fp12(a, f, 2); + frobenius_map_fp12(b, a, 2); + mul_fp12(b, b, f); + + return vec_is_equal(a, b, sizeof(a)); +} + +int blst_fp12_in_group(const vec384fp12 f) +{ + vec384fp12 a, b; + + if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) + return 0; + + frobenius_map_fp12(a, f, 1); + raise_to_z(b, f); + + return (int)vec_is_equal(a, b, sizeof(a)); +} diff --git a/blst/point.h b/blst/point.h new file mode 100644 index 0000000..4d041b0 --- /dev/null +++ b/blst/point.h @@ -0,0 +1,61 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const byte *scalar, size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, bool_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, bool_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, bool_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/blst/rb_tree.c b/blst/rb_tree.c new file mode 100644 index 0000000..207becd --- /dev/null +++ b/blst/rb_tree.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include <stddef.h> + +/* + * Red-black tree tailored for uniqueness test. 
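The insert-only red-black tree implemented below backs the blst_uniq_* helpers (blst_uniq_sizeof, blst_uniq_init, blst_uniq_test), which check that a batch of messages is pairwise distinct. A minimal sketch of the intended calling pattern, assuming this file is linked in; the tree is treated as opaque by the caller and sized with blst_uniq_sizeof:

#include <stdio.h>
#include <stdlib.h>

struct rb_tree;                                       /* opaque to the caller */
size_t blst_uniq_sizeof(size_t n_nodes);
void blst_uniq_init(struct rb_tree *tree);
int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len);

int main(void)
{
    struct rb_tree *tree = malloc(blst_uniq_sizeof(3));  /* room for 3 messages */

    blst_uniq_init(tree);
    printf("%d\n", blst_uniq_test(tree, "msg-1", 5));     /* 1: first time seen */
    printf("%d\n", blst_uniq_test(tree, "msg-2", 5));     /* 1: first time seen */
    printf("%d\n", blst_uniq_test(tree, "msg-1", 5));     /* 0: duplicate       */

    free(tree);
    return 0;
}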
Amount of messages to be + * checked is known prior context initialization, implementation is + * insert-only, failure is returned if message is already in the tree. + */ + +struct node { + struct node *leafs[2]; + const void *data; + size_t len_n_colour; /* len<<1 | colour */ +}; + +struct rb_tree { + struct node *root; + size_t n_nodes; + struct node nodes[1]; +}; + +static long bytes_compare(const unsigned char *ptr0, size_t len0, + const unsigned char *ptr1, size_t len1) +{ + size_t i, len = len0<len1 ? len0 : len1; + long a, b; + + for (i=0; i<len; i++) { + if ((a = ptr0[i]) != (b = ptr1[i])) + return a - b; + } + + return (long)len0 - (long)len1; +} + +#define PAINT_BLACK(p) ((p)->len_n_colour &= ~(size_t)1) +#define PAINT_RED(p) ((p)->len_n_colour |= 1) +#define IS_RED(p) ((p)->len_n_colour & 1) + +static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len) +{ + struct node *nodes[8*sizeof(void *)]; /* visited nodes */ + unsigned char dirs[8*sizeof(void *)]; /* taken directions */ + size_t k = 0; /* walked distance */ + struct node *p, *y, *z; + + for (p = tree->root; p != NULL; k++) { + long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1); + + if (cmp == 0) + return 0; /* already in tree, no insertion */ + + /* record the step */ + nodes[k] = p; + p = p->leafs[(dirs[k] = cmp>0)]; + } + + /* allocate new node */ + z = &tree->nodes[tree->n_nodes++]; + z->leafs[0] = z->leafs[1] = NULL; + z->data = data; + z->len_n_colour = len<<1; + PAINT_RED(z); + + /* graft |z| */ + if (k > 0) + nodes[k-1]->leafs[dirs[k-1]] = z; + else + tree->root = z; + + /* re-balance |tree| */ + while (k >= 2 && IS_RED(y = nodes[k-1])) { + size_t ydir = dirs[k-2]; + struct node *x = nodes[k-2], /* |z|'s grandparent */ + *s = x->leafs[ydir^1]; /* |z|'s uncle */ + + if (s != NULL && IS_RED(s)) { + PAINT_RED(x); + PAINT_BLACK(y); + PAINT_BLACK(s); + k -= 2; + } else { + if (dirs[k-1] != ydir) { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + struct node *t = y; + y = y->leafs[ydir^1]; + t->leafs[ydir^1] = y->leafs[ydir]; + y->leafs[ydir] = t; + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? s + */ + x->leafs[ydir] = y->leafs[ydir^1]; + y->leafs[ydir^1] = x; + + PAINT_RED(x); + PAINT_BLACK(y); + + if (k > 2) + nodes[k-3]->leafs[dirs[k-3]] = y; + else + tree->root = y; + + break; + } + } + + PAINT_BLACK(tree->root); + + return 1; +} + +#undef IS_RED +#undef PAINT_RED +#undef PAINT_BLACK + +size_t blst_uniq_sizeof(size_t n_nodes) +{ return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1); } + +void blst_uniq_init(struct rb_tree *tree) +{ + tree->root = NULL; + tree->n_nodes = 0; +} + +int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len) +{ return (int)rb_tree_insert(tree, data, len); } diff --git a/blst/recip-addchain.h b/blst/recip-addchain.h new file mode 100644 index 0000000..e4e436a --- /dev/null +++ b/blst/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base. 
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/blst/recip.c b/blst/recip.c new file mode 100644 index 0000000..e0c7006 --- /dev/null +++ b/blst/recip.c @@ -0,0 +1,139 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. + */ +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + static const byte BLS12_381_P_minus_2[] = { + TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), + TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), + TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} +#else +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P +# undef sqr_n_mul +# undef mul +# undef sqr +#endif + +static void flt_reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + flt_reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const vec384 Px8 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), + TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), + TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) + }; +#ifdef __BLST_NO_ASM__ +# define RRx4 BLS12_381_RR +#else + static const vec384 RRx4 = { /* (4<<768)%P */ + TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), + TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), + TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) + }; +#endif + union { vec768 x; vec384 r[2]; } temp; + + 
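flt_reciprocal_fp2 above leans on the identity 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i, which holds because i^2 = -1 in the quadratic extension, so (a + b*i)(a - b*i) = a^2 + b^2. The stand-alone toy program below is an illustration only, not blst code: it works over GF(11^2) (11 is congruent to 3 mod 4, so x^2 + 1 is irreducible) and uses a Fermat-inverse in the base field instead of the constant-time routines used here, but it checks the same identity numerically.

#include <stdio.h>
#include <stdint.h>

#define P 11u   /* toy prime with P % 4 == 3, so i^2 = -1 in GF(P^2) = GF(P)[i]/(i^2+1) */

static uint64_t powmod(uint64_t b, uint64_t e)
{
    uint64_t r = 1;
    for (b %= P; e; e >>= 1, b = b * b % P)
        if (e & 1)
            r = r * b % P;
    return r;
}

int main(void)
{
    uint64_t a = 3, b = 7;                     /* invert a + b*i in GF(P^2) */
    uint64_t norm = (a * a + b * b) % P;       /* a^2 + b^2 */
    uint64_t ninv = powmod(norm, P - 2);       /* 1/(a^2 + b^2) by Fermat's little theorem */
    uint64_t re = a * ninv % P;                /*  a/(a^2 + b^2) */
    uint64_t im = (P - b) * ninv % P;          /* -b/(a^2 + b^2) */
    /* (a + b*i)*(re + im*i) = (a*re - b*im) + (a*im + b*re)*i; expect 1 + 0*i */
    uint64_t prod_re = (a * re % P + P - b * im % P) % P;
    uint64_t prod_im = (a * im + b * re) % P;
    printf("product = %llu + %llu*i\n",
           (unsigned long long)prod_re, (unsigned long long)prod_im);
    return 0;
}

With a = 3, b = 7 the program prints "product = 1 + 0*i", matching the formula used by both flt_reciprocal_fp2 and reciprocal_fp2.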
ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); + redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* sign goes straight to flt_reciprocal */ + mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); + if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | + vec_is_zero(temp.r[1], sizeof(vec384))) + vec_copy(out, temp.r[0], sizeof(vec384)); + else + flt_reciprocal_fp(out, inp); +#else + vec_copy(out, temp.r[0], sizeof(vec384)); +#endif +#undef RRx4 +} + +void blst_fp_inverse(vec384 out, const vec384 inp) +{ reciprocal_fp(out, inp); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ reciprocal_fp(ret, a); } + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +void blst_fp2_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +static void reciprocal_fr(vec256 out, const vec256 inp) +{ + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + + ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); + redc_mont_256(out, temp, BLS12_381_r, r0); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fr_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } + +void blst_fr_eucl_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } diff --git a/blst/server.c b/blst/server.c new file mode 100644 index 0000000..52c1812 --- /dev/null +++ b/blst/server.c @@ -0,0 +1,24 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "map_to_g1.c" +#include "e2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "bulk_addition.c" +#include "multi_scalar.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" +#include "rb_tree.c" diff --git a/blst/sha256.h b/blst/sha256.h new file mode 100644 index 0000000..77ddb6d --- /dev/null +++ b/blst/sha256.h @@ -0,0 +1,140 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include "vect.h" + +#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ + defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_data_order_shaext +#elif defined(__aarch64__) && \ + defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_armv8 +#else +# define sha256_block_data_order blst_sha256_block_data_order +#endif +#define sha256_hcopy blst_sha256_hcopy +#define sha256_bcopy blst_sha256_bcopy +#define sha256_emit blst_sha256_emit + +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. + */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((len != 0) & ((n = ctx->off) != 0)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) { + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/blst/sqrt-addchain.h b/blst/sqrt-addchain.h new file mode 100644 
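The sha256.h helpers just shown follow the usual streaming pattern: sha256_init, any number of sha256_update calls, then sha256_final. A minimal usage sketch follows; it is my own example rather than part of this commit, and it assumes it is compiled inside the blst source tree and linked against the library so that vect.h and the assembly-backed sha256_block_data_order / sha256_emit / sha256_bcopy symbols resolve.

#include <stdio.h>
#include "sha256.h"   /* path assumed relative to the blst sources */

int main(void)
{
    SHA256_CTX ctx;
    unsigned char md[32];
    static const char msg[] = "abc";

    sha256_init(&ctx);
    sha256_update(&ctx, msg, sizeof(msg) - 1);  /* hash "abc" without the trailing NUL */
    sha256_final(md, &ctx);

    for (size_t i = 0; i < sizeof(md); i++)
        printf("%02x", md[i]);
    printf("\n");   /* FIPS 180 test vector for "abc": ba7816bf...f20015ad */
    return 0;
}

Because sha256_init/update/final are static functions defined in the header, only the low-level block, copy, and emit routines come from the library itself.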
index 0000000..4e7f0be --- /dev/null +++ b/blst/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). + * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); 
*/ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* 
sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 
680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* 
sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ 
/* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 
5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/blst/sqrt.c b/blst/sqrt.c new file mode 100644 index 0000000..cf149fd --- /dev/null +++ b/blst/sqrt.c @@ -0,0 +1,261 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const byte BLS_12_381_P_minus_3_div_4[] = { + TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), + TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), + TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static bool_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +int blst_fp_sqrt(vec384 out, const vec384 inp) +{ return (int)sqrt_fp(out, inp); } + +int blst_fp_is_square(const vec384 inp) +{ + return (int)ct_is_square_mod_384(inp, BLS12_381_P); +} + +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + bool_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +/* + * |inp| = a + b*i + */ +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, + const vec384x magic_ZZZ) +{ + vec384 aa, bb, cc; + vec384x inp_; + bool_t is_sqrt; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ + + /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ + mul_fp2(inp_, inp, recip_ZZZ); + /* ... and adjust |aa| and |cc| accordingly */ + { + vec384 za, zc; + + mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ + mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ + vec_select(aa, aa, za, sizeof(aa), is_sqrt); + vec_select(cc, cc, zc, sizeof(cc), is_sqrt); + } + vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); + + mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ + + sub_fp(bb, inp_[0], aa); + add_fp(aa, inp_[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(out[1], inp_[1]); + mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* bound to succeed */ + (void)sqrt_align_fp2(out, out, out, inp_); + + mul_fp(out[0], out[0], cc); /* inverse the result */ + mul_fp(out[1], out[1], cc); + neg_fp(out[1], out[1]); + + return is_sqrt; +} + +static bool_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + /* don't pay attention to return value, final "align" will tell... */ + (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ + + sub_fp(bb, inp[0], aa); + add_fp(aa, inp[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(ret[1], inp[1]); + mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, ret, inp); +} + +int blst_fp2_sqrt(vec384x out, const vec384x inp) +{ return (int)sqrt_fp2(out, inp); } + +int blst_fp2_is_square(const vec384x inp) +{ + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + return (int)ct_is_square_mod_384(aa, BLS12_381_P); +} diff --git a/blst/vect.c b/blst/vect.c new file mode 100644 index 0000000..1834a48 --- /dev/null +++ b/blst/vect.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +#ifdef __BLST_NO_ASM__ +# include "no_asm.h" +#endif + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, + const vec384 mod) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, + const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, + const vec384 mod) +{ + lshift_mod_384(ret[0], a[0], n, mod); + lshift_mod_384(ret[1], a[1], n, mod); +} +#endif + +#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif + +limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); +limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); +limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); + +/* + * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
+ */ +static void div_by_zz(limb_t val[]) +{ + static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), + TO_LIMB_T(0xac45a4010001a402) }; + size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); + limb_t d_lo, d_hi; + + d_lo = zz[zz_len - 2]; + d_hi = zz[zz_len - 1]; + for (loop = zz_len, zz_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); + (void)quot_rem_128(val + loop, zz, q); + } + /* remainder is in low half of val[], quotient is in high */ +} + +/* + * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. + */ +static void div_by_z(limb_t val[]) +{ + static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; + size_t loop, z_len = sizeof(z)/sizeof(z[0]); + limb_t d_lo, d_hi; + + d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; + d_hi = z[z_len - 1]; + for (loop = z_len, z_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); + (void)quot_rem_64(val + loop, z, q); + } + /* remainder is in low half of val[], quotient is in high */ +} diff --git a/blst/vect.h b/blst/vect.h new file mode 100644 index 0000000..11b5836 --- /dev/null +++ b/blst/vect.h @@ -0,0 +1,483 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include <stddef.h> + +#if defined(__x86_64__) || defined(__aarch64__) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) +typedef unsigned int limb_t; +# define LIMB_T_BITS 32 +# ifndef __BLST_NO_ASM__ +# define __BLST_NO_ASM__ +# endif + +#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# define __BLST_NO_ASM__ +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +typedef unsigned char byte; +#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ + (byte)(limb64>>16),(byte)(limb64>>24),\ + (byte)(limb64>>32),(byte)(limb64>>40),\ + (byte)(limb64>>48),(byte)(limb64>>56) +typedef byte pow256[256/8]; + +/* + * Internal Boolean type, Bolean by value, hence safe to cast to or + * reinterpret as 'bool'. + */ +typedef limb_t bool_t; + +/* + * Assembly subroutines... + */ +#if defined(__ADX__) /* e.g. 
-march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +# define ct_inverse_mod_383 ctx_inverse_mod_383 +#elif defined(__BLST_NO_ASM__) +# define ct_inverse_mod_383 ct_inverse_mod_384 +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, + const vec256 one); +limb_t check_mod_256(const pow256 a, const vec256 p); +limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); +limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); + +void vec_prefetch(const void *ptr, size_t len); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); +void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod, + const vec384 
modx); +void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, + const vec256 modx); +bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); + +#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, + const vec384 p, limb_t n0, const vec384x b); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void div_by_zz(limb_t val[]); +static void div_by_z(limb_t val[]); + +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if defined(__CUDA_ARCH__) +# define inline inline __device__ +#endif + +#if !defined(inline) && !defined(__cplusplus) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +static inline bool_t is_bit_set(const byte *v, size_t i) +{ return (v[i/8] >> (i%8)) & 1; } + +static inline bool_t byte_is_zero(unsigned char c) +{ return ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); } + +static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) +{ + unsigned char acc; + size_t i; + + for (acc = 0, i = 0; i < num; i++) + acc |= a[i]; + + return byte_is_zero(acc); +} + +static inline void bytes_zero(unsigned char *a, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + a[i] = 0; +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + bool_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask = (limb_t)0 - cbit; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? 
a : b */ +#ifdef __CUDA_ARCH__ +extern "C" { +__device__ void vec_select_48(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_96(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_192(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_144(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_288(void *ret, const void *a, const void *b, + unsigned int sel_a); +} +#else +void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); +#endif +static inline void vec_select(void *ret, const void *a, const void *b, + size_t num, bool_t sel_a) +{ +#ifndef __BLST_NO_ASM__ + if (num == 48) vec_select_48(ret, a, b, sel_a); + else if (num == 96) vec_select_96(ret, a, b, sel_a); + else if (num == 144) vec_select_144(ret, a, b, sel_a); + else if (num == 192) vec_select_192(ret, a, b, sel_a); + else if (num == 288) vec_select_288(ret, a, b, sel_a); +#else + if (0) ; +#endif + else { + limb_t bi, *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = (limb_t)0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } + } +} + +static inline bool_t is_zero(limb_t l) +{ return (~l & (l - 1)) >> (LIMB_T_BITS - 1); } + +static inline bool_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return is_zero(acc); +} + +static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return is_zero(acc); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + volatile limb_t *rp = (volatile limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; + +#if defined(__GNUC__) && !defined(__NVCC__) + asm volatile("" : : "r"(ret) : "memory"); +#endif +} + +static inline void limbs_from_be_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= *in++; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... 
+ */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void be_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + limb_t limb; + + while(n--) { + limb = in[n / sizeof(limb_t)]; + *out++ = (unsigned char)(limb >> (8 * (n % sizeof(limb_t)))); + } +} + +static inline void limbs_from_le_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= in[n]; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... + */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + limb_t limb; + size_t i, j, r; + + if ((uptr_t)out == (uptr_t)in && is_endian.little) + return; + + r = n % sizeof(limb_t); + n /= sizeof(limb_t); + + for(i = 0; i < n; i++) { + for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) + *out++ = (unsigned char)limb; + } + if (r) { + for (limb = in[i], j = 0; j < r; j++, limb >>= 8) + *out++ = (unsigned char)limb; + } +} + +/* + * Some compilers get arguably overzealous(*) when passing pointer to + * multi-dimensional array [such as vec384x] as 'const' argument. + * General direction seems to be to legitimize such constification, + * so it's argued that suppressing the warning is appropriate. + * + * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm + */ +#if defined(__INTEL_COMPILER) +# pragma warning(disable:167) +# pragma warning(disable:556) +#elif defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic ignored "-Wpedantic" +#elif defined(_MSC_VER) +# pragma warning(disable: 4127 4189) +#endif + +#if !defined(__wasm__) +# include <stdlib.h> +#endif + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include <alloca.h> +#elif defined(_WIN32) +# include <malloc.h> +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif + +#endif /* __BLS12_381_ASM_VECT_H__ */ diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..84bcc77 --- /dev/null +++ b/build.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +CFLAGS=${CFLAGS:--O -fno-builtin -fPIC -Wall -Wextra} +CC=gcc +AR=ar + +${CC} ${CFLAGS} -c blst/server.c +${CC} ${CFLAGS} -c blst/assembly.S +${AR} rc libblst.a server.o assembly.o + +${CC} ${CFLAGS} -o ctm ctm.c fstoken.c debugprint.c libblst.a \ No newline at end of file diff --git a/ctm b/ctm new file mode 100755 index 0000000..0932e57 Binary files /dev/null and b/ctm differ diff --git a/ctm.c b/ctm.c new file mode 100644 index 0000000..7e13f45 --- /dev/null +++ b/ctm.c @@ -0,0 +1,404 @@ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <dirent.h> +#include <unistd.h> +#include <sys/random.h> +#include <stdbool.h> +#include <assert.h> +#include "blst/blst.h" +#include "debugprint.h" +#include "fstoken.h" + +char* token_path; + +void print_help(char* name){ + printf("FemtoStar Credit Token Manager (ctm)\n"); + printf("This tool can be used to generate and process Credit Tokens for use with the FemtoStar Protocol.\n\n"); + + printf("Warning: This tool lets you do insecure or broken things! Be careful with it.\n"); + printf("ctm is still under development! 
Do not assume it is secure or complete yet.\n"); + printf("In particular, note that keys and tokens are currently stored unencrypted on-disk.\n\n"); + + printf("%s help - display this help\n", name); + printf("%s path - display your token path\n", name); + printf("%s list - list targets you have keys for - use \"%s list verbose\" to also display paths to the keys\n", name, name); + printf("%s keygen [targ] - create a new target keypair [targ] using the default token format (128/256) and the system true randomness source\n", name); + printf("%s keygen [targ] [tfs] - create a new target keypair [targ] using token format [tfs] and the system true randomness source\n", name); + printf("%s keygen [targ] [tfs] [ikm] - create a new target keypair [targ] using token format specifier [tfs] and 32-byte hexadecimal seed [ikm]\n", name); + printf("%s keydump [targ] - dump Public and, if available, Secret Keys for target [targ]\n", name); + printf("%s keyrepair [targ] - regenerate a missing Public Key for a target [targ] for which a Secret Key is available\n", name); + printf("%s req [targ] - generate a token request for target [targ]\n", name); +} + +int get_key_paths(char* target, char** sk_path, char** pk_path, char** tfs_path){ + int key_path_len; + + key_path_len = strlen(token_path) + strlen(target) + 13; + + *sk_path = malloc(key_path_len); + if(*sk_path == NULL) return 1; + + *pk_path = malloc(key_path_len); + if(*pk_path == NULL) return 1; + + *tfs_path = malloc(key_path_len + 1); + if(*tfs_path == NULL) return 1; + + strcpy(*sk_path, token_path); + strcat(*sk_path, "/targets/"); + strcat(*sk_path, target); + strcpy(*pk_path, *sk_path); + strcpy(*tfs_path, *sk_path); + strcat(*sk_path, ".sk"); + strcat(*pk_path, ".pk"); + strcat(*tfs_path, ".tfs"); + + return 0; +} + +int get_keys(char* target, byte* sk, byte* pk, int* idbits, int* hashbits){ // pk/sk/idbits/hasbits pointers can be NULL if you don't want to read those + FILE *targ_file; + char *sk_path; + char *pk_path; + char *tfs_path; + bool sk_available, pk_available, tfs_available; + int idbits_buf, hashbits_buf; + + get_key_paths(target, &sk_path, &pk_path, &tfs_path); + + sk_available = (access(sk_path, R_OK) == 0); + pk_available = (access(pk_path, R_OK) == 0); + tfs_available = (access(tfs_path, R_OK) == 0); + + if(sk_available && sk != NULL){ + targ_file = fopen(sk_path, "r"); + if(!targ_file){ + printf("Could not open Secret Key file. Exiting.\n"); + return 1; + } + fread(sk, 32, 1, targ_file); + fclose(targ_file); + } + + if(pk_available && pk != NULL){ + targ_file = fopen(pk_path, "r"); + if(!targ_file){ + printf("Could not open Public Key file. Exiting.\n"); + return 1; + } + fread(pk, 96, 1, targ_file); + fclose(targ_file); + } + + if(idbits != NULL || hashbits != NULL){ + if(tfs_available){ + targ_file = fopen(tfs_path, "r"); + if(!targ_file){ + printf("Could not open Token Format Specifier file. Exiting.\n"); + return 1; + } + fscanf(targ_file, "%i/%i", &idbits_buf, &hashbits_buf); + fclose(targ_file); + + if(idbits != NULL) *idbits = idbits_buf; + if(hashbits != NULL) *hashbits = hashbits_buf; + }else{ + printf("WARNING: Token Format Specifier not set, this is a broken state. 
Using default (128/256) - please add a .tfs file for the target\n"); + if(idbits != NULL) *idbits = IDBITS_DEFAULT; + if(hashbits != NULL) *hashbits = HASHBITS_DEFAULT; + } + } + + // 0 = no keys (bad target), 1 = PK only, 2 = SK only (broken state), 3 = PK+SK (can sign) + return (2 * sk_available) + pk_available; +} + +int keydump(char* target){ + byte sk[32]; + byte pk[96]; + int key_status; + int idbits, hashbits; + + key_status = get_keys(target, sk, pk, &idbits, &hashbits); + + switch(key_status){ + case 0: + printf("No keys found - target unknown.\n"); + break; + case 1: + printf("Public Key available - can verify and request for this target\n"); + print_bytes("Public Key: ", pk, 96); + break; + case 2: + printf("Secret Key ONLY available - this is a broken state, please keyrepair this keypair (see help)\n"); + print_bytes("Secret Key: ", sk, 32); + break; + case 3: + printf("Secret Key and Public Key available - can verify, request, and sign for this target.\n"); + print_bytes("Secret Key: ", sk, 32); + print_bytes("Public Key: ", pk, 96); + break; + } + + printf("Token Format Specifier: %i/%i (%i ID bits, %i hash bits)\n", idbits, hashbits, idbits, hashbits); + + return 0; +} + +int keyrepair(char* target){ + FILE *key_file; + byte sk[32]; + byte pk[96]; + char* sk_path; + char* pk_path; + char* tfs_path; + int key_status; + + key_status = get_keys(target, sk, NULL, NULL, NULL); + + if(key_status != 2){ + printf("This target does not refer to a keypair with only a Secret Key available. Exiting.\n"); + return 1; + } + + printf("Regenerating Public Key from Private Key for broken keypair %s\n", target); + + fstoken_get_pk_from_sk(sk, pk); + debug_print_bytes("Regenerated Public Key: ", pk, 96); + + get_key_paths(target, &sk_path, &pk_path, &tfs_path); + + key_file = fopen(pk_path, "w"); + if(!key_file){ + printf("Could not open Public Key file. Exiting.\n"); + return 1; + } + fwrite(pk, 96, 1, key_file); + fclose(key_file); + + printf("Saved to %s\n", pk_path); + + return 0; +} + +int keygen(char* target, byte* ikm, int idbits, int hashbits){ + char *sk_path; + char *pk_path; + char* tfs_path; + FILE *targ_file; + byte sk_byte[32]; + byte pk_byte[96]; + + debug_print_bytes("IKM: ", ikm, 32); + + fstoken_keygen(ikm, sk_byte, pk_byte); + + debug_print_bytes("Secret Key: ", sk_byte, 32); + debug_print_bytes("Public Key: ", pk_byte, 96); + + if(get_key_paths(target, &sk_path, &pk_path, &tfs_path)) return 1; + + printf("Writing Secret Key to %s\n", sk_path); + + targ_file = fopen(sk_path, "w"); + if(!targ_file){ + printf("Could not open Secret Key file. Exiting.\n"); + return 1; + } + fwrite(sk_byte, 32, 1, targ_file); + fclose(targ_file); + + printf("Writing Public Key to %s\n", pk_path); + + targ_file = fopen(pk_path, "w"); + if(!targ_file){ + printf("Could not open Public Key file. Exiting.\n"); + return 1; + } + fwrite(pk_byte, 96, 1, targ_file); + fclose(targ_file); + + printf("Writing Token Format Specifier to %s\n", tfs_path); + + targ_file = fopen(tfs_path, "w"); + if(!targ_file){ + printf("Could not open Token Format Specifier file. 
Exiting.\n"); + return 1; + } + fprintf(targ_file, "%i/%i", idbits, hashbits); + fclose(targ_file); + + return 0; +} + +void print_path(){ + printf("Token Path (from FEMTOSTAR_TOKEN_PATH environment variable): %s\n", token_path); +} + +bool string_endswith(const char *str, const char *suffix){ + if (!str || !suffix) + return 0; + size_t lenstr = strlen(str); + size_t lensuffix = strlen(suffix); + if (lensuffix > lenstr) + return 0; + return strncmp(str + lenstr - lensuffix, suffix, lensuffix) == 0; +} + +// This function is awful because strings in C. It should probably be improved. +int list_targets(bool verbose){ + printf("Listing all targets - you have secret keys for, and can issue tokens for, targets marked with (*)\n\n"); + int n, keyname_len; + struct dirent **files; + char *keydir_path, *key_path, *key_name; + bool sk_available; + + keydir_path = malloc(strlen(token_path) + 9); + if(keydir_path == NULL) return 1; + + strcpy(keydir_path, token_path); + strcat(keydir_path, "/targets"); + + #ifndef __INTELLISENSE__ // VSCodium doesn't know where alphasort is and highlights an error + n = scandir(keydir_path, &files, NULL, alphasort); + #endif + + if(n == -1){ + fprintf(stderr, "Could not list directory at token path.\n"); + exit(1); + } + + for(int i=0;i<n;i++){ + if(string_endswith(files[i]->d_name, ".pk")){ + keyname_len = strlen(files[i]->d_name); + + key_name = malloc(keyname_len + 1); + if(key_name == NULL) return 1; + + strcpy(key_name, files[i]->d_name); + key_name[keyname_len - 3] = '\0'; + + printf("%s", key_name); + + key_path = malloc(strlen(token_path) + 9 + strlen(files[i]->d_name)); + if(key_path == NULL) return 1; + + strcpy(key_path, token_path); + strcat(key_path, "/targets/"); + strcat(key_path, files[i]->d_name); + + if(verbose) printf(" (PK: %s", key_path); + + key_path[strlen(key_path) - 2] = 's'; + + if(access(key_path, R_OK) == 0){ + sk_available = true; + + if(verbose) printf(", SK: %s", key_path); + }else{ + sk_available = false; + } + + if(verbose) printf(")"); + if(sk_available) printf(" (*)"); + + printf("\n"); + free(key_path); + free(key_name); + } + } + free(keydir_path); + + return 0; +} + +void bendian_from_hex_string(byte* bendian, char* string, int length){ + char byte[2]; + for(int i=0; i<length; i++){ + memcpy(byte, &string[i*2], 2); + bendian[i] = strtol(byte, 0, 16); + } +} + +// mostly boring command line parsing +int main(int argc, char *argv[]){ + token_path = getenv("FEMTOSTAR_TOKEN_PATH"); + + if(!token_path){ + fprintf(stderr, "The environment variable FEMTOSTAR_TOKEN_PATH does not exist! Please set it before using ctm.\n"); + exit(1); + } + + if(argc < 2){ + fprintf(stderr, "Provide at least one argument. Try \"%s help\" for more information.\n", argv[0]); + return 1; + }else if(strcmp(argv[1], "help") == 0){ + print_help(argv[0]); + return 0; + }else if(strcmp(argv[1], "path") == 0){ + print_path(); + return 0; + }else if(strcmp(argv[1], "list") == 0){ + return list_targets(argc > 2 && strcmp(argv[2], "verbose") == 0); // i don't know if this is cursed or genius + }else if(strcmp(argv[1], "keygen") == 0){ + byte ikm[32]; + int idbits, hashbits; + + if(argc > 5){ + printf("Too many arguments. Exiting.\n"); + } + + // Make sure there's a target name + if(argc < 3){ + fprintf(stderr, "A target name must be provided, e.g. %s keygen [targ]\n", argv[0]); + return(1); + } + + // Default behaviour for if only target name is provided: default TFS, random IKM. Otherwise, validate and use provided. 
+ if(argc < 4){ + idbits = IDBITS_DEFAULT; + hashbits = HASHBITS_DEFAULT; + }else{ + sscanf(argv[3], "%i/%i", &idbits, &hashbits); + if(idbits < 1 || idbits > IDBITS_MAX){ + printf("Invalid Token Format Specifier: number of ID bits must be between 1 and 256 inclusive\n"); + return 1; + } + if(hashbits < 1 || hashbits > HASHBITS_MAX){ + printf("Invalid Token Format Specifier: number of hash bits must be between 1 and 256 inclusive\n"); + return 1; + } + } + + // If no IKM is provided, use the system true randomness source + if(argc < 5){ + getrandom(ikm, 32, GRND_RANDOM); + }else{ + if(strlen(argv[4]) != 64){ + fprintf(stderr, "If providing IKM, it must be 32 bytes (64 hexadecimal digits)\n"); + return 1; + } + + bendian_from_hex_string(ikm, argv[4], 64); + } + + return keygen(argv[2], ikm, idbits, hashbits); + }else if(strcmp(argv[1], "keydump") == 0){ + // Make sure there's a target name + if(argc < 3){ + fprintf(stderr, "A target name must be provided, e.g. %s keydump [targ]\n", argv[0]); + return(1); + } + + return keydump(argv[2]); + }else if(strcmp(argv[1], "keyrepair") == 0){ + // Make sure there's a target name + if(argc < 3){ + fprintf(stderr, "A target name must be provided, e.g. %s keyrepair [targ]\n", argv[0]); + return(1); + } + + return keyrepair(argv[2]); + } +} \ No newline at end of file diff --git a/debugprint.c b/debugprint.c new file mode 100644 index 0000000..4a72ca0 --- /dev/null +++ b/debugprint.c @@ -0,0 +1,30 @@ +#include <stdio.h> +#include "debugprint.h" +#include "blst/blst.h" + +void print_bytes(const char* label, byte *toprint, int length){ + printf("%s", label); + for(int i=0;i<length;i++){ + printf("%.2x ", toprint[i]); + } + printf("\n"); +} + +void debug_print_bytes(__attribute__((unused)) const char* label, __attribute__((unused)) byte *toprint, __attribute__((unused)) int length){ + #ifdef INSECURE_CTM_DEBUG_PRINT + print_bytes(label, toprint, length); + #endif +} + +void print_scalar(const char* label, blst_scalar *toprint){ + byte temp_buffer[32]; + blst_bendian_from_scalar(temp_buffer, toprint); + + debug_print_bytes(label, temp_buffer, 32); +} + +void debug_print_scalar(__attribute__((unused)) const char* label, __attribute__((unused)) blst_scalar *toprint){ + #ifdef INSECURE_CTM_DEBUG_PRINT + print_scalar(label, toprint); + #endif +} \ No newline at end of file diff --git a/debugprint.h b/debugprint.h new file mode 100644 index 0000000..fc9a7cc --- /dev/null +++ b/debugprint.h @@ -0,0 +1,13 @@ +#ifndef __DEBUGPRINT_H_ +#define __DEBUGPRINT_H_ +#include "blst/blst.h" + +// Uncomment the line below to enable debug prints (including private keys!) 
- use for development only, this is insecure +#define INSECURE_CTM_DEBUG_PRINT + +void print_bytes(const char* label, byte *toprint, int length); +void debug_print_bytes(__attribute__((unused)) const char* label, __attribute__((unused)) byte *toprint, __attribute__((unused)) int length); +void print_scalar(const char* label, blst_scalar *toprint); +void debug_print_scalar(__attribute__((unused)) const char* label, __attribute__((unused)) blst_scalar *toprint); + +#endif \ No newline at end of file diff --git a/fstoken.c b/fstoken.c new file mode 100644 index 0000000..f9dd63c --- /dev/null +++ b/fstoken.c @@ -0,0 +1,25 @@ +#include "fstoken.h" + +void fstoken_keygen(byte* ikm, byte* sk_byte, byte* pk_byte){ + blst_scalar sk; + blst_p2 pk; + blst_p2_affine pk_affine; + + blst_keygen(&sk, ikm, 32, 0, 0); // generate a secret key from IKM + blst_bendian_from_scalar(sk_byte, &sk); // convert it to 32 big-endian bytes in sk_byte to return + + blst_sk_to_pk_in_g2(&pk, &sk); // get a public key from the secret key + blst_p2_to_affine(&pk_affine, &pk); // convert it to an affine point, which is what most uses of the public key use + blst_p2_affine_compress(pk_byte, &pk_affine); // compress it to 96 bytes in pk_byte to return +} + +void fstoken_get_pk_from_sk(byte* sk_byte, byte* pk_byte){ + blst_p2 pk; + blst_p2_affine pk_affine; + blst_scalar sk; + + blst_scalar_from_bendian(&sk, sk_byte); + blst_sk_to_pk_in_g2(&pk, &sk); + blst_p2_to_affine(&pk_affine, &pk); + blst_p2_affine_compress(pk_byte, &pk_affine); +} \ No newline at end of file diff --git a/fstoken.h b/fstoken.h new file mode 100644 index 0000000..d7ccb08 --- /dev/null +++ b/fstoken.h @@ -0,0 +1,20 @@ +#ifndef __FSTOKEN_H__ +#define __FSTOKEN_H__ +#include "blst/blst.h" + +#define IDBITS_DEFAULT 128 +#define IDBITS_MAX 256 +#define HASHBITS_DEFAULT 256 +#define HASHBITS_MAX 256 + +typedef struct{ + byte sk[32]; + byte pk[96]; + uint8_t idbits; + uint8_t hashbits; +} target_descriptor; + +void fstoken_keygen(byte* ikm, byte* sk_byte, byte* pk_byte); +void fstoken_get_pk_from_sk(byte* sk_byte, byte* pk_byte); + +#endif \ No newline at end of file diff --git a/libblst.a b/libblst.a new file mode 100644 index 0000000..8a80909 Binary files /dev/null and b/libblst.a differ diff --git a/main.c b/main.c new file mode 100644 index 0000000..de2dc15 --- /dev/null +++ b/main.c @@ -0,0 +1,114 @@ +#include <stdio.h> +#include <string.h> +#include <time.h> +#include "blst/blst.h" + +const byte dst[] = "MY-DST"; +double time_taken; +clock_t t; + +void printbytes(byte *toprint, int length){ + for(int i=0;i<length;i++){ + printf("%.2x ", toprint[i]); + } + printf("\n"); +} + +void signer(byte *compressed_signature, byte *compressed_public_key, byte *msg){ + blst_scalar sk; + blst_p2 pk; + blst_p1 hash, signature; + byte debug_print_buf[256]; + byte myikm[32] = {'*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'}; + + // On signer's side: + printf("IKM: "); + printbytes(myikm, 32); + + blst_keygen(&sk, myikm, 32, 0, 0); + + blst_bendian_from_scalar(debug_print_buf, &sk); + printf("Secret Key: "); + printbytes(debug_print_buf, 32); + + blst_sk_to_pk_in_g2(&pk, &sk); + + blst_p2_compress(compressed_public_key, &pk); + printf("Compressed Public Key: "); + printbytes(compressed_public_key, 96); + + t = clock(); + blst_hash_to_g1(&hash, msg, strlen((char *) msg), dst, strlen((char *) dst), 0, 0); + t = clock() - t; + + time_taken = 
((double)t)/CLOCKS_PER_SEC*1000.0; + printf("blst_hash_to_g1 took %f ms\n", time_taken); + + blst_p1_serialize(debug_print_buf, &hash); + printf("Message Hash: "); + printbytes(debug_print_buf, 96); + + t = clock(); + blst_sign_pk_in_g2(&signature, &hash, &sk); + t = clock() - t; + + time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0; + printf("blst_sign_pk_in_g2 took %f ms\n", time_taken); + + blst_p1_serialize(debug_print_buf, &signature); + printf("Signature: "); + printbytes(debug_print_buf, 96); + + blst_p1_compress(compressed_signature, &signature); + printf("Compressed Signature: "); + printbytes(compressed_signature, 48); +} + +void verifier(byte *compressed_signature, byte *compressed_public_key, byte *msg){ + blst_p1_affine sig; + blst_p2_affine pk; + + blst_p1_uncompress(&sig, compressed_signature); + blst_p2_uncompress(&pk, compressed_public_key); + + BLST_ERROR returned; + + // TODO: check if in g2 group + + t = clock(); + returned = blst_core_verify_pk_in_g2(&pk, &sig, 1, msg, strlen((char *) msg), dst, strlen((char *) dst), 0, 0); + t = clock() - t; + + time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0; + printf("blst_core_verify_pk_in_g2 took %f ms\n", time_taken); + + if(returned == BLST_SUCCESS){ + printf("Verified!\n"); + }else{ + printf("Not verified!\n"); + } +} + +int main(){ + byte compressed_signature[48]; + byte compressed_public_key[96]; + byte msg[] = "assertion"; + + t = clock(); + t = clock() - t; + + time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0; + printf("Doing nothing took %f ms\n", time_taken); + + printf("msg is now %s\n", msg); + + // Sign the message and get the results back + signer(compressed_signature, compressed_public_key, msg); + + //msg[8] = 'A'; + + printf("msg is now %s\n", msg); + + // Now on verifier's side (after compressed_signature, serialized_public_key, and msg are passed over the network) + verifier(compressed_signature, compressed_public_key, msg); +} \ No newline at end of file diff --git a/nonblind.c b/nonblind.c new file mode 100644 index 0000000..873364f --- /dev/null +++ b/nonblind.c @@ -0,0 +1,131 @@ +// This is a (very rough) test of BLST blind signatures based on run.me from BLST's Python example code +// Do not trust this to be secure, also this doesn't do a lot of the sanity checking yet + +#include <stdio.h> +#include <string.h> +#include <time.h> +#include "blst/blst.h" + +const byte dst[] = "MY-DST"; +double time_taken; +clock_t t; + +byte signer_private_key[32]; +byte signer_public_key[96]; + +void printbytes(byte *toprint, int length){ + for(int i=0;i<length;i++){ + printf("%.2x ", toprint[i]); + } + printf("\n"); +} + +void signer_key_setup(){ + blst_scalar sk; + blst_p2 pk; + blst_p2_affine pk_affine; + + byte myikm[32] = {'*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'}; + + // On signer's side: + printf("IKM: "); + printbytes(myikm, 32); + + blst_keygen(&sk, myikm, 32, 0, 0); + + blst_bendian_from_scalar(signer_private_key, &sk); + printf("Secret Key: "); + printbytes(signer_private_key, 32); + + blst_sk_to_pk_in_g2(&pk, &sk); + + blst_p2_to_affine(&pk_affine, &pk); + + blst_p2_affine_compress(signer_public_key, &pk_affine); + printf("Compressed Public Key (affine): "); + printbytes(signer_public_key, 96); +} + +void signer(byte *compressed_signature, byte *msg_for_wire){ + blst_scalar sk; + blst_p1 msg, signature; + blst_p1_affine msg_affine; + byte debug_print_buf[256]; + + // get the secret key as a scalar + 
blst_scalar_from_bendian(&sk, signer_private_key); + + // Deserialize the message - it's already a serialized P1 point, we don't need to (literally) rehash it + blst_p1_deserialize(&msg_affine, msg_for_wire); + + // i do not know why deserializing always gives you affine points + blst_p1_from_affine(&msg, &msg_affine); + + // sign with it + blst_sign_pk_in_g2(&signature, &msg, &sk); + + // Serialize and print the signature + blst_p1_serialize(debug_print_buf, &signature); + printf("Signature: "); + printbytes(debug_print_buf, 96); + + // Compress and print the signature + blst_p1_compress(compressed_signature, &signature); + printf("Compressed Signature: "); + printbytes(compressed_signature, 48); +} + +void verifier(byte *compressed_signature, byte *msg){ + blst_p1_affine sig; + blst_p2_affine pk; + + blst_p1_uncompress(&sig, compressed_signature); + blst_p2_uncompress(&pk, signer_public_key); + + BLST_ERROR returned; + + // TODO: check if in g2 group + + returned = blst_core_verify_pk_in_g2(&pk, &sig, 1, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96); + + if(returned == BLST_SUCCESS){ + printf("Verified!\n"); + }else{ + printf("Not verified!\n"); + } +} + +// main is the "user" in this test +int main(){ + byte compressed_signature[48]; + byte msg[] = "assertion"; + blst_p1 hash; + byte msg_for_wire_bytes[96]; + + printf("msg is now %s\n", msg); + + // Set up the signer's keys first so that we can know its public key + signer_key_setup(); + + // Get a hash of the message - we put the signer's public key in aug here, I don't know why + blst_hash_to_g1(&hash, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96); + + // Serialize the blinded message to send it over the wire + blst_p1_serialize(msg_for_wire_bytes, &hash); + + printf("Hashed for wire: "); + printbytes(msg_for_wire_bytes, 96); + + // Send the message off to be signed and get the results back + signer(compressed_signature, msg_for_wire_bytes); + + printf("RETURNED SIGNATURE: "); + printbytes(compressed_signature, 48); + + //msg[8] = 'A'; + + printf("msg is now %s\n", msg); + + // Now on verifier's side (after compressed_signature, serialized_public_key, and msg are passed over the network) + verifier(compressed_signature, msg); +} \ No newline at end of file diff --git a/server.o b/server.o new file mode 100644 index 0000000..6aeaa31 Binary files /dev/null and b/server.o differ diff --git a/set_token_path.sh b/set_token_path.sh new file mode 100755 index 0000000..1738612 --- /dev/null +++ b/set_token_path.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# source this file to set your token path, or add this to your profile +export FEMTOSTAR_TOKEN_PATH=~/fstokens \ No newline at end of file diff --git a/test b/test new file mode 100755 index 0000000..4469211 Binary files /dev/null and b/test differ
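
For reference, a minimal, hypothetical sketch of how the fstoken helpers above could be exercised on their own (fstoken_demo.c is an assumed filename, not a file in this commit). It assumes the file sits in the repository root next to fstoken.h and debugprint.h and is linked against libblst.a the same way ctm is in build.sh; the hard-coded IKM is for illustration only, whereas ctm's keygen draws random IKM from the system randomness source.

/* fstoken_demo.c - hypothetical standalone exercise of the fstoken API */
#include <string.h>
#include "fstoken.h"
#include "debugprint.h"

int main(void){
	byte ikm[32];
	byte sk[32], pk[96], pk_again[96];

	memset(ikm, '*', sizeof(ikm));          // fixed placeholder IKM, for illustration only

	fstoken_keygen(ikm, sk, pk);            // derive the 32-byte secret key and 96-byte compressed public key
	fstoken_get_pk_from_sk(sk, pk_again);   // recover the public key from the secret key alone, as "ctm keyrepair" does

	print_bytes("Secret Key:   ", sk, 32);
	print_bytes("Public Key:   ", pk, 96);
	print_bytes("Recovered PK: ", pk_again, 96);

	return memcmp(pk, pk_again, 96) != 0;   // both derivations should agree
}

Under the same assumptions it would build with something along the lines of ${CC} ${CFLAGS} -o fstoken_demo fstoken_demo.c fstoken.c debugprint.c libblst.a, mirroring the final line of build.sh.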