commit 943c07066e737a935b3b7f345610cdb5077d7633
Author: John Doe <johndoe@example.com>
Date:   Fri Sep 9 02:47:49 2022 -0400

    initial stuff

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..b3d8c2f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,13 @@
+{
+    "files.associations": {
+        "vector": "c",
+        "memory": "c",
+        "optional": "c",
+        "string_view": "c",
+        "string": "c",
+        "system_error": "c",
+        "thread": "c",
+        "typeindex": "c",
+        "variant": "c"
+    }
+}
\ No newline at end of file
diff --git a/assembly.o b/assembly.o
new file mode 100644
index 0000000..5ade239
Binary files /dev/null and b/assembly.o differ
diff --git a/blindsig.c b/blindsig.c
new file mode 100644
index 0000000..219290b
--- /dev/null
+++ b/blindsig.c
@@ -0,0 +1,182 @@
+// This is a (very rough) test of BLST blind signatures based on run.me from BLST's Python example code
+// Do not trust this to be secure; it also doesn't do a lot of the sanity checking yet
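+//
+// Rough sketch of what the code below does (informal summary, not taken from the BLST docs):
+// the user hashes the message to a G1 point H and blinds it as H' = r*H with a random
+// scalar r. The signer returns S' = sk*H'. The user unblinds with S = r^-1 * S' = sk*H,
+// which is an ordinary BLS signature on the message and verifies against the signer's
+// G2 public key.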
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+#include "blst/blst.h"
+
+const byte dst[] = "MY-DST";
+double time_taken;
+clock_t t;
+
+byte signer_private_key[32];
+byte signer_public_key[96];
+
+void printbytes(byte *toprint, int length){
+    for(int i=0;i<length;i++){
+        printf("%.2x ", toprint[i]);
+    }
+    printf("\n");
+}
+
+void signer_key_setup(){
+    blst_scalar sk;
+    blst_p2 pk;
+    blst_p2_affine pk_affine;
+
+    byte myikm[32] = {'*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'};
+
+    // On signer's side:
+    printf("IKM: ");
+    printbytes(myikm, 32);
+
+    blst_keygen(&sk, myikm, 32, 0, 0);
+
+    blst_bendian_from_scalar(signer_private_key, &sk);
+    printf("Secret Key: ");
+    printbytes(signer_private_key, 32);
+
+    blst_sk_to_pk_in_g2(&pk, &sk);
+
+    blst_p2_to_affine(&pk_affine, &pk);
+
+    blst_p2_affine_compress(signer_public_key, &pk_affine);
+    printf("Compressed Public Key (affine): ");
+    printbytes(signer_public_key, 96);
+}
+
+void signer(byte *compressed_signature, byte *msg_for_wire){
+    blst_scalar sk;
+    blst_p1 msg, signature;
+    blst_p1_affine msg_affine;
+    byte debug_print_buf[256];
+
+    // get the secret key as a scalar
+    blst_scalar_from_bendian(&sk, signer_private_key);
+
+    // Uncompress the message - it's already a compressed P1 point, so we don't need to (literally) rehash it here
+    blst_p1_uncompress(&msg_affine, msg_for_wire);
+
+    // Deserializing always gives you affine points (the wire format encodes affine coordinates), so convert to a full P1 for arithmetic
+    blst_p1_from_affine(&msg, &msg_affine);
+
+    // Confirm the message point is in the G1 group
+    assert(blst_p1_in_g1(&msg));
+
+    // sign with it
+    blst_sign_pk_in_g2(&signature, &msg, &sk);
+
+    // Serialize and print the signature
+    blst_p1_serialize(debug_print_buf, &signature);
+    printf("Signature: ");
+    printbytes(debug_print_buf, 96);
+
+    // Compress and print the signature
+    blst_p1_compress(compressed_signature, &signature);
+    printf("Compressed Signature: ");
+    printbytes(compressed_signature, 48);
+}
+
+void verifier(byte *compressed_signature, byte *msg){
+    blst_p1_affine sig;
+    blst_p2_affine pk;
+
+    blst_p1_uncompress(&sig, compressed_signature);
+    blst_p2_uncompress(&pk, signer_public_key);
+
+    BLST_ERROR returned;
+
+    // TODO: check if in g2 group
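+    // (If/when that gets added, blst_p2_affine_in_g2(&pk) and blst_p1_affine_in_g1(&sig)
+    //  look like the right calls for the affine points used here.)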
+    
+    returned = blst_core_verify_pk_in_g2(&pk, &sig, 1, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96);
+
+    if(returned == BLST_SUCCESS){
+        printf("Verified!\n");
+    }else{
+        printf("Not verified!\n");
+    }
+}
+
+// main is the "user" in this test
+int main(){
+    byte debug_print_buf[256];
+    byte compressed_blinded_signature[48];
+    byte compressed_signature[48];
+    byte msg[] = "assertion";
+    byte blinding_r_bytes[32] = {'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R'};
+    blst_scalar blinding_r, inverse_blinding_r;
+    blst_p1 hash, msg_for_wire;
+    byte msg_for_wire_bytes[96];
+    blst_p1_affine returned_signature_affine;
+    blst_p1 returned_signature, unblinded_signature;
+
+    printf("msg is now %s\n", msg);
+
+    // Set up the signer's keys first so that we can know its public key
+    signer_key_setup();
+
+    // Get a hash of the message - the signer's public key goes in aug (message augmentation); the verifier must hash with the same aug
+    blst_hash_to_g1(&hash, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96);
+
+    printf("HASH: ");
+    blst_p1_serialize(debug_print_buf, &hash);
+    printbytes(debug_print_buf, 96);
+
+    // Get a BLST scalar of your "random" (LOL) blinding factor r
+    blst_scalar_from_bendian(&blinding_r, blinding_r_bytes);
+
+    printf("R BYTES: ");
+    printbytes(blinding_r_bytes, 32);
+
+    // Blind the message by signing it with the blinding factor R as if it were a secret key
+    blst_sign_pk_in_g2(&msg_for_wire, &hash, &blinding_r);
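+    // (Under the hood blst_sign_pk_in_g2 is scalar multiplication of the P1 point, so this computes r*H)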
+
+    // Serialize the blinded message to send it over the wire
+    blst_p1_compress(msg_for_wire_bytes, &msg_for_wire);
+
+    printf("Blinded and compressed for wire: ");
+    printbytes(msg_for_wire_bytes, 48);
+
+    // Send the message off to be signed and get the results back
+    signer(compressed_blinded_signature, msg_for_wire_bytes);
+
+    printf("COMPRESSED BLINDED SIG: ");
+    printbytes(compressed_blinded_signature, 48);
+
+    // We now have the signature back. It uncompresses to a blst_p1_affine because this is pk_in_g2 (signatures live in G1).
+    blst_p1_uncompress(&returned_signature_affine, compressed_blinded_signature);
+
+    // Convert the uncompressed returned signature from an affine to a P1
+    blst_p1_from_affine(&returned_signature, &returned_signature_affine);
+
+    // Confirm the signature point is in the G1 group
+    assert(blst_p1_in_g1(&returned_signature));
+
+    printf("RETURNED SIGNATURE: ");
+    blst_p1_serialize(debug_print_buf, &returned_signature);
+    printbytes(debug_print_buf, 96);
+
+    // Get the inverse of R. We'll need this to unblind the signature.
+    blst_sk_inverse(&inverse_blinding_r, &blinding_r);
+
+    // Print the inverse of R
+    printf("INVERSE R: ");
+    blst_bendian_from_scalar(debug_print_buf, &inverse_blinding_r);
+    printbytes(debug_print_buf, 32);
+
+    // Sign the blinded signature we get back from the signer with the inverse of the blinding factor
+    blst_sign_pk_in_g2(&unblinded_signature, &returned_signature, &inverse_blinding_r);
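+    // (This unblinds because r^-1 * (sk * (r*H)) = sk*H, i.e. an ordinary BLS signature on the hashed message)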
+
+    blst_p1_compress(compressed_signature, &unblinded_signature);
+
+    printf("UNBLINDED SIGNATURE: ");
+    printbytes(compressed_signature, 48);
+
+    //msg[8] = 'A';
+
+    printf("msg is now %s\n", msg);
+
+    // Now on verifier's side (after compressed_signature, signer_public_key, and msg are passed over the network)
+    verifier(compressed_signature, msg);
+}
\ No newline at end of file
diff --git a/blst/aggregate.c b/blst/aggregate.c
new file mode 100644
index 0000000..f2c4be7
--- /dev/null
+++ b/blst/aggregate.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Usage pattern on single-processor system is
+ *
+ * blst_pairing_init(ctx, hash_or_encode, DST);
+ * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]);
+ * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]);
+ * ...
+ * blst_pairing_commit(ctx);
+ * blst_pairing_finalverify(ctx, NULL);
+ *
+ ***********************************************************************
+ * Usage pattern on multi-processor system is
+ *
+ *   blst_pairing_init(pk[0], hash_or_encode, DST);
+ *   blst_pairing_init(pk[1], hash_or_encode, DST);
+ *   ...
+ * start threads each processing an N/nthreads slice of PKs and messages:
+ *     blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]);
+ *     blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]);
+ *     ...
+ *     blst_pairing_commit(pkx);
+ *   ...
+ * meanwhile in main thread
+ *   blst_fp12 gtsig;
+ *   blst_aggregated_in_g2(&gtsig, aggregated_signature);
+ * join threads and merge their contexts:
+ *   blst_pairing_merge(pk[0], pk[1]);
+ *   blst_pairing_merge(pk[0], pk[2]);
+ *   ...
+ *   blst_pairing_finalverify(pk[0], gtsig);
+ */
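+
+/*
+ * A minimal single-processor sketch of the above against the public blst.h API
+ * (names are illustrative only; allocation and error handling omitted):
+ *
+ *   blst_pairing *ctx = malloc(blst_pairing_sizeof());
+ *   blst_pairing_init(ctx, 1, DST, DST_len);
+ *   blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature,
+ *                                   msg[0], msg_len[0], NULL, 0);
+ *   blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1], msg_len[1], NULL, 0);
+ *   blst_pairing_commit(ctx);
+ *   int ok = blst_pairing_finalverify(ctx, NULL);
+ */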
+
+#ifndef N_MAX
+# define N_MAX 8
+#endif
+
+typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature;
+typedef struct {
+    unsigned int ctrl;
+    unsigned int nelems;
+    const void *DST;
+    size_t DST_len;
+    vec384fp12 GT;
+    AggregatedSignature AggrSign;
+    POINTonE2_affine Q[N_MAX];
+    POINTonE1_affine P[N_MAX];
+} PAIRING;
+
+enum { AGGR_UNDEFINED      = 0,
+       AGGR_MIN_SIG        = 1,
+       AGGR_MIN_PK         = 2,
+       AGGR_SIGN_SET       = 0x10,
+       AGGR_GT_SET         = 0x20,
+       AGGR_HASH_OR_ENCODE = 0x40 };
+#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK)
+
+static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7;
+
+size_t blst_pairing_sizeof(void)
+{   return sizeof_pairing;   }
+
+void blst_pairing_init(PAIRING *ctx, int hash_or_encode,
+                       const void *DST, size_t DST_len)
+{
+    ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0);
+    ctx->nelems = 0;
+    ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42
+                                                                 : DST;
+    ctx->DST_len = DST_len;
+}
+
+static const void *pairing_get_dst(const PAIRING *ctx)
+{   return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing
+                                        : ctx->DST;
+}
+
+const void *blst_pairing_get_dst(const PAIRING *ctx)
+{   return pairing_get_dst(ctx);   }
+
+#define FROM_AFFINE(out,in) do { \
+    vec_copy((out)->X, in->X, 2*sizeof(in->X)), \
+    vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \
+                         vec_is_zero(in->X, 2*sizeof(in->X))); } while(0)
+
+/*
+ * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated
+ * signature verification as discussed at
+ * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407.
+ * Usage pattern is not finalized yet, because (sig != NULL) is better and
+ * will be handled separately...
+ */
+static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx,
+                                             const POINTonE2_affine *PK,
+                                             size_t pk_groupcheck,
+                                             const POINTonE1_affine *sig,
+                                             size_t sig_groupcheck,
+                                             const byte *scalar, size_t nbits,
+                                             const void *msg, size_t msg_len,
+                                             const void *aug, size_t aug_len)
+{
+    if (ctx->ctrl & AGGR_MIN_PK)
+        return BLST_AGGR_TYPE_MISMATCH;
+
+    ctx->ctrl |= AGGR_MIN_SIG;
+
+    /*
+     * Since we don't know if the signature is individual or aggregated,
+     * the only sensible thing to do is to skip over infinite one and
+     * count on the corresponding infinite public key to be rejected,
+     * in case the signature is non-aggregated that is.
+     */
+    if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) {
+        POINTonE1 *S = &ctx->AggrSign.e1;
+        POINTonE1 P[1];
+
+        FROM_AFFINE(P, sig);
+
+        if (sig_groupcheck && !POINTonE1_in_G1(P))
+            return BLST_POINT_NOT_IN_GROUP;
+
+        if (ctx->ctrl & AGGR_SIGN_SET) {
+            if (nbits != 0 && scalar != NULL) {
+                POINTonE1_mult_w5(P, P, scalar, nbits);
+                POINTonE1_dadd(S, S, P, NULL);
+            } else {
+                POINTonE1_dadd_affine(S, S, sig);
+            }
+        } else {
+            ctx->ctrl |= AGGR_SIGN_SET;
+            if (nbits != 0 && scalar != NULL)
+                POINTonE1_mult_w5(S, P, scalar, nbits);
+            else
+                vec_copy(S, P, sizeof(P));
+        }
+    }
+
+    if (PK != NULL) {
+        unsigned int n;
+        POINTonE1 H[1];
+        const void *DST = pairing_get_dst(ctx);
+
+        /*
+         * Reject infinite public keys.
+         */
+        if (vec_is_zero(PK, sizeof(*PK)))
+            return BLST_PK_IS_INFINITY;
+
+        if (pk_groupcheck) {
+            POINTonE2 P[1];
+
+            FROM_AFFINE(P, PK);
+            if (!POINTonE2_in_G2(P))
+                return BLST_POINT_NOT_IN_GROUP;
+        }
+
+        if (ctx->ctrl & AGGR_HASH_OR_ENCODE)
+            Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len);
+        else
+            Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len);
+
+        if (nbits != 0 && scalar != NULL)
+            POINTonE1_mult_w5(H, H, scalar, nbits);
+
+        POINTonE1_from_Jacobian(H, H);
+
+        n = ctx->nelems;
+        vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine));
+        vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine));
+        if (++n == N_MAX) {
+            if (ctx->ctrl & AGGR_GT_SET) {
+                vec384fp12 GT;
+                miller_loop_n(GT, ctx->Q, ctx->P, n);
+                mul_fp12(ctx->GT, ctx->GT, GT);
+            } else {
+                miller_loop_n(ctx->GT, ctx->Q, ctx->P, n);
+                ctx->ctrl |= AGGR_GT_SET;
+            }
+            n = 0;
+        }
+        ctx->nelems = n;
+    }
+
+    return BLST_SUCCESS;
+}
+
+BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx,
+                                           const POINTonE2_affine *PK,
+                                           const POINTonE1_affine *signature,
+                                           const void *msg, size_t msg_len,
+                                           const void *aug, size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0,
+                                      msg, msg_len, aug, aug_len);
+}
+
+BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx,
+                                                 const POINTonE2_affine *PK,
+                                                 const POINTonE1_affine *sig,
+                                                 const byte *scalar,
+                                                 size_t nbits,
+                                                 const void *msg,
+                                                 size_t msg_len,
+                                                 const void *aug,
+                                                 size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits,
+                                      msg, msg_len, aug, aug_len);
+}
+
+BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx,
+                                            const POINTonE2_affine *PK,
+                                            size_t pk_grpchk,
+                                            const POINTonE1_affine *signature,
+                                            size_t sig_grpchk,
+                                            const void *msg, size_t msg_len,
+                                            const void *aug, size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk,
+                                      NULL, 0, msg, msg_len, aug, aug_len);
+}
+
+BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx,
+                                                  const POINTonE2_affine *PK,
+                                                  size_t pk_grpchk,
+                                                  const POINTonE1_affine *sig,
+                                                  size_t sig_grpchk,
+                                                  const byte *scalar,
+                                                  size_t nbits,
+                                                  const void *msg,
+                                                  size_t msg_len,
+                                                  const void *aug,
+                                                  size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk,
+                                      scalar, nbits,
+                                      msg, msg_len, aug, aug_len);
+}
+
+static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx,
+                                             const POINTonE1_affine *PK,
+                                             size_t pk_groupcheck,
+                                             const POINTonE2_affine *sig,
+                                             size_t sig_groupcheck,
+                                             const byte *scalar, size_t nbits,
+                                             const void *msg, size_t msg_len,
+                                             const void *aug, size_t aug_len)
+{
+    if (ctx->ctrl & AGGR_MIN_SIG)
+        return BLST_AGGR_TYPE_MISMATCH;
+
+    ctx->ctrl |= AGGR_MIN_PK;
+
+    /*
+     * Since we don't know if the signature is individual or aggregated,
+     * the only sensible thing to do is to skip over infinite one and
+     * count on the corresponding infinite public key to be rejected,
+     * in case the signature is non-aggregated that is.
+     */
+    if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) {
+        POINTonE2 *S = &ctx->AggrSign.e2;
+        POINTonE2 P[1];
+
+        FROM_AFFINE(P, sig);
+
+        if (sig_groupcheck && !POINTonE2_in_G2(P))
+            return BLST_POINT_NOT_IN_GROUP;
+
+        if (ctx->ctrl & AGGR_SIGN_SET) {
+            if (nbits != 0 && scalar != NULL) {
+
+                POINTonE2_mult_w5(P, P, scalar, nbits);
+                POINTonE2_dadd(S, S, P, NULL);
+            } else {
+                POINTonE2_dadd_affine(S, S, sig);
+            }
+        } else {
+            ctx->ctrl |= AGGR_SIGN_SET;
+            if (nbits != 0 && scalar != NULL)
+                POINTonE2_mult_w5(S, P, scalar, nbits);
+            else
+                vec_copy(S, P, sizeof(P));
+        }
+    }
+
+    if (PK != NULL) {
+        unsigned int n;
+        POINTonE2 H[1];
+        const void *DST = pairing_get_dst(ctx);
+
+        /*
+         * Reject infinite public keys.
+         */
+        if (vec_is_zero(PK, sizeof(*PK)))
+            return BLST_PK_IS_INFINITY;
+
+        if (pk_groupcheck) {
+            POINTonE1 P[1];
+
+            FROM_AFFINE(P, PK);
+            if (!POINTonE1_in_G1(P))
+                return BLST_POINT_NOT_IN_GROUP;
+        }
+
+        if (ctx->ctrl & AGGR_HASH_OR_ENCODE)
+            Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len);
+        else
+            Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len);
+
+        POINTonE2_from_Jacobian(H, H);
+
+        if (nbits != 0 && scalar != NULL) {
+            POINTonE1 pk[1];
+
+            FROM_AFFINE(pk, PK);
+            POINTonE1_mult_w5(pk, pk, scalar, nbits);
+            POINTonE1_from_Jacobian(pk, pk);
+            PK = (const POINTonE1_affine *)pk;
+        }
+
+        n = ctx->nelems;
+        vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine));
+        vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine));
+        if (++n == N_MAX) {
+            if (ctx->ctrl & AGGR_GT_SET) {
+                vec384fp12 GT;
+                miller_loop_n(GT, ctx->Q, ctx->P, n);
+                mul_fp12(ctx->GT, ctx->GT, GT);
+            } else {
+                miller_loop_n(ctx->GT, ctx->Q, ctx->P, n);
+                ctx->ctrl |= AGGR_GT_SET;
+            }
+            n = 0;
+        }
+        ctx->nelems = n;
+    }
+
+    return BLST_SUCCESS;
+}
+
+BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx,
+                                           const POINTonE1_affine *PK,
+                                           const POINTonE2_affine *signature,
+                                           const void *msg, size_t msg_len,
+                                           const void *aug, size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0,
+                                      msg, msg_len, aug, aug_len);
+}
+
+BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx,
+                                                 const POINTonE1_affine *PK,
+                                                 const POINTonE2_affine *sig,
+                                                 const byte *scalar,
+                                                 size_t nbits,
+                                                 const void *msg,
+                                                 size_t msg_len,
+                                                 const void *aug,
+                                                 size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits,
+                                      msg, msg_len, aug, aug_len);
+}
+
+BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx,
+                                            const POINTonE1_affine *PK,
+                                            size_t pk_grpchk,
+                                            const POINTonE2_affine *signature,
+                                            size_t sig_grpchk,
+                                            const void *msg, size_t msg_len,
+                                            const void *aug, size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk,
+                                      NULL, 0, msg, msg_len, aug, aug_len);
+}
+
+BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx,
+                                                  const POINTonE1_affine *PK,
+                                                  size_t pk_grpchk,
+                                                  const POINTonE2_affine *sig,
+                                                  size_t sig_grpchk,
+                                                  const byte *scalar,
+                                                  size_t nbits,
+                                                  const void *msg,
+                                                  size_t msg_len,
+                                                  const void *aug,
+                                                  size_t aug_len)
+{   return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk,
+                                      scalar, nbits,
+                                      msg, msg_len, aug, aug_len);
+}
+
+static void PAIRING_Commit(PAIRING *ctx)
+{
+    unsigned int n;
+
+    if ((n = ctx->nelems) != 0) {
+        if (ctx->ctrl & AGGR_GT_SET) {
+            vec384fp12 GT;
+            miller_loop_n(GT, ctx->Q, ctx->P, n);
+            mul_fp12(ctx->GT, ctx->GT, GT);
+        } else {
+            miller_loop_n(ctx->GT, ctx->Q, ctx->P, n);
+            ctx->ctrl |= AGGR_GT_SET;
+        }
+        ctx->nelems = 0;
+    }
+}
+
+void blst_pairing_commit(PAIRING *ctx)
+{   PAIRING_Commit(ctx);   }
+
+BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1)
+{
+    if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED
+        && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED
+        && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0)
+        return BLST_AGGR_TYPE_MISMATCH;
+
+    /* context producers are expected to have called blst_pairing_commit */
+    if (ctx->nelems || ctx1->nelems)
+        return BLST_AGGR_TYPE_MISMATCH;
+
+    ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK;
+
+    switch (ctx->ctrl & MIN_SIG_OR_PK) {
+        case AGGR_MIN_SIG:
+            if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) {
+                POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1,
+                                                  &ctx1->AggrSign.e1, NULL);
+            } else if (ctx1->ctrl & AGGR_SIGN_SET) {
+                ctx->ctrl |= AGGR_SIGN_SET;
+                vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1,
+                         sizeof(ctx->AggrSign.e1));
+            }
+            break;
+        case AGGR_MIN_PK:
+            if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) {
+                POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2,
+                                                  &ctx1->AggrSign.e2, NULL);
+            } else if (ctx1->ctrl & AGGR_SIGN_SET) {
+                ctx->ctrl |= AGGR_SIGN_SET;
+                vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2,
+                         sizeof(ctx->AggrSign.e2));
+            }
+            break;
+        case AGGR_UNDEFINED:
+            break;
+        default:
+            return BLST_AGGR_TYPE_MISMATCH;
+    }
+
+    if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) {
+        mul_fp12(ctx->GT, ctx->GT, ctx1->GT);
+    } else if (ctx1->ctrl & AGGR_GT_SET) {
+        ctx->ctrl |= AGGR_GT_SET;
+        vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT));
+    }
+
+    return BLST_SUCCESS;
+}
+
+static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig)
+{
+    vec384fp12 GT;
+
+    if (!(ctx->ctrl & AGGR_GT_SET))
+        return 0;
+
+    if (GTsig != NULL) {
+        vec_copy(GT, GTsig, sizeof(GT));
+    } else if (ctx->ctrl & AGGR_SIGN_SET) {
+        AggregatedSignature AggrSign;
+
+        switch (ctx->ctrl & MIN_SIG_OR_PK) {
+            case AGGR_MIN_SIG:
+                POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1);
+                miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2,
+                                  (const POINTonE1_affine *)&AggrSign.e1, 1);
+                break;
+            case AGGR_MIN_PK:
+                POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2);
+                miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2,
+                                  (const POINTonE1_affine *)&BLS12_381_G1, 1);
+                break;
+            default:
+                return 0;
+        }
+    } else {
+        /*
+         * The aggregated signature was infinite, relation between the
+         * hashes and the public keys has to be VERY special...
+         */
+        vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT));
+    }
+
+    conjugate_fp12(GT);
+    mul_fp12(GT, GT, ctx->GT);
+    final_exp(GT, GT);
+
+    /* return GT==1 */
+    return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) &
+           vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]));
+}
+
+int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig)
+{   return (int)PAIRING_FinalVerify(ctx, GTsig);   }
+
+int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2)
+{
+    vec384fp12 GT;
+
+    vec_copy(GT, GT1, sizeof(GT));
+    conjugate_fp12(GT);
+    mul_fp12(GT, GT, GT2);
+    final_exp(GT, GT);
+
+    /* return GT==1 */
+    return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) &
+                 vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])));
+}
+
+void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q,
+                                              const POINTonE1_affine *p)
+{
+    unsigned int n;
+
+    if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p)))
+        return;
+
+    n = ctx->nelems;
+    vec_copy(ctx->Q + n, q, sizeof(*q));
+    vec_copy(ctx->P + n, p, sizeof(*p));
+    if (++n == N_MAX) {
+        if (ctx->ctrl & AGGR_GT_SET) {
+            vec384fp12 GT;
+            miller_loop_n(GT, ctx->Q, ctx->P, n);
+            mul_fp12(ctx->GT, ctx->GT, GT);
+        } else {
+            miller_loop_n(ctx->GT, ctx->Q, ctx->P, n);
+            ctx->ctrl |= AGGR_GT_SET;
+        }
+        n = 0;
+    }
+    ctx->nelems = n;
+}
+
+vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx)
+{
+    PAIRING_Commit(ctx);
+    return (vec384fp12 *)ctx->GT;
+}
+
+/*
+ * PAIRING context-free entry points.
+ *
+ * To perform FastAggregateVerify, aggregate all public keys and
+ * signatures with corresponding blst_aggregate_in_g{12}, convert
+ * result to affine and call suitable blst_core_verify_pk_in_g{12}
+ * or blst_aggregated_in_g{12}...
+ */
+BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in,
+                                                const unsigned char *zwire)
+{
+    POINTonE1 P[1];
+    BLST_ERROR ret;
+
+    ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire);
+
+    if (ret != BLST_SUCCESS)
+        return ret;
+
+    if (vec_is_zero(P, sizeof(POINTonE1_affine))) {
+        if (in == NULL)
+            vec_zero(out, sizeof(*out));
+        return BLST_SUCCESS;
+    }
+
+    vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z));
+
+    if (!POINTonE1_in_G1(P))
+        return BLST_POINT_NOT_IN_GROUP;
+
+    if (in == NULL)
+        vec_copy(out, P, sizeof(P));
+    else
+        POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P);
+
+    return BLST_SUCCESS;
+}
+
+BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in,
+                                                const unsigned char *zwire)
+{
+    POINTonE2 P[1];
+    BLST_ERROR ret;
+
+    ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire);
+
+    if (ret != BLST_SUCCESS)
+        return ret;
+
+    if (vec_is_zero(P, sizeof(POINTonE2_affine))) {
+        if (in == NULL)
+            vec_zero(out, sizeof(*out));
+        return BLST_SUCCESS;
+    }
+
+    vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z));
+
+    if (!POINTonE2_in_G2(P))
+        return BLST_POINT_NOT_IN_GROUP;
+
+    if (in == NULL) {
+        vec_copy(out, P, sizeof(P));
+    } else {
+        POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P);
+    }
+    return BLST_SUCCESS;
+}
+
+void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig)
+{   miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1);   }
+
+void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig)
+{   miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1);   }
+
+BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk,
+                                     const POINTonE2_affine *signature,
+                                     int hash_or_encode,
+                                     const void *msg, size_t msg_len,
+                                     const void *DST, size_t DST_len,
+                                     const void *aug, size_t aug_len)
+{
+    PAIRING ctx;
+    BLST_ERROR ret;
+
+    ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0);
+    ctx.nelems = 0;
+    ctx.DST = DST;
+    ctx.DST_len = DST_len;
+
+    ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0,
+                                     msg, msg_len, aug, aug_len);
+    if (ret != BLST_SUCCESS)
+        return ret;
+
+    PAIRING_Commit(&ctx);
+
+    return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL;
+}
+
+BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk,
+                                     const POINTonE1_affine *signature,
+                                     int hash_or_encode,
+                                     const void *msg, size_t msg_len,
+                                     const void *DST, size_t DST_len,
+                                     const void *aug, size_t aug_len)
+{
+    PAIRING ctx;
+    BLST_ERROR ret;
+
+    ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0);
+    ctx.nelems = 0;
+    ctx.DST = DST;
+    ctx.DST_len = DST_len;
+
+    ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0,
+                                     msg, msg_len, aug, aug_len);
+    if (ret != BLST_SUCCESS)
+        return ret;
+
+    PAIRING_Commit(&ctx);
+
+    return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL;
+}
diff --git a/blst/asm/add_mod_256-armv8.pl b/blst/asm/add_mod_256-armv8.pl
new file mode 100755
index 0000000..34d9145
--- /dev/null
+++ b/blst/asm/add_mod_256-armv8.pl
@@ -0,0 +1,412 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);
+
+@mod=map("x$_",(4..7));
+@a=map("x$_",(8..11));
+@b=map("x$_",(12..15));
+@t=map("x$_",(16,17,1..3));
+
+$code.=<<___;
+.text
+
+.globl	add_mod_256
+.hidden	add_mod_256
+.type	add_mod_256,%function
+.align	5
+add_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+
+	 ldp	@a[2],@a[3],[$a_ptr,#16]
+	adds	@a[0],@a[0],@b[0]
+	 ldp	@b[2],@b[3],[$b_ptr,#16]
+	adcs	@a[1],@a[1],@b[1]
+	 ldp	@mod[0],@mod[1],[$n_ptr]
+	adcs	@a[2],@a[2],@b[2]
+	 ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	adcs	@a[3],@a[3],@b[3]
+	adc	@t[4],xzr,xzr
+
+	subs	@t[0],@a[0],@mod[0]
+	sbcs	@t[1],@a[1],@mod[1]
+	sbcs	@t[2],@a[2],@mod[2]
+	sbcs	@t[3],@a[3],@mod[3]
+	sbcs	xzr,@t[4],xzr
+
+	csel	@a[0],@a[0],@t[0],lo
+	csel	@a[1],@a[1],@t[1],lo
+	csel	@a[2],@a[2],@t[2],lo
+	stp	@a[0],@a[1],[$r_ptr]
+	csel	@a[3],@a[3],@t[3],lo
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ret
+.size	add_mod_256,.-add_mod_256
+
+.globl	mul_by_3_mod_256
+.hidden	mul_by_3_mod_256
+.type	mul_by_3_mod_256,%function
+.align	5
+mul_by_3_mod_256:
+	ldp	@b[0],@b[1],[$a_ptr]
+	ldp	@b[2],@b[3],[$a_ptr,#16]
+
+	adds	@a[0],@b[0],@b[0]
+	 ldp	@mod[0],@mod[1],[$b_ptr]
+	adcs	@a[1],@b[1],@b[1]
+	 ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	adcs	@a[2],@b[2],@b[2]
+	adcs	@a[3],@b[3],@b[3]
+	adc	@t[4],xzr,xzr
+
+	subs	@t[0],@a[0],@mod[0]
+	sbcs	@t[1],@a[1],@mod[1]
+	sbcs	@t[2],@a[2],@mod[2]
+	sbcs	@t[3],@a[3],@mod[3]
+	sbcs	xzr,@t[4],xzr
+
+	csel	@a[0],@a[0],@t[0],lo
+	csel	@a[1],@a[1],@t[1],lo
+	csel	@a[2],@a[2],@t[2],lo
+	csel	@a[3],@a[3],@t[3],lo
+
+	adds	@a[0],@a[0],@b[0]
+	adcs	@a[1],@a[1],@b[1]
+	adcs	@a[2],@a[2],@b[2]
+	adcs	@a[3],@a[3],@b[3]
+	adc	@t[4],xzr,xzr
+
+	subs	@t[0],@a[0],@mod[0]
+	sbcs	@t[1],@a[1],@mod[1]
+	sbcs	@t[2],@a[2],@mod[2]
+	sbcs	@t[3],@a[3],@mod[3]
+	sbcs	xzr,@t[4],xzr
+
+	csel	@a[0],@a[0],@t[0],lo
+	csel	@a[1],@a[1],@t[1],lo
+	csel	@a[2],@a[2],@t[2],lo
+	stp	@a[0],@a[1],[$r_ptr]
+	csel	@a[3],@a[3],@t[3],lo
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ret
+.size	mul_by_3_mod_256,.-mul_by_3_mod_256
+
+.globl	lshift_mod_256
+.hidden	lshift_mod_256
+.type	lshift_mod_256,%function
+.align	5
+lshift_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+
+.Loop_lshift_mod_256:
+	adds	@a[0],@a[0],@a[0]
+	sub	$b_ptr,$b_ptr,#1
+	adcs	@a[1],@a[1],@a[1]
+	adcs	@a[2],@a[2],@a[2]
+	adcs	@a[3],@a[3],@a[3]
+	adc	@t[4],xzr,xzr
+
+	subs	@b[0],@a[0],@mod[0]
+	sbcs	@b[1],@a[1],@mod[1]
+	sbcs	@b[2],@a[2],@mod[2]
+	sbcs	@b[3],@a[3],@mod[3]
+	sbcs	xzr,@t[4],xzr
+
+	csel	@a[0],@a[0],@b[0],lo
+	csel	@a[1],@a[1],@b[1],lo
+	csel	@a[2],@a[2],@b[2],lo
+	csel	@a[3],@a[3],@b[3],lo
+
+	cbnz	$b_ptr,.Loop_lshift_mod_256
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ret
+.size	lshift_mod_256,.-lshift_mod_256
+
+.globl	rshift_mod_256
+.hidden	rshift_mod_256
+.type	rshift_mod_256,%function
+.align	5
+rshift_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+
+.Loop_rshift:
+	adds	@b[0],@a[0],@mod[0]
+	sub	$b_ptr,$b_ptr,#1
+	adcs	@b[1],@a[1],@mod[1]
+	adcs	@b[2],@a[2],@mod[2]
+	adcs	@b[3],@a[3],@mod[3]
+	adc	@t[4],xzr,xzr
+	tst	@a[0],#1
+
+	csel	@b[0],@b[0],@a[0],ne
+	csel	@b[1],@b[1],@a[1],ne
+	csel	@b[2],@b[2],@a[2],ne
+	csel	@b[3],@b[3],@a[3],ne
+	csel	@t[4],@t[4],xzr,ne
+
+	extr	@a[0],@b[1],@b[0],#1
+	extr	@a[1],@b[2],@b[1],#1
+	extr	@a[2],@b[3],@b[2],#1
+	extr	@a[3],@t[4],@b[3],#1
+
+	cbnz	$b_ptr,.Loop_rshift
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ret
+.size	rshift_mod_256,.-rshift_mod_256
+
+.globl	cneg_mod_256
+.hidden	cneg_mod_256
+.type	cneg_mod_256,%function
+.align	5
+cneg_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@mod[0],@mod[1],[$n_ptr]
+
+	 ldp	@a[2],@a[3],[$a_ptr,#16]
+	subs	@b[0],@mod[0],@a[0]
+	 ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	 orr	@mod[0],@a[0],@a[1]
+	sbcs	@b[1],@mod[1],@a[1]
+	 orr	@mod[1],@a[2],@a[3]
+	sbcs	@b[2],@mod[2],@a[2]
+	 orr	@t[4],@mod[0],@mod[1]
+	sbc	@b[3],@mod[3],@a[3]
+
+	cmp	@t[4],#0
+	csetm	@t[4],ne
+	ands	$b_ptr,$b_ptr,@t[4]
+
+	csel	@a[0],@a[0],@b[0],eq
+	csel	@a[1],@a[1],@b[1],eq
+	csel	@a[2],@a[2],@b[2],eq
+	stp	@a[0],@a[1],[$r_ptr]
+	csel	@a[3],@a[3],@b[3],eq
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ret
+.size	cneg_mod_256,.-cneg_mod_256
+
+.globl	sub_mod_256
+.hidden	sub_mod_256
+.type	sub_mod_256,%function
+.align	5
+sub_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+
+	 ldp	@a[2],@a[3],[$a_ptr,#16]
+	subs	@a[0],@a[0],@b[0]
+	 ldp	@b[2],@b[3],[$b_ptr,#16]
+	sbcs	@a[1],@a[1],@b[1]
+	 ldp	@mod[0],@mod[1],[$n_ptr]
+	sbcs	@a[2],@a[2],@b[2]
+	 ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	sbcs	@a[3],@a[3],@b[3]
+	sbc	@t[4],xzr,xzr
+
+	 and	@mod[0],@mod[0],@t[4]
+	 and	@mod[1],@mod[1],@t[4]
+	adds	@a[0],@a[0],@mod[0]
+	 and	@mod[2],@mod[2],@t[4]
+	adcs	@a[1],@a[1],@mod[1]
+	 and	@mod[3],@mod[3],@t[4]
+	adcs	@a[2],@a[2],@mod[2]
+	stp	@a[0],@a[1],[$r_ptr]
+	adc	@a[3],@a[3],@mod[3]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ret
+.size	sub_mod_256,.-sub_mod_256
+
+.globl	check_mod_256
+.hidden	check_mod_256
+.type	check_mod_256,%function
+.align	5
+check_mod_256:
+	ldp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[2],@a[3],[$r_ptr,#16]
+	ldp	@mod[0],@mod[1],[$a_ptr]
+	ldp	@mod[2],@mod[3],[$a_ptr,#16]
+
+#ifdef	__AARCH64EB__
+	rev	@a[0],@a[0]
+	rev	@a[1],@a[1]
+	rev	@a[2],@a[2]
+	rev	@a[3],@a[3]
+#endif
+
+	subs	xzr,@a[0],@mod[0]
+	sbcs	xzr,@a[1],@mod[1]
+	orr	@a[0],@a[0],@a[1]
+	sbcs	xzr,@a[2],@mod[2]
+	orr	@a[0],@a[0],@a[2]
+	sbcs	xzr,@a[3],@mod[3]
+	orr	@a[0],@a[0],@a[3]
+	sbc	$a_ptr,xzr,xzr
+
+	cmp	@a[0],#0
+	mov	x0,#1
+	csel	x0,x0,xzr,ne
+	and	x0,x0,$a_ptr
+
+	ret
+.size	check_mod_256,.-check_mod_256
+
+.globl	add_n_check_mod_256
+.hidden	add_n_check_mod_256
+.type	add_n_check_mod_256,%function
+.align	5
+add_n_check_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@b[2],@b[3],[$b_ptr,#16]
+
+#ifdef	__AARCH64EB__
+	rev	@a[0],@a[0]
+	rev	@b[0],@b[0]
+	rev	@a[1],@a[1]
+	rev	@b[1],@b[1]
+	rev	@a[2],@a[2]
+	rev	@b[2],@b[2]
+	rev	@a[3],@a[3]
+	rev	@b[3],@b[3]
+#endif
+
+	adds	@a[0],@a[0],@b[0]
+	 ldp	@mod[0],@mod[1],[$n_ptr]
+	adcs	@a[1],@a[1],@b[1]
+	 ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	adcs	@a[2],@a[2],@b[2]
+	adcs	@a[3],@a[3],@b[3]
+	adc	@t[4],xzr,xzr
+
+	subs	@t[0],@a[0],@mod[0]
+	sbcs	@t[1],@a[1],@mod[1]
+	sbcs	@t[2],@a[2],@mod[2]
+	sbcs	@t[3],@a[3],@mod[3]
+	sbcs	xzr,@t[4],xzr
+
+	csel	@a[0],@a[0],@t[0],lo
+	csel	@a[1],@a[1],@t[1],lo
+	csel	@a[2],@a[2],@t[2],lo
+	csel	@a[3],@a[3],@t[3],lo
+
+	orr	@t[0], @a[0], @a[1]
+	orr	@t[1], @a[2], @a[3]
+	orr	@t[0], @t[0], @t[1]
+
+#ifdef	__AARCH64EB__
+	rev	@a[0],@a[0]
+	rev	@a[1],@a[1]
+	rev	@a[2],@a[2]
+	rev	@a[3],@a[3]
+#endif
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	mov	@t[1], #1
+	cmp	@t[0], #0
+	csel	x0, @t[1], xzr, ne
+
+	ret
+.size	add_n_check_mod_256,.-add_n_check_mod_256
+
+.globl	sub_n_check_mod_256
+.hidden	sub_n_check_mod_256
+.type	sub_n_check_mod_256,%function
+.align	5
+sub_n_check_mod_256:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@b[2],@b[3],[$b_ptr,#16]
+
+#ifdef	__AARCH64EB__
+	rev	@a[0],@a[0]
+	rev	@b[0],@b[0]
+	rev	@a[1],@a[1]
+	rev	@b[1],@b[1]
+	rev	@a[2],@a[2]
+	rev	@b[2],@b[2]
+	rev	@a[3],@a[3]
+	rev	@b[3],@b[3]
+#endif
+
+	subs	@a[0],@a[0],@b[0]
+	sbcs	@a[1],@a[1],@b[1]
+	 ldp	@mod[0],@mod[1],[$n_ptr]
+	sbcs	@a[2],@a[2],@b[2]
+	 ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	sbcs	@a[3],@a[3],@b[3]
+	sbc	@t[4],xzr,xzr
+
+	 and	@mod[0],@mod[0],@t[4]
+	 and	@mod[1],@mod[1],@t[4]
+	adds	@a[0],@a[0],@mod[0]
+	 and	@mod[2],@mod[2],@t[4]
+	adcs	@a[1],@a[1],@mod[1]
+	 and	@mod[3],@mod[3],@t[4]
+	adcs	@a[2],@a[2],@mod[2]
+	adc	@a[3],@a[3],@mod[3]
+
+	orr	@t[0], @a[0], @a[1]
+	orr	@t[1], @a[2], @a[3]
+	orr	@t[0], @t[0], @t[1]
+
+#ifdef	__AARCH64EB__
+	rev	@a[0],@a[0]
+	rev	@a[1],@a[1]
+	rev	@a[2],@a[2]
+	rev	@a[3],@a[3]
+#endif
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	mov	@t[1], #1
+	cmp	@t[0], #0
+	csel	x0, @t[1], xzr, ne
+
+	ret
+.size	sub_n_check_mod_256,.-sub_n_check_mod_256
+___
+
+print $code;
+
+close STDOUT;
diff --git a/blst/asm/add_mod_256-x86_64.pl b/blst/asm/add_mod_256-x86_64.pl
new file mode 100755
index 0000000..1d656fb
--- /dev/null
+++ b/blst/asm/add_mod_256-x86_64.pl
@@ -0,0 +1,547 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx");
+$b_ptr = "%rbx";
+
+{ ############################################################## 256 bits add
+my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12));
+
+$code.=<<___;
+.text
+
+.globl	add_mod_256
+.hidden	add_mod_256
+.type	add_mod_256,\@function,4,"unwind"
+.align	32
+add_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+.Loaded_a_add_mod_256:
+	add	8*0($b_org), @acc[0]
+	adc	8*1($b_org), @acc[1]
+	 mov	@acc[0], @acc[4]
+	adc	8*2($b_org), @acc[2]
+	 mov	@acc[1], @acc[5]
+	adc	8*3($b_org), @acc[3]
+	sbb	$b_org, $b_org
+
+	 mov	@acc[2], @acc[6]
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	 mov	@acc[3], @acc[7]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[4], @acc[0]
+	cmovc	@acc[5], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	@acc[6], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[7], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	add_mod_256,.-add_mod_256
+
+########################################################################
+.globl	mul_by_3_mod_256
+.hidden	mul_by_3_mod_256
+.type	mul_by_3_mod_256,\@function,3,"unwind"
+.align	32
+mul_by_3_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+.cfi_end_prologue
+
+	mov	$b_org,$n_ptr
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	$a_ptr,$b_org
+	mov	8*3($a_ptr), @acc[3]
+
+	call	__lshift_mod_256
+	mov	0(%rsp),%r12
+.cfi_restore	%r12
+	jmp	.Loaded_a_add_mod_256
+
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_by_3_mod_256,.-mul_by_3_mod_256
+
+.type	__lshift_mod_256,\@abi-omnipotent
+.align	32
+__lshift_mod_256:
+	add	@acc[0], @acc[0]
+	adc	@acc[1], @acc[1]
+	 mov	@acc[0], @acc[4]
+	adc	@acc[2], @acc[2]
+	 mov	@acc[1], @acc[5]
+	adc	@acc[3], @acc[3]
+	sbb	@acc[8], @acc[8]
+
+	 mov	@acc[2], @acc[6]
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	 mov	@acc[3], @acc[7]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	\$0, @acc[8]
+
+	cmovc	@acc[4], @acc[0]
+	cmovc	@acc[5], @acc[1]
+	cmovc	@acc[6], @acc[2]
+	cmovc	@acc[7], @acc[3]
+
+	ret
+.size	__lshift_mod_256,.-__lshift_mod_256
+
+########################################################################
+.globl	lshift_mod_256
+.hidden	lshift_mod_256
+.type	lshift_mod_256,\@function,4,"unwind"
+.align	32
+lshift_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+.Loop_lshift_mod_256:
+	call	__lshift_mod_256
+	dec	%edx
+	jnz	.Loop_lshift_mod_256
+
+	mov	@acc[0], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	0(%rsp),%r12
+.cfi_restore	%r12
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	lshift_mod_256,.-lshift_mod_256
+
+########################################################################
+.globl	rshift_mod_256
+.hidden	rshift_mod_256
+.type	rshift_mod_256,\@function,4,"unwind"
+.align	32
+rshift_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[7]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+.Loop_rshift_mod_256:
+	mov	@acc[7], @acc[0]
+	and	\$1, @acc[7]
+	mov	8*0($n_ptr), @acc[4]
+	neg	@acc[7]
+	mov	8*1($n_ptr), @acc[5]
+	mov	8*2($n_ptr), @acc[6]
+
+	and	@acc[7], @acc[4]
+	and	@acc[7], @acc[5]
+	and	@acc[7], @acc[6]
+	and	8*3($n_ptr), @acc[7]
+
+	add	@acc[4], @acc[0]
+	adc	@acc[5], @acc[1]
+	adc	@acc[6], @acc[2]
+	adc	@acc[7], @acc[3]
+	sbb	@acc[4], @acc[4]
+
+	shr	\$1, @acc[0]
+	mov	@acc[1], @acc[7]
+	shr	\$1, @acc[1]
+	mov	@acc[2], @acc[6]
+	shr	\$1, @acc[2]
+	mov	@acc[3], @acc[5]
+	shr	\$1, @acc[3]
+
+	shl	\$63, @acc[7]
+	shl	\$63, @acc[6]
+	or	@acc[0], @acc[7]
+	shl	\$63, @acc[5]
+	or	@acc[6], @acc[1]
+	shl	\$63, @acc[4]
+	or	@acc[5], @acc[2]
+	or	@acc[4], @acc[3]
+
+	dec	%edx
+	jnz	.Loop_rshift_mod_256
+
+	mov	@acc[7], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	rshift_mod_256,.-rshift_mod_256
+
+########################################################################
+.globl	cneg_mod_256
+.hidden	cneg_mod_256
+.type	cneg_mod_256,\@function,4,"unwind"
+.align	32
+cneg_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[8]	# load a[0:3]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	@acc[8], @acc[0]
+	mov	8*3($a_ptr), @acc[3]
+	or	@acc[1], @acc[8]
+	or	@acc[2], @acc[8]
+	or	@acc[3], @acc[8]
+	mov	\$-1, @acc[7]
+
+	mov	8*0($n_ptr), @acc[4]	# load n[0:3]
+	cmovnz	@acc[7], @acc[8]	# mask = a[0:3] ? -1 : 0
+	mov	8*1($n_ptr), @acc[5]
+	mov	8*2($n_ptr), @acc[6]
+	and	@acc[8], @acc[4]	# n[0:3] &= mask
+	mov	8*3($n_ptr), @acc[7]
+	and	@acc[8], @acc[5]
+	and	@acc[8], @acc[6]
+	and	@acc[8], @acc[7]
+
+	sub	@acc[0], @acc[4]	# a[0:3] ? n[0:3]-a[0:3] : 0-0
+	sbb	@acc[1], @acc[5]
+	sbb	@acc[2], @acc[6]
+	sbb	@acc[3], @acc[7]
+
+	or	$b_org, $b_org		# check condition flag
+
+	cmovz	@acc[0], @acc[4]	# flag ? n[0:3]-a[0:3] : a[0:3]
+	cmovz	@acc[1], @acc[5]
+	mov	@acc[4], 8*0($r_ptr)
+	cmovz	@acc[2], @acc[6]
+	mov	@acc[5], 8*1($r_ptr)
+	cmovz	@acc[3], @acc[7]
+	mov	@acc[6], 8*2($r_ptr)
+	mov	@acc[7], 8*3($r_ptr)
+
+	mov	0(%rsp),%r12
+.cfi_restore	%r12
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	cneg_mod_256,.-cneg_mod_256
+
+########################################################################
+.globl	sub_mod_256
+.hidden	sub_mod_256
+.type	sub_mod_256,\@function,4,"unwind"
+.align	32
+sub_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+	sub	8*0($b_org), @acc[0]
+	 mov	8*0($n_ptr), @acc[4]
+	sbb	8*1($b_org), @acc[1]
+	 mov	8*1($n_ptr), @acc[5]
+	sbb	8*2($b_org), @acc[2]
+	 mov	8*2($n_ptr), @acc[6]
+	sbb	8*3($b_org), @acc[3]
+	 mov	8*3($n_ptr), @acc[7]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[4]
+	and	$b_org, @acc[5]
+	and	$b_org, @acc[6]
+	and	$b_org, @acc[7]
+
+	add	@acc[4], @acc[0]
+	adc	@acc[5], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[6], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[7], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sub_mod_256,.-sub_mod_256
+
+########################################################################
+.globl	check_mod_256
+.hidden	check_mod_256
+.type	check_mod_256,\@function,2,"unwind"
+.align	32
+check_mod_256:
+.cfi_startproc
+	mov	8*0($r_ptr), %rax
+	mov	8*1($r_ptr), @acc[1]
+	mov	8*2($r_ptr), @acc[2]
+	mov	8*3($r_ptr), @acc[3]
+
+	mov	%rax, @acc[0]		# see if it's zero
+	or	@acc[1], %rax
+	or	@acc[2], %rax
+	or	@acc[3], %rax
+
+	sub	8*0($a_ptr), @acc[0]	# does subtracting modulus borrow?
+	sbb	8*1($a_ptr), @acc[1]
+	sbb	8*2($a_ptr), @acc[2]
+	sbb	8*3($a_ptr), @acc[3]
+	sbb	$a_ptr, $a_ptr
+
+	mov	\$1, %rdx
+	cmp	\$0, %rax
+	cmovne	%rdx, %rax
+	and	$a_ptr, %rax
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	check_mod_256,.-check_mod_256
+
+########################################################################
+.globl	add_n_check_mod_256
+.hidden	add_n_check_mod_256
+.type	add_n_check_mod_256,\@function,4,"unwind"
+.align	32
+add_n_check_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+	add	8*0($b_org), @acc[0]
+	adc	8*1($b_org), @acc[1]
+	 mov	@acc[0], @acc[4]
+	adc	8*2($b_org), @acc[2]
+	 mov	@acc[1], @acc[5]
+	adc	8*3($b_org), @acc[3]
+	sbb	$b_org, $b_org
+
+	 mov	@acc[2], @acc[6]
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	 mov	@acc[3], @acc[7]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[4], @acc[0]
+	cmovc	@acc[5], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	@acc[6], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[7], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	or	@acc[1], @acc[0]
+	or	@acc[3], @acc[2]
+	or	@acc[2], @acc[0]
+	mov	\$1, %rax
+	cmovz	@acc[0], %rax
+
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	add_n_check_mod_256,.-add_n_check_mod_256
+
+########################################################################
+.globl	sub_n_check_mod_256
+.hidden	sub_n_check_mod_256
+.type	sub_n_check_mod_256,\@function,4,"unwind"
+.align	32
+sub_n_check_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+	sub	8*0($b_org), @acc[0]
+	 mov	8*0($n_ptr), @acc[4]
+	sbb	8*1($b_org), @acc[1]
+	 mov	8*1($n_ptr), @acc[5]
+	sbb	8*2($b_org), @acc[2]
+	 mov	8*2($n_ptr), @acc[6]
+	sbb	8*3($b_org), @acc[3]
+	 mov	8*3($n_ptr), @acc[7]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[4]
+	and	$b_org, @acc[5]
+	and	$b_org, @acc[6]
+	and	$b_org, @acc[7]
+
+	add	@acc[4], @acc[0]
+	adc	@acc[5], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[6], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[7], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	or	@acc[1], @acc[0]
+	or	@acc[3], @acc[2]
+	or	@acc[2], @acc[0]
+	mov	\$1, %rax
+	cmovz	@acc[0], %rax
+
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sub_n_check_mod_256,.-sub_n_check_mod_256
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/add_mod_384-armv8.pl b/blst/asm/add_mod_384-armv8.pl
new file mode 100755
index 0000000..c6b2a53
--- /dev/null
+++ b/blst/asm/add_mod_384-armv8.pl
@@ -0,0 +1,872 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);
+
+@mod=map("x$_",(4..9));
+@a=map("x$_",(10..15));
+@b=map("x$_",(16,17,19..22));
+$carry=$n_ptr;
+
+$code.=<<___;
+.text
+
+.globl	add_mod_384
+.hidden	add_mod_384
+.type	add_mod_384,%function
+.align	5
+add_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__add_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	add_mod_384,.-add_mod_384
+
+.type	__add_mod_384,%function
+.align	5
+__add_mod_384:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@b[2],@b[3],[$b_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	ldp	@b[4],@b[5],[$b_ptr,#32]
+
+__add_mod_384_ab_are_loaded:
+	adds	@a[0],@a[0],@b[0]
+	adcs	@a[1],@a[1],@b[1]
+	adcs	@a[2],@a[2],@b[2]
+	adcs	@a[3],@a[3],@b[3]
+	adcs	@a[4],@a[4],@b[4]
+	adcs	@a[5],@a[5],@b[5]
+	adc	$carry,xzr,xzr
+
+	subs	@b[0],@a[0],@mod[0]
+	sbcs	@b[1],@a[1],@mod[1]
+	sbcs	@b[2],@a[2],@mod[2]
+	sbcs	@b[3],@a[3],@mod[3]
+	sbcs	@b[4],@a[4],@mod[4]
+	sbcs	@b[5],@a[5],@mod[5]
+	sbcs	xzr,$carry,xzr
+
+	csel	@a[0],@a[0],@b[0],lo
+	csel	@a[1],@a[1],@b[1],lo
+	csel	@a[2],@a[2],@b[2],lo
+	csel	@a[3],@a[3],@b[3],lo
+	csel	@a[4],@a[4],@b[4],lo
+	csel	@a[5],@a[5],@b[5],lo
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.globl	add_mod_384x
+.hidden	add_mod_384x
+.type	add_mod_384x,%function
+.align	5
+add_mod_384x:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__add_mod_384
+
+	stp	@a[0],@a[1],[$r_ptr]
+	add	$a_ptr,$a_ptr,#48
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	add	$b_ptr,$b_ptr,#48
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	bl	__add_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	add_mod_384x,.-add_mod_384x
+
+.globl	rshift_mod_384
+.hidden	rshift_mod_384
+.type	rshift_mod_384,%function
+.align	5
+rshift_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+.Loop_rshift_mod_384:
+	sub	$b_ptr,$b_ptr,#1
+	bl	__rshift_mod_384
+	cbnz	$b_ptr,.Loop_rshift_mod_384
+
+	ldr	x30,[sp,#8]
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	rshift_mod_384,.-rshift_mod_384
+
+.type	__rshift_mod_384,%function
+.align	5
+__rshift_mod_384:
+	sbfx	@b[5],@a[0],#0,#1
+	 and	@b[0],@b[5],@mod[0]
+	 and	@b[1],@b[5],@mod[1]
+	adds	@a[0],@a[0],@b[0]
+	 and	@b[2],@b[5],@mod[2]
+	adcs	@a[1],@a[1],@b[1]
+	 and	@b[3],@b[5],@mod[3]
+	adcs	@a[2],@a[2],@b[2]
+	 and	@b[4],@b[5],@mod[4]
+	adcs	@a[3],@a[3],@b[3]
+	 and	@b[5],@b[5],@mod[5]
+	adcs	@a[4],@a[4],@b[4]
+	 extr	@a[0],@a[1],@a[0],#1	// a[0:5] >>= 1
+	adcs	@a[5],@a[5],@b[5]
+	 extr	@a[1],@a[2],@a[1],#1
+	adc	@b[5],xzr,xzr
+	 extr	@a[2],@a[3],@a[2],#1
+	 extr	@a[3],@a[4],@a[3],#1
+	 extr	@a[4],@a[5],@a[4],#1
+	 extr	@a[5],@b[5],@a[5],#1
+	ret
+.size	__rshift_mod_384,.-__rshift_mod_384
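__rshift_mod_384 halves a value modulo an odd modulus: sbfx turns the low bit into an all-ones/all-zero mask, the masked modulus is added so the running value becomes even without changing its residue, and the extr instructions shift the resulting 385-bit quantity right by one. A C sketch of the same idea, under the same limb and unsigned __int128 assumptions as the sketch above (illustrative only):

    #include <stdint.h>

    typedef uint64_t limb_t;
    #define NLIMBS 6

    /* r = a/2 mod p for odd p: add p when a is odd, then shift right one bit */
    static void div_by_2_mod_384_sketch(limb_t r[NLIMBS], const limb_t a[NLIMBS],
                                        const limb_t p[NLIMBS])
    {
        limb_t mask = (limb_t)0 - (a[0] & 1);   /* all-ones iff a is odd (sbfx) */
        limb_t t[NLIMBS], carry = 0;

        for (int i = 0; i < NLIMBS; i++) {      /* t = a + (p & mask) */
            unsigned __int128 s = (unsigned __int128)a[i] + (p[i] & mask) + carry;
            t[i] = (limb_t)s;
            carry = (limb_t)(s >> 64);
        }
        for (int i = 0; i < NLIMBS; i++) {      /* 385-bit right shift (extr) */
            limb_t hi = (i + 1 < NLIMBS) ? t[i + 1] : carry;
            r[i] = (t[i] >> 1) | (hi << 63);
        }
    }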
+
+.globl	div_by_2_mod_384
+.hidden	div_by_2_mod_384
+.type	div_by_2_mod_384,%function
+.align	5
+div_by_2_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__rshift_mod_384
+
+	ldr	x30,[sp,#8]
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	div_by_2_mod_384,.-div_by_2_mod_384
+
+.globl	lshift_mod_384
+.hidden	lshift_mod_384
+.type	lshift_mod_384,%function
+.align	5
+lshift_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+.Loop_lshift_mod_384:
+	sub	$b_ptr,$b_ptr,#1
+	bl	__lshift_mod_384
+	cbnz	$b_ptr,.Loop_lshift_mod_384
+
+	ldr	x30,[sp,#8]
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	lshift_mod_384,.-lshift_mod_384
+
+.type	__lshift_mod_384,%function
+.align	5
+__lshift_mod_384:
+	adds	@a[0],@a[0],@a[0]
+	adcs	@a[1],@a[1],@a[1]
+	adcs	@a[2],@a[2],@a[2]
+	adcs	@a[3],@a[3],@a[3]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	$carry,xzr,xzr
+
+	subs	@b[0],@a[0],@mod[0]
+	sbcs	@b[1],@a[1],@mod[1]
+	sbcs	@b[2],@a[2],@mod[2]
+	sbcs	@b[3],@a[3],@mod[3]
+	sbcs	@b[4],@a[4],@mod[4]
+	sbcs	@b[5],@a[5],@mod[5]
+	sbcs	xzr,$carry,xzr
+
+	csel	@a[0],@a[0],@b[0],lo
+	csel	@a[1],@a[1],@b[1],lo
+	csel	@a[2],@a[2],@b[2],lo
+	csel	@a[3],@a[3],@b[3],lo
+	csel	@a[4],@a[4],@b[4],lo
+	csel	@a[5],@a[5],@b[5],lo
+
+	ret
+.size	__lshift_mod_384,.-__lshift_mod_384
+
+.globl	mul_by_3_mod_384
+.hidden	mul_by_3_mod_384
+.type	mul_by_3_mod_384,%function
+.align	5
+mul_by_3_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__lshift_mod_384
+
+	ldp	@b[0],@b[1],[$a_ptr]
+	ldp	@b[2],@b[3],[$a_ptr,#16]
+	ldp	@b[4],@b[5],[$a_ptr,#32]
+
+	bl	__add_mod_384_ab_are_loaded
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	mul_by_3_mod_384,.-mul_by_3_mod_384
+
+.globl	mul_by_8_mod_384
+.hidden	mul_by_8_mod_384
+.type	mul_by_8_mod_384,%function
+.align	5
+mul_by_8_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	mul_by_8_mod_384,.-mul_by_8_mod_384
+
+.globl	mul_by_3_mod_384x
+.hidden	mul_by_3_mod_384x
+.type	mul_by_3_mod_384x,%function
+.align	5
+mul_by_3_mod_384x:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__lshift_mod_384
+
+	ldp	@b[0],@b[1],[$a_ptr]
+	ldp	@b[2],@b[3],[$a_ptr,#16]
+	ldp	@b[4],@b[5],[$a_ptr,#32]
+
+	bl	__add_mod_384_ab_are_loaded
+
+	stp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[0],@a[1],[$a_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	ldp	@a[2],@a[3],[$a_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+	ldp	@a[4],@a[5],[$a_ptr,#80]
+
+	bl	__lshift_mod_384
+
+	ldp	@b[0],@b[1],[$a_ptr,#48]
+	ldp	@b[2],@b[3],[$a_ptr,#64]
+	ldp	@b[4],@b[5],[$a_ptr,#80]
+
+	bl	__add_mod_384_ab_are_loaded
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	mul_by_3_mod_384x,.-mul_by_3_mod_384x
+
+.globl	mul_by_8_mod_384x
+.hidden	mul_by_8_mod_384x
+.type	mul_by_8_mod_384x,%function
+.align	5
+mul_by_8_mod_384x:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+
+	stp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[0],@a[1],[$a_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	ldp	@a[2],@a[3],[$a_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+	ldp	@a[4],@a[5],[$a_ptr,#80]
+
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	mul_by_8_mod_384x,.-mul_by_8_mod_384x
+
+.globl	cneg_mod_384
+.hidden	cneg_mod_384
+.type	cneg_mod_384,%function
+.align	5
+cneg_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+
+	subs	@b[0],@mod[0],@a[0]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+	 orr	$carry,@a[0],@a[1]
+	sbcs	@b[1],@mod[1],@a[1]
+	 orr	$carry,$carry,@a[2]
+	sbcs	@b[2],@mod[2],@a[2]
+	 orr	$carry,$carry,@a[3]
+	sbcs	@b[3],@mod[3],@a[3]
+	 orr	$carry,$carry,@a[4]
+	sbcs	@b[4],@mod[4],@a[4]
+	 orr	$carry,$carry,@a[5]
+	sbc	@b[5],@mod[5],@a[5]
+
+	cmp	$carry,#0
+	csetm	$carry,ne
+	ands	$b_ptr,$b_ptr,$carry
+
+	csel	@a[0],@a[0],@b[0],eq
+	csel	@a[1],@a[1],@b[1],eq
+	csel	@a[2],@a[2],@b[2],eq
+	csel	@a[3],@a[3],@b[3],eq
+	stp	@a[0],@a[1],[$r_ptr]
+	csel	@a[4],@a[4],@b[4],eq
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	csel	@a[5],@a[5],@b[5],eq
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	cneg_mod_384,.-cneg_mod_384
+
+.globl	sub_mod_384
+.hidden	sub_mod_384
+.type	sub_mod_384,%function
+.align	5
+sub_mod_384:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__sub_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	sub_mod_384,.-sub_mod_384
+
+.type	__sub_mod_384,%function
+.align	5
+__sub_mod_384:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@b[2],@b[3],[$b_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	ldp	@b[4],@b[5],[$b_ptr,#32]
+
+	subs	@a[0],@a[0],@b[0]
+	sbcs	@a[1],@a[1],@b[1]
+	sbcs	@a[2],@a[2],@b[2]
+	sbcs	@a[3],@a[3],@b[3]
+	sbcs	@a[4],@a[4],@b[4]
+	sbcs	@a[5],@a[5],@b[5]
+	sbc	$carry,xzr,xzr
+
+	 and	@b[0],@mod[0],$carry
+	 and	@b[1],@mod[1],$carry
+	adds	@a[0],@a[0],@b[0]
+	 and	@b[2],@mod[2],$carry
+	adcs	@a[1],@a[1],@b[1]
+	 and	@b[3],@mod[3],$carry
+	adcs	@a[2],@a[2],@b[2]
+	 and	@b[4],@mod[4],$carry
+	adcs	@a[3],@a[3],@b[3]
+	 and	@b[5],@mod[5],$carry
+	adcs	@a[4],@a[4],@b[4]
+	adc	@a[5],@a[5],@b[5]
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+
+.globl	sub_mod_384x
+.hidden	sub_mod_384x
+.type	sub_mod_384x,%function
+.align	5
+sub_mod_384x:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__sub_mod_384
+
+	stp	@a[0],@a[1],[$r_ptr]
+	add	$a_ptr,$a_ptr,#48
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	add	$b_ptr,$b_ptr,#48
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	bl	__sub_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	sub_mod_384x,.-sub_mod_384x
+
+.globl	mul_by_1_plus_i_mod_384x
+.hidden	mul_by_1_plus_i_mod_384x
+.type	mul_by_1_plus_i_mod_384x,%function
+.align	5
+mul_by_1_plus_i_mod_384x:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+	add	$b_ptr,$a_ptr,#48
+
+	bl	__sub_mod_384			// a->re - a->im
+
+	ldp	@b[0],@b[1],[$a_ptr]
+	ldp	@b[2],@b[3],[$a_ptr,#16]
+	ldp	@b[4],@b[5],[$a_ptr,#32]
+	stp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[0],@a[1],[$a_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	ldp	@a[2],@a[3],[$a_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+	ldp	@a[4],@a[5],[$a_ptr,#80]
+
+	bl	__add_mod_384_ab_are_loaded	// a->re + a->im
+	ldr	x30,[sp,#8]
+
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
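mul_by_1_plus_i_mod_384x relies on (a + b*i)*(1 + i) = (a - b) + (a + b)*i, since i^2 = -1, so the routine is one modular subtraction and one modular addition over the two 384-bit halves. A sketch in C using the add_mod_384/sub_mod_384 entry points generated in this file; the (ret, a, b, mod) argument order is taken from the x0..x3 register mapping above, and the prototypes below are written by hand for the sketch. Unlike the assembly, which reads both halves before writing, this naive version assumes ret does not alias a.

    #include <stdint.h>

    typedef uint64_t vec384[6];

    /* entry points generated above; (ret, a, b, mod) per the x0..x3 mapping */
    void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
    void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);

    /* ret = a * (1 + i) in Fp2, where a = (re, im); assumes ret != a */
    static void mul_by_1_plus_i_sketch(vec384 ret[2], const vec384 a[2],
                                       const vec384 p)
    {
        sub_mod_384(ret[0], a[0], a[1], p);   /* ret->re = a->re - a->im */
        add_mod_384(ret[1], a[0], a[1], p);   /* ret->im = a->re + a->im */
    }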
+
+.globl	sgn0_pty_mod_384
+.hidden	sgn0_pty_mod_384
+.type	sgn0_pty_mod_384,%function
+.align	5
+sgn0_pty_mod_384:
+	ldp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[2],@a[3],[$r_ptr,#16]
+	ldp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$a_ptr]
+	ldp	@mod[2],@mod[3],[$a_ptr,#16]
+	ldp	@mod[4],@mod[5],[$a_ptr,#32]
+
+	and	$r_ptr,@a[0],#1
+	adds	@a[0],@a[0],@a[0]
+	adcs	@a[1],@a[1],@a[1]
+	adcs	@a[2],@a[2],@a[2]
+	adcs	@a[3],@a[3],@a[3]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	$carry,xzr,xzr
+
+	subs	@a[0],@a[0],@mod[0]
+	sbcs	@a[1],@a[1],@mod[1]
+	sbcs	@a[2],@a[2],@mod[2]
+	sbcs	@a[3],@a[3],@mod[3]
+	sbcs	@a[4],@a[4],@mod[4]
+	sbcs	@a[5],@a[5],@mod[5]
+	sbc	$carry,$carry,xzr
+
+	mvn	$carry,$carry
+	and	$carry,$carry,#2
+	orr	$r_ptr,$r_ptr,$carry
+
+	ret
+.size	sgn0_pty_mod_384,.-sgn0_pty_mod_384
+
+.globl	sgn0_pty_mod_384x
+.hidden	sgn0_pty_mod_384x
+.type	sgn0_pty_mod_384x,%function
+.align	5
+sgn0_pty_mod_384x:
+	ldp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[2],@a[3],[$r_ptr,#16]
+	ldp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$a_ptr]
+	ldp	@mod[2],@mod[3],[$a_ptr,#16]
+	ldp	@mod[4],@mod[5],[$a_ptr,#32]
+
+	and	$b_ptr,@a[0],#1
+	 orr	$n_ptr,@a[0],@a[1]
+	adds	@a[0],@a[0],@a[0]
+	 orr	$n_ptr,$n_ptr,@a[2]
+	adcs	@a[1],@a[1],@a[1]
+	 orr	$n_ptr,$n_ptr,@a[3]
+	adcs	@a[2],@a[2],@a[2]
+	 orr	$n_ptr,$n_ptr,@a[4]
+	adcs	@a[3],@a[3],@a[3]
+	 orr	$n_ptr,$n_ptr,@a[5]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	@b[0],xzr,xzr
+
+	subs	@a[0],@a[0],@mod[0]
+	sbcs	@a[1],@a[1],@mod[1]
+	sbcs	@a[2],@a[2],@mod[2]
+	sbcs	@a[3],@a[3],@mod[3]
+	sbcs	@a[4],@a[4],@mod[4]
+	sbcs	@a[5],@a[5],@mod[5]
+	sbc	@b[0],@b[0],xzr
+
+	ldp	@a[0],@a[1],[$r_ptr,#48]
+	ldp	@a[2],@a[3],[$r_ptr,#64]
+	ldp	@a[4],@a[5],[$r_ptr,#80]
+
+	mvn	@b[0],@b[0]
+	and	@b[0],@b[0],#2
+	orr	$b_ptr,$b_ptr,@b[0]
+
+	and	$r_ptr,@a[0],#1
+	 orr	$a_ptr,@a[0],@a[1]
+	adds	@a[0],@a[0],@a[0]
+	 orr	$a_ptr,$a_ptr,@a[2]
+	adcs	@a[1],@a[1],@a[1]
+	 orr	$a_ptr,$a_ptr,@a[3]
+	adcs	@a[2],@a[2],@a[2]
+	 orr	$a_ptr,$a_ptr,@a[4]
+	adcs	@a[3],@a[3],@a[3]
+	 orr	$a_ptr,$a_ptr,@a[5]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	@b[0],xzr,xzr
+
+	subs	@a[0],@a[0],@mod[0]
+	sbcs	@a[1],@a[1],@mod[1]
+	sbcs	@a[2],@a[2],@mod[2]
+	sbcs	@a[3],@a[3],@mod[3]
+	sbcs	@a[4],@a[4],@mod[4]
+	sbcs	@a[5],@a[5],@mod[5]
+	sbc	@b[0],@b[0],xzr
+
+	mvn	@b[0],@b[0]
+	and	@b[0],@b[0],#2
+	orr	$r_ptr,$r_ptr,@b[0]
+
+	cmp	$n_ptr,#0
+	csel	$n_ptr,$r_ptr,$b_ptr,eq	// a->re==0? prty(a->im) : prty(a->re)
+
+	cmp	$a_ptr,#0
+	csel	$a_ptr,$r_ptr,$b_ptr,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	$n_ptr,$n_ptr,#1
+	and	$a_ptr,$a_ptr,#2
+	orr	$r_ptr,$a_ptr,$n_ptr	// pack sign and parity
+
+	ret
+.size	sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
+___
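sgn0_pty_mod_384 packs two bits: bit 0 is the parity of the value, and bit 1 is set when 2*x exceeds the (odd) modulus, i.e. when x lies in the upper half of the field and counts as "negative". sgn0_pty_mod_384x then takes parity from the real part unless it is zero and the sign from the imaginary part unless it is zero, exactly as the csel comments above say. A C sketch of both, with the same limb and unsigned __int128 caveats as the earlier sketches:

    #include <stdint.h>

    typedef uint64_t limb_t;
    #define NLIMBS 6

    /* bit 0: parity of a; bit 1: set iff 2*a > p (p odd), i.e. a > p/2 */
    static limb_t sgn0_pty_mod_384_sketch(const limb_t a[NLIMBS],
                                          const limb_t p[NLIMBS])
    {
        limb_t t[NLIMBS], carry = 0, borrow = 0, parity = a[0] & 1;

        for (int i = 0; i < NLIMBS; i++) {      /* t = 2*a */
            unsigned __int128 s = ((unsigned __int128)a[i] << 1) + carry;
            t[i] = (limb_t)s;
            carry = (limb_t)(s >> 64);
        }
        for (int i = 0; i < NLIMBS; i++) {      /* only the final borrow of 2*a - p matters */
            unsigned __int128 s = (unsigned __int128)t[i] - p[i] - borrow;
            borrow = (limb_t)(s >> 64) & 1;
        }
        limb_t sign = (carry | (borrow ^ 1)) & 1;   /* (carry:2*a) >= p */
        return (sign << 1) | parity;
    }

    /* parity from a->re unless it is zero; sign from a->im unless it is zero */
    static limb_t sgn0_pty_mod_384x_sketch(const limb_t re[NLIMBS],
                                           const limb_t im[NLIMBS],
                                           const limb_t p[NLIMBS])
    {
        limb_t re_nz = 0, im_nz = 0;
        for (int i = 0; i < NLIMBS; i++) { re_nz |= re[i]; im_nz |= im[i]; }

        limb_t s_re = sgn0_pty_mod_384_sketch(re, p);
        limb_t s_im = sgn0_pty_mod_384_sketch(im, p);

        limb_t parity = (re_nz ? s_re : s_im) & 1;
        limb_t sign   = (im_nz ? s_im : s_re) & 2;
        return sign | parity;
    }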
+if (1) {
+sub vec_select {
+my $sz = shift;
+my @v=map("v$_",(0..5,16..21));
+
+$code.=<<___;
+.globl	vec_select_$sz
+.hidden	vec_select_$sz
+.type	vec_select_$sz,%function
+.align	5
+vec_select_$sz:
+	dup	v6.2d, $n_ptr
+	ld1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
+	cmeq	v6.2d, v6.2d, #0
+	ld1	{@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
+___
+for($i=0; $i<$sz-48; $i+=48) {
+$code.=<<___;
+	bit	@v[0].16b, @v[3].16b, v6.16b
+	ld1	{@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
+	bit	@v[1].16b, @v[4].16b, v6.16b
+	ld1	{@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
+	bit	@v[2].16b, @v[5].16b, v6.16b
+	st1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
+___
+	@v = @v[6..11,0..5];
+}
+$code.=<<___;
+	bit	@v[0].16b, @v[3].16b, v6.16b
+	bit	@v[1].16b, @v[4].16b, v6.16b
+	bit	@v[2].16b, @v[5].16b, v6.16b
+	st1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
+	ret
+.size	vec_select_$sz,.-vec_select_$sz
+___
+}
+vec_select(48);
+vec_select(96);
+vec_select(192);
+vec_select(144);
+vec_select(288);
+}
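vec_select_N chooses between two N-byte vectors without branching on the selector: the selector is broadcast, compared against zero to form a lane mask, and the bit (or pand/por on x86_64) instructions merge the inputs, so the instruction flow and memory access pattern do not depend on a possibly secret selector. A byte-wise C sketch of the same semantics (first input when the selector is non-zero, second when it is zero), illustration only:

    #include <stddef.h>
    #include <stdint.h>

    /* out = sel ? inp1 : inp2, byte-wise and without branching on sel */
    static void vec_select_sketch(uint8_t *out, const uint8_t *inp1,
                                  const uint8_t *inp2, size_t len, uint64_t sel)
    {
        uint8_t mask = (uint8_t)(0 - (uint64_t)(sel != 0));   /* 0xff or 0x00 */
        for (size_t i = 0; i < len; i++)
            out[i] = (uint8_t)(inp2[i] ^ (mask & (inp1[i] ^ inp2[i])));
    }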
+
+{
+my ($inp, $end, $step) = map("x$_", (0..2));
+
+$code.=<<___;
+.globl	vec_prefetch
+.hidden	vec_prefetch
+.type	vec_prefetch,%function
+.align	5
+vec_prefetch:
+	add	$end, $end, $inp
+	sub	$end, $end, #1
+	mov	$step, #64
+	prfm	pldl1keep, [$inp]
+	add	$inp, $inp, $step
+	cmp	$inp, $end
+	csel	$inp, $end, $inp, hi
+	csel	$step, xzr, $step, hi
+	prfm	pldl1keep, [$inp]
+	add	$inp, $inp, $step
+	cmp	$inp, $end
+	csel	$inp, $end, $inp, hi
+	csel	$step, xzr, $step, hi
+	prfm	pldl1keep, [$inp]
+	add	$inp, $inp, $step
+	cmp	$inp, $end
+	csel	$inp, $end, $inp, hi
+	csel	$step, xzr, $step, hi
+	prfm	pldl1keep, [$inp]
+	add	$inp, $inp, $step
+	cmp	$inp, $end
+	csel	$inp, $end, $inp, hi
+	csel	$step, xzr, $step, hi
+	prfm	pldl1keep, [$inp]
+	add	$inp, $inp, $step
+	cmp	$inp, $end
+	csel	$inp, $end, $inp, hi
+	csel	$step, xzr, $step, hi
+	prfm	pldl1keep, [$inp]
+	add	$inp, $inp, $step
+	cmp	$inp, $end
+	csel	$inp, $end, $inp, hi
+	prfm	pldl1keep, [$inp]
+	ret
+.size	vec_prefetch,.-vec_prefetch
+___
+}
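vec_prefetch touches up to seven cache lines covering [inp, inp+len): after each prfm the pointer advances by 64 bytes but is clamped to the last byte, and the step collapses to zero once the end is reached, so no branch on len is needed. Roughly the following, expressed with the GCC/Clang __builtin_prefetch intrinsic; this is a loose sketch, and the cache-hint details differ between pldl1keep here and prefetchnta in the x86_64 version below.

    #include <stddef.h>

    /* hint the cache to pull in the given range, at most 7 lines' worth; len > 0 */
    static void vec_prefetch_sketch(const void *ptr, size_t len)
    {
        const char *p = (const char *)ptr;
        size_t off = 0, last = len - 1;

        for (int i = 0; i < 7; i++) {
            __builtin_prefetch(p + off);
            off = (off + 64 > last) ? last : off + 64;  /* clamp, like csel ...,hi */
        }
    }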
+
+print $code;
+
+close STDOUT;
diff --git a/blst/asm/add_mod_384-x86_64.pl b/blst/asm/add_mod_384-x86_64.pl
new file mode 100755
index 0000000..88dde45
--- /dev/null
+++ b/blst/asm/add_mod_384-x86_64.pl
@@ -0,0 +1,1430 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$b_ptr = "%rbx";
+
+{ ############################################################## 384 bits add
+my @acc=map("%r$_",(8..15, "ax", "bx", "bp"));
+   push(@acc, $a_ptr);
+
+$code.=<<___;
+.text
+
+.globl	add_mod_384
+.hidden	add_mod_384
+.type	add_mod_384,\@function,4,"unwind"
+.align	32
+add_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	call	__add_mod_384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	add_mod_384,.-add_mod_384
+
+.type	__add_mod_384,\@abi-omnipotent
+.align	32
+__add_mod_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+__add_mod_384_a_is_loaded:
+	add	8*0($b_org), @acc[0]
+	adc	8*1($b_org), @acc[1]
+	adc	8*2($b_org), @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	8*3($b_org), @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	8*4($b_org), @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	8*5($b_org), @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$b_org, $b_org
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	@acc[9],  @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[10], @acc[4]
+	mov	@acc[2], 8*2($r_ptr)
+	cmovc	@acc[11], @acc[5]
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.globl	add_mod_384x
+.hidden	add_mod_384x
+.type	add_mod_384x,\@function,4,"unwind"
+.align	32
+add_mod_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$24, %rsp
+.cfi_adjust_cfa_offset	24
+.cfi_end_prologue
+
+	mov	$a_ptr, 8*0(%rsp)
+	mov	$b_org, 8*1(%rsp)
+	lea	48($a_ptr), $a_ptr	# a->im
+	lea	48($b_org), $b_org	# b->im
+	lea	48($r_ptr), $r_ptr	# ret->im
+	call	__add_mod_384		# add_mod_384(ret->im, a->im, b->im, mod);
+
+	mov	8*0(%rsp), $a_ptr	# a->re
+	mov	8*1(%rsp), $b_org	# b->re
+	lea	-48($r_ptr), $r_ptr	# ret->re
+	call	__add_mod_384		# add_mod_384(ret->re, a->re, b->re, mod);
+
+	mov	24+8*0(%rsp),%r15
+.cfi_restore	%r15
+	mov	24+8*1(%rsp),%r14
+.cfi_restore	%r14
+	mov	24+8*2(%rsp),%r13
+.cfi_restore	%r13
+	mov	24+8*3(%rsp),%r12
+.cfi_restore	%r12
+	mov	24+8*4(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	24+8*5(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24+8*6(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	add_mod_384x,.-add_mod_384x
+
+########################################################################
+.globl	rshift_mod_384
+.hidden	rshift_mod_384
+.type	rshift_mod_384,\@function,4,"unwind"
+.align	32
+rshift_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$r_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+.Loop_rshift_mod_384:
+	call	__rshift_mod_384
+	dec	%edx
+	jnz	.Loop_rshift_mod_384
+
+	mov	@acc[0], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	rshift_mod_384,.-rshift_mod_384
+
+.type	__rshift_mod_384,\@abi-omnipotent
+.align	32
+__rshift_mod_384:
+	mov	\$1, @acc[11]
+	mov	8*0($n_ptr), @acc[6]
+	and	@acc[0], @acc[11]
+	mov	8*1($n_ptr), @acc[7]
+	neg	@acc[11]
+	mov	8*2($n_ptr), @acc[8]
+	and	@acc[11], @acc[6]
+	mov	8*3($n_ptr), @acc[9]
+	and	@acc[11], @acc[7]
+	mov	8*4($n_ptr), @acc[10]
+	and	@acc[11], @acc[8]
+	and	@acc[11], @acc[9]
+	and	@acc[11], @acc[10]
+	and	8*5($n_ptr), @acc[11]
+
+	add	@acc[0], @acc[6]
+	adc	@acc[1], @acc[7]
+	adc	@acc[2], @acc[8]
+	adc	@acc[3], @acc[9]
+	adc	@acc[4], @acc[10]
+	adc	@acc[5], @acc[11]
+	sbb	@acc[5], @acc[5]
+
+	shr	\$1, @acc[6]
+	mov	@acc[7], @acc[0]
+	shr	\$1, @acc[7]
+	mov	@acc[8], @acc[1]
+	shr	\$1, @acc[8]
+	mov	@acc[9], @acc[2]
+	shr	\$1, @acc[9]
+	mov	@acc[10], @acc[3]
+	shr	\$1, @acc[10]
+	mov	@acc[11], @acc[4]
+	shr	\$1, @acc[11]
+	shl	\$63, @acc[0]
+	shl	\$63, @acc[1]
+	or	@acc[6], @acc[0]
+	shl	\$63, @acc[2]
+	or	@acc[7], @acc[1]
+	shl	\$63, @acc[3]
+	or	@acc[8], @acc[2]
+	shl	\$63, @acc[4]
+	or	@acc[9], @acc[3]
+	shl	\$63, @acc[5]
+	or	@acc[10], @acc[4]
+	or	@acc[11], @acc[5]
+
+	ret
+.size	__rshift_mod_384,.-__rshift_mod_384
+
+.globl	div_by_2_mod_384
+.hidden	div_by_2_mod_384
+.type	div_by_2_mod_384,\@function,3,"unwind"
+.align	32
+div_by_2_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$r_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	$b_org, $n_ptr
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	call	__rshift_mod_384
+
+	mov	@acc[0], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	div_by_2_mod_384,.-div_by_2_mod_384
+
+########################################################################
+.globl	lshift_mod_384
+.hidden	lshift_mod_384
+.type	lshift_mod_384,\@function,4,"unwind"
+.align	32
+lshift_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$r_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+.Loop_lshift_mod_384:
+	add	@acc[0], @acc[0]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	@acc[3], @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	@acc[4], @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	@acc[5], @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$r_ptr, $r_ptr
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $r_ptr
+
+	mov	(%rsp), $r_ptr
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	cmovc	@acc[9],  @acc[3]
+	cmovc	@acc[10], @acc[4]
+	cmovc	@acc[11], @acc[5]
+
+	dec	%edx
+	jnz	.Loop_lshift_mod_384
+
+	mov	@acc[0], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	lshift_mod_384,.-lshift_mod_384
+
+.type	__lshift_mod_384,\@abi-omnipotent
+.align	32
+__lshift_mod_384:
+	add	@acc[0], @acc[0]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	@acc[3], @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	@acc[4], @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	@acc[5], @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$b_org, $b_org
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	cmovc	@acc[9],  @acc[3]
+	cmovc	@acc[10], @acc[4]
+	cmovc	@acc[11], @acc[5]
+
+	ret
+.size	__lshift_mod_384,.-__lshift_mod_384
+
+########################################################################
+.globl	mul_by_3_mod_384
+.hidden	mul_by_3_mod_384
+.type	mul_by_3_mod_384,\@function,3,"unwind"
+.align	32
+mul_by_3_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$a_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	$b_org, $n_ptr
+
+	call	__lshift_mod_384
+
+	mov	(%rsp), $b_org
+	call	__add_mod_384_a_is_loaded
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_by_3_mod_384,.-mul_by_3_mod_384
+
+.globl	mul_by_8_mod_384
+.hidden	mul_by_8_mod_384
+.type	mul_by_8_mod_384,\@function,3,"unwind"
+.align	32
+mul_by_8_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	$b_org, $n_ptr
+
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+
+	mov	@acc[0], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_by_8_mod_384,.-mul_by_8_mod_384
+
+########################################################################
+.globl	mul_by_3_mod_384x
+.hidden	mul_by_3_mod_384x
+.type	mul_by_3_mod_384x,\@function,3,"unwind"
+.align	32
+mul_by_3_mod_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$a_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	$b_org, $n_ptr
+
+	call	__lshift_mod_384
+
+	mov	(%rsp), $b_org
+	call	__add_mod_384_a_is_loaded
+
+	mov	(%rsp), $a_ptr
+	lea	8*6($r_ptr), $r_ptr
+
+	mov	8*6($a_ptr), @acc[0]
+	mov	8*7($a_ptr), @acc[1]
+	mov	8*8($a_ptr), @acc[2]
+	mov	8*9($a_ptr), @acc[3]
+	mov	8*10($a_ptr), @acc[4]
+	mov	8*11($a_ptr), @acc[5]
+
+	call	__lshift_mod_384
+
+	mov	\$8*6, $b_org
+	add	(%rsp), $b_org
+	call	__add_mod_384_a_is_loaded
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_by_3_mod_384x,.-mul_by_3_mod_384x
+
+.globl	mul_by_8_mod_384x
+.hidden	mul_by_8_mod_384x
+.type	mul_by_8_mod_384x,\@function,3,"unwind"
+.align	32
+mul_by_8_mod_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$a_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	$b_org, $n_ptr
+
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+
+	mov	(%rsp), $a_ptr
+	mov	@acc[0], 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	mov	48+8*0($a_ptr), @acc[0]
+	mov	48+8*1($a_ptr), @acc[1]
+	mov	48+8*2($a_ptr), @acc[2]
+	mov	48+8*3($a_ptr), @acc[3]
+	mov	48+8*4($a_ptr), @acc[4]
+	mov	48+8*5($a_ptr), @acc[5]
+
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+
+	mov	@acc[0], 48+8*0($r_ptr)
+	mov	@acc[1], 48+8*1($r_ptr)
+	mov	@acc[2], 48+8*2($r_ptr)
+	mov	@acc[3], 48+8*3($r_ptr)
+	mov	@acc[4], 48+8*4($r_ptr)
+	mov	@acc[5], 48+8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_by_8_mod_384x,.-mul_by_8_mod_384x
+
+########################################################################
+.globl	cneg_mod_384
+.hidden	cneg_mod_384
+.type	cneg_mod_384,\@function,4,"unwind"
+.align	32
+cneg_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$b_org			# condition flag
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), $b_org	# load a[0:5]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	$b_org, @acc[0]
+	mov	8*3($a_ptr), @acc[3]
+	or	@acc[1], $b_org
+	mov	8*4($a_ptr), @acc[4]
+	or	@acc[2], $b_org
+	mov	8*5($a_ptr), @acc[5]
+	or	@acc[3], $b_org
+	mov	\$-1, @acc[11]
+	or	@acc[4], $b_org
+	or	@acc[5], $b_org
+
+	mov	8*0($n_ptr), @acc[6]	# load n[0:5]
+	cmovnz	@acc[11], $b_org	# mask = a[0:5] ? -1 : 0
+	mov	8*1($n_ptr), @acc[7]
+	mov	8*2($n_ptr), @acc[8]
+	and	$b_org, @acc[6]		# n[0:5] &= mask
+	mov	8*3($n_ptr), @acc[9]
+	and	$b_org, @acc[7]
+	mov	8*4($n_ptr), @acc[10]
+	and	$b_org, @acc[8]
+	mov	8*5($n_ptr), @acc[11]
+	and	$b_org, @acc[9]
+	mov	0(%rsp), $n_ptr		# restore condition flag
+	and	$b_org, @acc[10]
+	and	$b_org, @acc[11]
+
+	sub	@acc[0], @acc[6]	# a[0:5] ? n[0:5]-a[0:5] : 0-0
+	sbb	@acc[1], @acc[7]
+	sbb	@acc[2], @acc[8]
+	sbb	@acc[3], @acc[9]
+	sbb	@acc[4], @acc[10]
+	sbb	@acc[5], @acc[11]
+
+	or	$n_ptr, $n_ptr		# check condition flag
+
+	cmovz	@acc[0], @acc[6]	# flag ? n[0:5]-a[0:5] : a[0:5]
+	cmovz	@acc[1], @acc[7]
+	cmovz	@acc[2], @acc[8]
+	mov	@acc[6], 8*0($r_ptr)
+	cmovz	@acc[3], @acc[9]
+	mov	@acc[7], 8*1($r_ptr)
+	cmovz	@acc[4], @acc[10]
+	mov	@acc[8], 8*2($r_ptr)
+	cmovz	@acc[5], @acc[11]
+	mov	@acc[9], 8*3($r_ptr)
+	mov	@acc[10], 8*4($r_ptr)
+	mov	@acc[11], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	cneg_mod_384,.-cneg_mod_384
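cneg_mod_384 negates conditionally and branch-free: it computes the difference against a masked copy of the modulus (so that a == 0 yields 0 rather than p) and then uses the caller's flag to pick between a and the negation with cmovz. A C sketch under the same limb and unsigned __int128 assumptions as the earlier sketches; unlike the assembly, the final selection here is not constant-time.

    #include <stdint.h>

    typedef uint64_t limb_t;
    #define NLIMBS 6

    /* r = flag ? (p - a) mod p : a, with the negation of 0 kept as 0, not p */
    static void cneg_mod_384_sketch(limb_t r[NLIMBS], const limb_t a[NLIMBS],
                                    limb_t flag, const limb_t p[NLIMBS])
    {
        limb_t nz = 0, borrow = 0, neg[NLIMBS];

        for (int i = 0; i < NLIMBS; i++)
            nz |= a[i];
        limb_t pmask = (limb_t)0 - (limb_t)(nz != 0);   /* zero the modulus if a == 0 */

        for (int i = 0; i < NLIMBS; i++) {              /* neg = (p & pmask) - a */
            unsigned __int128 t = (unsigned __int128)(p[i] & pmask) - a[i] - borrow;
            neg[i] = (limb_t)t;
            borrow = (limb_t)(t >> 64) & 1;
        }
        for (int i = 0; i < NLIMBS; i++)                /* the assembly uses cmovz here */
            r[i] = flag ? neg[i] : a[i];
    }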
+
+########################################################################
+.globl	sub_mod_384
+.hidden	sub_mod_384
+.type	sub_mod_384,\@function,4,"unwind"
+.align	32
+sub_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	call	__sub_mod_384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sub_mod_384,.-sub_mod_384
+
+.type	__sub_mod_384,\@abi-omnipotent
+.align	32
+__sub_mod_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	sub	8*0($b_org), @acc[0]
+	 mov	8*0($n_ptr), @acc[6]
+	sbb	8*1($b_org), @acc[1]
+	 mov	8*1($n_ptr), @acc[7]
+	sbb	8*2($b_org), @acc[2]
+	 mov	8*2($n_ptr), @acc[8]
+	sbb	8*3($b_org), @acc[3]
+	 mov	8*3($n_ptr), @acc[9]
+	sbb	8*4($b_org), @acc[4]
+	 mov	8*4($n_ptr), @acc[10]
+	sbb	8*5($b_org), @acc[5]
+	 mov	8*5($n_ptr), @acc[11]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[6]
+	and	$b_org, @acc[7]
+	and	$b_org, @acc[8]
+	and	$b_org, @acc[9]
+	and	$b_org, @acc[10]
+	and	$b_org, @acc[11]
+
+	add	@acc[6], @acc[0]
+	adc	@acc[7], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[8], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[9], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	adc	@acc[10], @acc[4]
+	mov	@acc[3], 8*3($r_ptr)
+	adc	@acc[11], @acc[5]
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+
+.globl	sub_mod_384x
+.hidden	sub_mod_384x
+.type	sub_mod_384x,\@function,4,"unwind"
+.align	32
+sub_mod_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$24, %rsp
+.cfi_adjust_cfa_offset	24
+.cfi_end_prologue
+
+	mov	$a_ptr, 8*0(%rsp)
+	mov	$b_org, 8*1(%rsp)
+	lea	48($a_ptr), $a_ptr	# a->im
+	lea	48($b_org), $b_org	# b->im
+	lea	48($r_ptr), $r_ptr	# ret->im
+	call	__sub_mod_384		# sub_mod_384(ret->im, a->im, b->im, mod);
+
+	mov	8*0(%rsp), $a_ptr	# a->re
+	mov	8*1(%rsp), $b_org	# b->re
+	lea	-48($r_ptr), $r_ptr	# ret->re
+	call	__sub_mod_384		# sub_mod_384(ret->re, a->re, b->re, mod);
+
+	mov	24+8*0(%rsp),%r15
+.cfi_restore	%r15
+	mov	24+8*1(%rsp),%r14
+.cfi_restore	%r14
+	mov	24+8*2(%rsp),%r13
+.cfi_restore	%r13
+	mov	24+8*3(%rsp),%r12
+.cfi_restore	%r12
+	mov	24+8*4(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	24+8*5(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24+8*6(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sub_mod_384x,.-sub_mod_384x
+___
+}
+{ ###################################################### ret = a * (1 + i)
+my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx");
+my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp"));
+
+$code.=<<___;
+.globl	mul_by_1_plus_i_mod_384x
+.hidden	mul_by_1_plus_i_mod_384x
+.type	mul_by_1_plus_i_mod_384x,\@function,3,"unwind"
+.align	32
+mul_by_1_plus_i_mod_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$56, %rsp
+.cfi_adjust_cfa_offset	56
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	mov	@acc[0], @acc[6]
+	add	8*6($a_ptr), @acc[0]	# a->re + a->im
+	mov	@acc[1], @acc[7]
+	adc	8*7($a_ptr), @acc[1]
+	mov	@acc[2], @acc[8]
+	adc	8*8($a_ptr), @acc[2]
+	mov	@acc[3], @acc[9]
+	adc	8*9($a_ptr), @acc[3]
+	mov	@acc[4], @acc[10]
+	adc	8*10($a_ptr), @acc[4]
+	mov	@acc[5], @acc[11]
+	adc	8*11($a_ptr), @acc[5]
+	mov	$r_ptr, 8*6(%rsp)	# offload r_ptr
+	sbb	$r_ptr, $r_ptr
+
+	sub	8*6($a_ptr), @acc[6]	# a->re - a->im
+	sbb	8*7($a_ptr), @acc[7]
+	sbb	8*8($a_ptr), @acc[8]
+	sbb	8*9($a_ptr), @acc[9]
+	sbb	8*10($a_ptr), @acc[10]
+	sbb	8*11($a_ptr), @acc[11]
+	sbb	$a_ptr, $a_ptr
+
+	mov	@acc[0], 8*0(%rsp)	# offload a->re + a->im [without carry]
+	 mov	8*0($n_ptr), @acc[0]
+	mov	@acc[1], 8*1(%rsp)
+	 mov	8*1($n_ptr), @acc[1]
+	mov	@acc[2], 8*2(%rsp)
+	 mov	8*2($n_ptr), @acc[2]
+	mov	@acc[3], 8*3(%rsp)
+	 mov	8*3($n_ptr), @acc[3]
+	mov	@acc[4], 8*4(%rsp)
+	 and	$a_ptr, @acc[0]
+	 mov	8*4($n_ptr), @acc[4]
+	mov	@acc[5], 8*5(%rsp)
+	 and	$a_ptr, @acc[1]
+	 mov	8*5($n_ptr), @acc[5]
+	 and	$a_ptr, @acc[2]
+	 and	$a_ptr, @acc[3]
+	 and	$a_ptr, @acc[4]
+	 and	$a_ptr, @acc[5]
+	mov	8*6(%rsp), $a_ptr	# restore r_ptr
+
+	add	@acc[0], @acc[6]
+	 mov	8*0(%rsp), @acc[0]	# restore a->re + a->im
+	adc	@acc[1], @acc[7]
+	 mov	8*1(%rsp), @acc[1]
+	adc	@acc[2], @acc[8]
+	 mov	8*2(%rsp), @acc[2]
+	adc	@acc[3], @acc[9]
+	 mov	8*3(%rsp), @acc[3]
+	adc	@acc[4], @acc[10]
+	 mov	8*4(%rsp), @acc[4]
+	adc	@acc[5], @acc[11]
+	 mov	8*5(%rsp), @acc[5]
+
+	mov	@acc[6], 8*0($a_ptr)	# ret->re = a->re - a->im
+	 mov	@acc[0], @acc[6]
+	mov	@acc[7], 8*1($a_ptr)
+	mov	@acc[8], 8*2($a_ptr)
+	 mov	@acc[1], @acc[7]
+	mov	@acc[9], 8*3($a_ptr)
+	mov	@acc[10], 8*4($a_ptr)
+	 mov	@acc[2], @acc[8]
+	mov	@acc[11], 8*5($a_ptr)
+
+	sub	8*0($n_ptr), @acc[0]
+	 mov	@acc[3], @acc[9]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	 mov	@acc[4], @acc[10]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $r_ptr
+
+	cmovc	@acc[6], @acc[0]
+	cmovc	@acc[7], @acc[1]
+	cmovc	@acc[8], @acc[2]
+	mov	@acc[0], 8*6($a_ptr)	# ret->im = a->re + a->im
+	cmovc	@acc[9], @acc[3]
+	mov	@acc[1], 8*7($a_ptr)
+	cmovc	@acc[10], @acc[4]
+	mov	@acc[2], 8*8($a_ptr)
+	cmovc	@acc[11], @acc[5]
+	mov	@acc[3], 8*9($a_ptr)
+	mov	@acc[4], 8*10($a_ptr)
+	mov	@acc[5], 8*11($a_ptr)
+
+	mov	56+8*0(%rsp),%r15
+.cfi_restore	%r15
+	mov	56+8*1(%rsp),%r14
+.cfi_restore	%r14
+	mov	56+8*2(%rsp),%r13
+.cfi_restore	%r13
+	mov	56+8*3(%rsp),%r12
+.cfi_restore	%r12
+	mov	56+8*4(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	56+8*5(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56+8*6(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
+___
+}
+{ ######################################################
+my ($r_ptr,$n_ptr) = ("%rdi","%rsi");
+my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp"));
+
+$code.=<<___;
+.globl	sgn0_pty_mod_384
+.hidden	sgn0_pty_mod_384
+.type	sgn0_pty_mod_384,\@function,2,"unwind"
+.align	32
+sgn0_pty_mod_384:
+.cfi_startproc
+.cfi_end_prologue
+	mov	8*0($r_ptr), @acc[0]
+	mov	8*1($r_ptr), @acc[1]
+	mov	8*2($r_ptr), @acc[2]
+	mov	8*3($r_ptr), @acc[3]
+	mov	8*4($r_ptr), @acc[4]
+	mov	8*5($r_ptr), @acc[5]
+
+	xor	%rax, %rax
+	mov	@acc[0], $r_ptr
+	add	@acc[0], @acc[0]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, %rax
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, %rax
+
+	not	%rax			# 2*x > p, which means "negative"
+	and	\$1, $r_ptr
+	and	\$2, %rax
+	or	$r_ptr, %rax		# pack sign and parity
+
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sgn0_pty_mod_384,.-sgn0_pty_mod_384
+
+.globl	sgn0_pty_mod_384x
+.hidden	sgn0_pty_mod_384x
+.type	sgn0_pty_mod_384x,\@function,2,"unwind"
+.align	32
+sgn0_pty_mod_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*6($r_ptr), @acc[0]	# sgn0(a->im)
+	mov	8*7($r_ptr), @acc[1]
+	mov	8*8($r_ptr), @acc[2]
+	mov	8*9($r_ptr), @acc[3]
+	mov	8*10($r_ptr), @acc[4]
+	mov	8*11($r_ptr), @acc[5]
+
+	mov	@acc[0], @acc[6]
+	or	@acc[1], @acc[0]
+	or	@acc[2], @acc[0]
+	or	@acc[3], @acc[0]
+	or	@acc[4], @acc[0]
+	or	@acc[5], @acc[0]
+
+	lea	0($r_ptr), %rax		# sgn0(a->re)
+	xor	$r_ptr, $r_ptr
+	mov	@acc[6], @acc[7]
+	add	@acc[6], @acc[6]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, $r_ptr
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $r_ptr
+
+	mov	@acc[0], 0(%rsp)	# a->im is zero or not
+	not	$r_ptr			# 2*x > p, which means "negative"
+	and	\$1, @acc[7]
+	and	\$2, $r_ptr
+	or	@acc[7], $r_ptr		# pack sign and parity
+
+	mov	8*0(%rax), @acc[0]
+	mov	8*1(%rax), @acc[1]
+	mov	8*2(%rax), @acc[2]
+	mov	8*3(%rax), @acc[3]
+	mov	8*4(%rax), @acc[4]
+	mov	8*5(%rax), @acc[5]
+
+	mov	@acc[0], @acc[6]
+	or	@acc[1], @acc[0]
+	or	@acc[2], @acc[0]
+	or	@acc[3], @acc[0]
+	or	@acc[4], @acc[0]
+	or	@acc[5], @acc[0]
+
+	xor	%rax, %rax
+	mov	@acc[6], @acc[7]
+	add	@acc[6], @acc[6]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, %rax
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, %rax
+
+	mov	0(%rsp), @acc[6]
+
+	not	%rax			# 2*x > p, which means "negative"
+
+	test	@acc[0], @acc[0]
+	cmovz	$r_ptr, @acc[7]		# a->re==0? prty(a->im) : prty(a->re)
+
+	test	@acc[6], @acc[6]
+	cmovnz	$r_ptr, %rax		# a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	\$1, @acc[7]
+	and	\$2, %rax
+	or	@acc[7], %rax		# pack sign and parity
+
+	mov	8(%rsp), %rbx
+.cfi_restore	%rbx
+	mov	16(%rsp), %rbp
+.cfi_restore	%rbp
+	lea	24(%rsp), %rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
+___
+}
+if (0) {
+my $inp = $win64 ? "%rcx" : "%rdi";
+$code.=<<___;
+.globl	nbits_384
+.hidden	nbits_384
+.type	nbits_384,\@abi-omnipotent
+.align	32
+nbits_384:
+	mov	8*5($inp), %r8
+	mov	8*4($inp), %r9
+	mov	8*3($inp), %r10
+	mov	8*2($inp), %r11
+	mov	\$-1, %rdx
+	mov	\$127, %eax
+	bsr	%r8, %r8
+	cmovnz	%rdx,%r9
+	cmovz	%rax,%r8
+	bsr	%r9, %r9
+	cmovnz	%rdx,%r10
+	cmovz	%rax,%r9
+	xor	\$63,%r8
+	bsr	%r10, %r10
+	cmovnz	%rdx, %r11
+	cmovz	%rax, %r10
+	xor	\$63,%r9
+	add	%r8, %r9
+	mov	8*1($inp), %r8
+	bsr	%r11, %r11
+	cmovnz	%rdx, %r8
+	cmovz	%rax, %r11
+	xor	\$63, %r10
+	add	%r9, %r10
+	mov	8*0($inp), %r9
+	bsr	%r8, %r8
+	cmovnz	%rdx, %r9
+	cmovz	%rax, %r8
+	xor	\$63, %r11
+	add	%r10, %r11
+	bsr	%r9, %r9
+	cmovz	%rax, %r9
+	xor	\$63, %r8
+	add	%r11, %r8
+	xor	\$63, %r9
+	add	%r8, %r9
+	mov	\$384, %eax
+	sub	%r9, %rax
+	ret
+.size	nbits_384,.-nbits_384
+___
+}
+
+if (1) {
+my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d")
+                                          : ("%rdi", "%rsi", "%rdx", "%ecx");
+
+sub vec_select {
+my $sz = shift;
+my $half = $sz/2;
+my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3));
+
+$code.=<<___;
+.globl	vec_select_$sz
+.hidden	vec_select_$sz
+.type	vec_select_$sz,\@abi-omnipotent
+.align	32
+vec_select_$sz:
+	movd	$select, %xmm5
+	pxor	%xmm4,%xmm4
+	pshufd	\$0,%xmm5,%xmm5		# broadcast
+	movdqu	($inp1),$xmm0
+	lea	$half($inp1),$inp1
+	pcmpeqd	%xmm4,%xmm5
+	movdqu	($inp2),$xmm1
+	lea	$half($inp2),$inp2
+	pcmpeqd	%xmm5,%xmm4
+	lea	$half($out),$out
+___
+for($i=0; $i<$sz-16; $i+=16) {
+$code.=<<___;
+	pand	%xmm4,$xmm0
+	movdqu	$i+16-$half($inp1),$xmm2
+	pand	%xmm5,$xmm1
+	movdqu	$i+16-$half($inp2),$xmm3
+	por	$xmm1,$xmm0
+	movdqu	$xmm0,$i-$half($out)
+___
+	($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1);
+}
+$code.=<<___;
+	pand	%xmm4,$xmm0
+	pand	%xmm5,$xmm1
+	por	$xmm1,$xmm0
+	movdqu	$xmm0,$i-$half($out)
+	ret
+.size	vec_select_$sz,.-vec_select_$sz
+___
+}
+vec_select(48);
+vec_select(96);
+vec_select(192);
+vec_select(144);
+vec_select(288);
+}
+
+{
+my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi");
+
+$code.=<<___;
+.globl	vec_prefetch
+.hidden	vec_prefetch
+.type	vec_prefetch,\@abi-omnipotent
+.align	32
+vec_prefetch:
+	leaq		-1($inp,$end), $end
+	mov		\$64, %rax
+	xor		%r8, %r8
+	prefetchnta	($inp)
+	lea		($inp,%rax), $inp
+	cmp		$end, $inp
+	cmova		$end, $inp
+	cmova		%r8, %rax
+	prefetchnta	($inp)
+	lea		($inp,%rax), $inp
+	cmp		$end, $inp
+	cmova		$end, $inp
+	cmova		%r8, %rax
+	prefetchnta	($inp)
+	lea		($inp,%rax), $inp
+	cmp		$end, $inp
+	cmova		$end, $inp
+	cmova		%r8, %rax
+	prefetchnta	($inp)
+	lea		($inp,%rax), $inp
+	cmp		$end, $inp
+	cmova		$end, $inp
+	cmova		%r8, %rax
+	prefetchnta	($inp)
+	lea		($inp,%rax), $inp
+	cmp		$end, $inp
+	cmova		$end, $inp
+	cmova		%r8, %rax
+	prefetchnta	($inp)
+	lea		($inp,%rax), $inp
+	cmp		$end, $inp
+	cmova		$end, $inp
+	prefetchnta	($inp)
+	ret
+.size	vec_prefetch,.-vec_prefetch
+___
+}
+print $code;
+close STDOUT;
diff --git a/blst/asm/add_mod_384x384-x86_64.pl b/blst/asm/add_mod_384x384-x86_64.pl
new file mode 100755
index 0000000..6ee3cf8
--- /dev/null
+++ b/blst/asm/add_mod_384x384-x86_64.pl
@@ -0,0 +1,260 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$b_ptr = "%rbx";
+
+# common accumulator layout
+@acc=map("%r$_",(8..15));
+
+############################################################ 384x384 add/sub
+# Double-width addition/subtraction modulo n<<384, as opposed to the
+# naively expected modulo n*n. It works because n<<384 is the actual
+# input boundary condition for Montgomery reduction, not n*n.
+# Just in case, this is duplicated, but only one module is
+# supposed to be linked...
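Put differently, the 768-bit sum is reduced only in its top 384 bits, which keeps the result below n<<384, and that is all the Montgomery reduction consuming it requires. A C sketch of the addition case, with the same 64-bit-limb and unsigned __int128 assumptions as the sketches in the files above (the assembly performs the final selection branchlessly with cmovc):

    #include <stdint.h>

    typedef uint64_t limb_t;

    /* r = a + b over 768 bits, then subtract p<<384 once if the result reached it */
    static void add_mod_384x384_sketch(limb_t r[12], const limb_t a[12],
                                       const limb_t b[12], const limb_t p[6])
    {
        limb_t tmp[6], carry = 0, borrow = 0;

        for (int i = 0; i < 12; i++) {                  /* full-width addition */
            unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
            r[i] = (limb_t)t;
            carry = (limb_t)(t >> 64);
        }
        for (int i = 0; i < 6; i++) {                   /* upper half minus p */
            unsigned __int128 t = (unsigned __int128)r[6 + i] - p[i] - borrow;
            tmp[i] = (limb_t)t;
            borrow = (limb_t)(t >> 64) & 1;
        }
        if (carry | (borrow ^ 1))                       /* upper half >= p */
            for (int i = 0; i < 6; i++)
                r[6 + i] = tmp[i];
    }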
+{
+my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr);	# all registers are affected
+						# except for $n_ptr and $r_ptr
+$code.=<<___;
+.text
+
+.type	__add_mod_384x384,\@abi-omnipotent
+.align	32
+__add_mod_384x384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	8*6($a_ptr), @acc[6]
+
+	add	8*0($b_org), @acc[0]
+	mov	8*7($a_ptr), @acc[7]
+	adc	8*1($b_org), @acc[1]
+	mov	8*8($a_ptr), @acc[8]
+	adc	8*2($b_org), @acc[2]
+	mov	8*9($a_ptr), @acc[9]
+	adc	8*3($b_org), @acc[3]
+	mov	8*10($a_ptr), @acc[10]
+	adc	8*4($b_org), @acc[4]
+	mov	8*11($a_ptr), @acc[11]
+	adc	8*5($b_org), @acc[5]
+	 mov	@acc[0], 8*0($r_ptr)
+	adc	8*6($b_org), @acc[6]
+	 mov	@acc[1], 8*1($r_ptr)
+	adc	8*7($b_org), @acc[7]
+	 mov	@acc[2], 8*2($r_ptr)
+	adc	8*8($b_org), @acc[8]
+	 mov	@acc[4], 8*4($r_ptr)
+	 mov	@acc[6], @acc[0]
+	adc	8*9($b_org), @acc[9]
+	 mov	@acc[3], 8*3($r_ptr)
+	 mov	@acc[7], @acc[1]
+	adc	8*10($b_org), @acc[10]
+	 mov	@acc[5], 8*5($r_ptr)
+	 mov	@acc[8], @acc[2]
+	adc	8*11($b_org), @acc[11]
+	 mov	@acc[9], @acc[3]
+	sbb	$b_org, $b_org
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[7]
+	 mov	@acc[10], @acc[4]
+	sbb	8*2($n_ptr), @acc[8]
+	sbb	8*3($n_ptr), @acc[9]
+	sbb	8*4($n_ptr), @acc[10]
+	 mov	@acc[11], @acc[5]
+	sbb	8*5($n_ptr), @acc[11]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[0], @acc[6]
+	cmovc	@acc[1], @acc[7]
+	cmovc	@acc[2], @acc[8]
+	mov	@acc[6], 8*6($r_ptr)
+	cmovc	@acc[3], @acc[9]
+	mov	@acc[7], 8*7($r_ptr)
+	cmovc	@acc[4], @acc[10]
+	mov	@acc[8], 8*8($r_ptr)
+	cmovc	@acc[5], @acc[11]
+	mov	@acc[9], 8*9($r_ptr)
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	ret
+.size	__add_mod_384x384,.-__add_mod_384x384
+
+.type	__sub_mod_384x384,\@abi-omnipotent
+.align	32
+__sub_mod_384x384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	8*6($a_ptr), @acc[6]
+
+	sub	8*0($b_org), @acc[0]
+	mov	8*7($a_ptr), @acc[7]
+	sbb	8*1($b_org), @acc[1]
+	mov	8*8($a_ptr), @acc[8]
+	sbb	8*2($b_org), @acc[2]
+	mov	8*9($a_ptr), @acc[9]
+	sbb	8*3($b_org), @acc[3]
+	mov	8*10($a_ptr), @acc[10]
+	sbb	8*4($b_org), @acc[4]
+	mov	8*11($a_ptr), @acc[11]
+	sbb	8*5($b_org), @acc[5]
+	 mov	@acc[0], 8*0($r_ptr)
+	sbb	8*6($b_org), @acc[6]
+	 mov	8*0($n_ptr), @acc[0]
+	 mov	@acc[1], 8*1($r_ptr)
+	sbb	8*7($b_org), @acc[7]
+	 mov	8*1($n_ptr), @acc[1]
+	 mov	@acc[2], 8*2($r_ptr)
+	sbb	8*8($b_org), @acc[8]
+	 mov	8*2($n_ptr), @acc[2]
+	 mov	@acc[3], 8*3($r_ptr)
+	sbb	8*9($b_org), @acc[9]
+	 mov	8*3($n_ptr), @acc[3]
+	 mov	@acc[4], 8*4($r_ptr)
+	sbb	8*10($b_org), @acc[10]
+	 mov	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], 8*5($r_ptr)
+	sbb	8*11($b_org), @acc[11]
+	 mov	8*5($n_ptr), @acc[5]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[0]
+	and	$b_org, @acc[1]
+	and	$b_org, @acc[2]
+	and	$b_org, @acc[3]
+	and	$b_org, @acc[4]
+	and	$b_org, @acc[5]
+
+	add	@acc[0], @acc[6]
+	adc	@acc[1], @acc[7]
+	mov	@acc[6], 8*6($r_ptr)
+	adc	@acc[2], @acc[8]
+	mov	@acc[7], 8*7($r_ptr)
+	adc	@acc[3], @acc[9]
+	mov	@acc[8], 8*8($r_ptr)
+	adc	@acc[4], @acc[10]
+	mov	@acc[9], 8*9($r_ptr)
+	adc	@acc[5], @acc[11]
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	ret
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.globl	add_mod_384x384
+.hidden	add_mod_384x384
+.type	add_mod_384x384,\@function,4,"unwind"
+.align	32
+add_mod_384x384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	call	__add_mod_384x384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	add_mod_384x384,.-add_mod_384x384
+
+.globl	sub_mod_384x384
+.hidden	sub_mod_384x384
+.type	sub_mod_384x384,\@function,4,"unwind"
+.align	32
+sub_mod_384x384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	call	__sub_mod_384x384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sub_mod_384x384,.-sub_mod_384x384
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/arm-xlate.pl b/blst/asm/arm-xlate.pl
new file mode 100755
index 0000000..5028a62
--- /dev/null
+++ b/blst/asm/arm-xlate.pl
@@ -0,0 +1,381 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# ARM assembler distiller/adapter by \@dot-asm.
+
+use strict;
+
+################################################################
+# Recognized "flavour"-s are:
+#
+# linux[32|64]	GNU assembler, effectively pass-through
+# ios[32|64]	global symbols' decorations, PIC tweaks, etc.
+# win[32|64]	Visual Studio armasm-specific directives
+# coff[32|64]	e.g. clang --target=arm-windows ...
+#
+my $flavour = shift;
+   $flavour = "linux" if (!$flavour or $flavour eq "void");
+
+my $output = shift;
+open STDOUT,">$output" or die "can't open $output: $!";
+
+my %GLOBALS;
+my $dotinlocallabels = ($flavour !~ /ios/) ? 1 : 0;
+my $in_proc;	# used with 'windows' flavour
+
+################################################################
+# directives which need special treatment on different platforms
+################################################################
+my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch
+my $fpu  = sub { } if ($flavour !~ /linux/);       # omit .fpu
+
+my $rodata = sub {
+    SWITCH: for ($flavour) {
+	/linux/		&& return ".section\t.rodata";
+	/ios/		&& return ".section\t__TEXT,__const";
+	/coff/		&& return ".section\t.rdata,\"dr\"";
+	/win/		&& return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8";
+	last;
+    }
+};
+
+my $hidden = sub {
+    if ($flavour =~ /ios/)	{ ".private_extern\t".join(',',@_); }
+} if ($flavour !~ /linux/);
+
+my $comm = sub {
+    my @args = split(/,\s*/,shift);
+    my $name = @args[0];
+    my $global = \$GLOBALS{$name};
+    my $ret;
+
+    if ($flavour =~ /ios32/)	{
+	$ret = ".comm\t_$name,@args[1]\n";
+	$ret .= ".non_lazy_symbol_pointer\n";
+	$ret .= "$name:\n";
+	$ret .= ".indirect_symbol\t_$name\n";
+	$ret .= ".long\t0\n";
+	$ret .= ".previous";
+	$name = "_$name";
+    } elsif ($flavour =~ /win/) {
+	$ret = "\tCOMMON\t|$name|,@args[1]";
+    } elsif ($flavour =~ /coff/) {
+	$ret = ".comm\t$name,@args[1]";
+    } else {
+	$ret = ".comm\t".join(',',@args);
+    }
+
+    $$global = $name;
+    $ret;
+};
+
+my $globl = sub {
+    my $name = shift;
+    my $global = \$GLOBALS{$name};
+    my $ret;
+
+    SWITCH: for ($flavour) {
+	/ios/		&& do { $name = "_$name"; last; };
+	/win/		&& do { $ret = ""; last; };
+    }
+
+    $ret = ".globl	$name" if (!defined($ret));
+    $$global = $name;
+    $ret;
+};
+my $global = $globl;
+
+my $extern = sub {
+    &$globl(@_);
+    if ($flavour =~ /win/) {
+	return "\tEXTERN\t@_";
+    }
+    return;	# return nothing
+};
+
+my $type = sub {
+    my $arg = join(',',@_);
+    my $ret;
+
+    SWITCH: for ($flavour) {
+	/ios32/		&& do { if ($arg =~ /(\w+),\s*%function/) {
+				    $ret = "#ifdef __thumb2__\n" .
+					   ".thumb_func	$1\n" .
+					   "#endif";
+				}
+				last;
+			      };
+	/win/		&& do { if ($arg =~ /(\w+),\s*%(function|object)/) {
+				    my $type = "[DATA]";
+				    if ($2 eq "function") {
+					$in_proc = $1;
+					$type = "[FUNC]";
+				    }
+				    $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type"
+							: "";
+				}
+				last;
+			      };
+	/coff/		&& do { if ($arg =~ /(\w+),\s*%function/) {
+				    $ret = ".def	$1;\n".
+					   ".type	32;\n".
+					   ".endef";
+				}
+				last;
+			      };
+    }
+    return $ret;
+} if ($flavour !~ /linux/);
+
+my $size = sub {
+    if ($in_proc && $flavour =~ /win/) {
+	$in_proc = undef;
+	return "\tENDP";
+    }
+} if ($flavour !~ /linux/);
+
+my $inst = sub {
+    if ($flavour =~ /win/)	{ "\tDCDU\t".join(',',@_); }
+    else			{ ".long\t".join(',',@_);  }
+} if ($flavour !~ /linux/);
+
+my $asciz = sub {
+    my $line = join(",",@_);
+    if ($line =~ /^"(.*)"$/)
+    {	if ($flavour =~ /win/) {
+	    "\tDCB\t$line,0\n\tALIGN\t4";
+	} else {
+	    ".byte	" . join(",",unpack("C*",$1),0) . "\n.align	2";
+	}
+    } else {	"";	}
+};
+
+my $align = sub {
+    "\tALIGN\t".2**@_[0];
+} if ($flavour =~ /win/);
+   $align = sub {
+    ".p2align\t".@_[0];
+} if ($flavour =~ /coff/);
+
+my $byte = sub {
+    "\tDCB\t".join(',',@_);
+} if ($flavour =~ /win/);
+
+my $short = sub {
+    "\tDCWU\t".join(',',@_);
+} if ($flavour =~ /win/);
+
+my $word = sub {
+    "\tDCDU\t".join(',',@_);
+} if ($flavour =~ /win/);
+
+my $long = $word if ($flavour =~ /win/);
+
+my $quad = sub {
+    "\tDCQU\t".join(',',@_);
+} if ($flavour =~ /win/);
+
+my $skip = sub {
+    "\tSPACE\t".shift;
+} if ($flavour =~ /win/);
+
+my $code = sub {
+    "\tCODE@_[0]";
+} if ($flavour =~ /win/);
+
+my $thumb = sub {	# .thumb should appear prior to .text in source
+    "# define ARM THUMB\n" .
+    "\tTHUMB";
+} if ($flavour =~ /win/);
+
+my $text = sub {
+    "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM");
+} if ($flavour =~ /win/);
+
+my $syntax = sub {} if ($flavour =~ /win/);	# omit .syntax
+
+my $rva = sub {
+    # .rva directive comes in handy only on 32-bit Windows, i.e. it can
+    # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections.
+    # However, the corresponding compilers don't seem to rely on PIC,
+    # which raises the question of why an assembler programmer would
+    # have to jump through these hoops at all. But just in case, it
+    # would go as follows:
+    #
+    #	ldr	r1,.LOPENSSL_armcap
+    #	ldr	r2,.LOPENSSL_armcap+4
+    #	adr	r0,.LOPENSSL_armcap
+    #	bic	r1,r1,#1		; de-thumb-ify link.exe's ideas
+    #	sub	r0,r0,r1		; r0 is image base now
+    #	ldr	r0,[r0,r2]
+    #	...
+    #.LOPENSSL_armcap:
+    #	.rva	.LOPENSSL_armcap	; self-reference
+    #	.rva	OPENSSL_armcap_P	; real target
+    #
+    # Non-position-independent [and ISA-neutral] alternative is so much
+    # simpler:
+    #
+    #	ldr	r0,.LOPENSSL_armcap
+    #	ldr	r0,[r0]
+    #	...
+    #.LOPENSSL_armcap:
+    #	.long	OPENSSL_armcap_P
+    #
+    "\tDCDU\t@_[0]\n\tRELOC\t2"
+} if ($flavour =~ /win(?!64)/);
+
+################################################################
+# some broken instructions in Visual Studio armasm[64]...
+
+my $it = sub {} if ($flavour =~ /win32/);	# omit 'it'
+
+my $ext = sub {
+    "\text8\t".join(',',@_);
+} if ($flavour =~ /win64/);
+
+my $csel = sub {
+    my ($args,$comment) = split(m|\s*//|,shift);
+    my @regs = split(m|,\s*|,$args);
+    my $cond = pop(@regs);
+
+    "\tcsel$cond\t".join(',',@regs);
+} if ($flavour =~ /win64/);
+
+my $csetm = sub {
+    my ($args,$comment) = split(m|\s*//|,shift);
+    my @regs = split(m|,\s*|,$args);
+    my $cond = pop(@regs);
+
+    "\tcsetm$cond\t".join(',',@regs);
+} if ($flavour =~ /win64/);
+
+# ... then conditional branch instructions are also broken, but
+# maintaining all the variants is tedious, so I kludge-fix it
+# elsewhere...
+################################################################
+my $adrp = sub {
+    my ($args,$comment) = split(m|\s*//|,shift);
+    "\tadrp\t$args\@PAGE";
+} if ($flavour =~ /ios64/);
+
+my $paciasp = sub {
+    ($flavour =~ /linux/) ? "\t.inst\t0xd503233f"
+                          : &$inst(0xd503233f);
+};
+
+my $autiasp = sub {
+    ($flavour =~ /linux/) ? "\t.inst\t0xd50323bf"
+                          : &$inst(0xd50323bf);
+};
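+# Note: 0xd503233f and 0xd50323bf are the raw encodings of PACIASP and
+# AUTIASP; emitting them as data keeps older assemblers happy, and since
+# both encodings sit in the HINT space they execute as NOPs on cores
+# without pointer authentication.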
+
+sub range {
+  my ($r,$sfx,$start,$end) = @_;
+
+    join(",",map("$r$_$sfx",($start..$end)));
+}
+
+sub expand_line {
+  my $line = shift;
+  my @ret = ();
+
+    pos($line)=0;
+
+    while ($line =~ m/\G[^@\/\{\"]*/g) {
+	if ($line =~ m/\G(@|\/\/|$)/gc) {
+	    last;
+	}
+	elsif ($line =~ m/\G\{/gc) {
+	    my $saved_pos = pos($line);
+	    $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
+	    pos($line) = $saved_pos;
+	    $line =~ m/\G[^\}]*\}/g;
+	}
+	elsif ($line =~ m/\G\"/gc) {
+	    $line =~ m/\G[^\"]*\"/g;
+	}
+    }
+
+    $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
+
+    if ($flavour =~ /win/) {
+	# adjust alignment hints, "[rN,:32]" -> "[rN@32]"
+	$line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/;
+	# adjust local labels, ".Lwhatever" -> "|$Lwhatever|"
+	$line =~ s/\.(L\w{2,})/|\$$1|/g;
+	# omit "#:lo12:" on win64
+	$line =~ s/#:lo12://;
+    } elsif ($flavour =~ /coff(?!64)/) {
+	$line =~ s/\.L(\w{2,})/(\$ML$1)/g;
+    } elsif ($flavour =~ /ios64/) {
+	$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
+    }
+
+    return $line;
+}
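+
+# A few hypothetical argument strings and what expand_line turns them into:
+#   "{v0.16b-v3.16b}"          -> "{v0.16b,v1.16b,v2.16b,v3.16b}"  (register ranges)
+#   "[sp,:64]"  (win flavours) -> "[sp@64]"                        (alignment hints)
+#   "#:lo12:sym"               -> "sym" on win64, "sym\@PAGEOFF" on ios64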
+
+while(my $line=<>) {
+
+    # fix up assembler-specific commentary delimiter
+    $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/);
+
+    if ($line =~ m/^\s*(#|@|;|\/\/)/)	{ print $line; next; }
+
+    $line =~ s|/\*.*\*/||;	# get rid of C-style comments...
+    $line =~ s|^\s+||;		# ... and skip white spaces in beginning...
+    $line =~ s|\s+$||;		# ... and at the end
+
+    {
+	$line =~ s|[\b\.]L(\w{2,})|L$1|g;	# common denominator for local labels
+	$line =~ s|\bL(\w{2,})|\.L$1|g	if ($dotinlocallabels);
+    }
+
+    {
+	$line =~ s|(^[\.\w]+)\:\s*||;
+	my $label = $1;
+	if ($label) {
+	    $label = ($GLOBALS{$label} or $label);
+	    if ($flavour =~ /win/) {
+		$label =~ s|^\.L(?=\w)|\$L|;
+		printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : "");
+	    } else {
+		$label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/);
+		printf "%s:", $label;
+	    }
+	}
+    }
+
+    if ($line !~ m/^[#@;]/) {
+	$line =~ s|^\s*(\.?)(\S+)\s*||;
+	my $c = $1; $c = "\t" if ($c eq "");
+	my $mnemonic = $2;
+	my $opcode;
+	if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
+	    $opcode = eval("\$$1_$2");
+	} else {
+	    $opcode = eval("\$$mnemonic");
+	}
+
+	my $arg=expand_line($line);
+
+	if (ref($opcode) eq 'CODE') {
+	    $line = &$opcode($arg);
+	} elsif ($mnemonic)         {
+	    if ($flavour =~ /win64/) {
+		# "b.cond" -> "bcond", kludge-fix:-(
+		$mnemonic =~ s/^b\.([a-z]{2}$)/b$1/;
+	    }
+	    $line = $c.$mnemonic;
+	    $line.= "\t$arg" if ($arg ne "");
+	}
+    }
+
+    print $line if ($line);
+    print "\n";
+}
+
+print "\tEND\n" if ($flavour =~ /win/);
+
+close STDOUT;
diff --git a/blst/asm/ct_inverse_mod_256-armv8.pl b/blst/asm/ct_inverse_mod_256-armv8.pl
new file mode 100755
index 0000000..ced8c6c
--- /dev/null
+++ b/blst/asm/ct_inverse_mod_256-armv8.pl
@@ -0,0 +1,586 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. ~4,600 cycles on Apple M1 and ~8,900
+# on Cortex-A57.
+#
+# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
+#                                                       const vec256 modx);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_256(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 31
+    mask = (1 << k) - 1
+
+    for i in range(0, 512 // k - 1):
+        # __ab_approximation_31
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-k-2)) << k)
+            b_ = (b & mask) | ((b >> (n-k-2)) << k)
+
+        # __inner_loop_31
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smul_256_n_shift_by_31
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smul_512x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 512 % k + k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 512 % k + k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    mod <<= 512 - mod.bit_length()  # align to the left
+    if v < 0:
+        v += mod
+    if v < 0:
+        v += mod
+    elif v == 1<<512:
+        v -= mod
+
+    return v & (2**512 - 1) # to be reduced % mod
+___
+
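+# Sanity note on the schedule above: with k = 31 the main loop runs
+# 512//31 - 1 = 15 times and the tail performs another 512%31 + 31 = 47
+# division steps.  The assembly below matches this: two unrolled 31-step
+# passes plus the $i=2..14 loop make 15, followed by one 47-step call to
+# __inner_loop_62_256.
+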
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
+my @acc=map("x$_",(4..11));
+my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17));
+my $cnt = $n_ptr;
+my @t = map("x$_",(19..26));
+my ($a_lo, $b_lo) = @acc[3,7];
+
+$frame = 16+2*512;
+
+$code.=<<___;
+.text
+
+.globl	ct_inverse_mod_256
+.type	ct_inverse_mod_256, %function
+.align	5
+ct_inverse_mod_256:
+	paciasp
+	stp	x29, x30, [sp,#-80]!
+	add	x29, sp, #0
+	stp	x19, x20, [sp,#16]
+	stp	x21, x22, [sp,#32]
+	stp	x23, x24, [sp,#48]
+	stp	x25, x26, [sp,#64]
+	sub	sp, sp, #$frame
+
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*0]
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2]
+
+	add	$in_ptr, sp, #16+511	// find closest 512-byte-aligned spot
+	and	$in_ptr, $in_ptr, #-512	// in the frame...
+	str	$out_ptr, [sp]
+
+	ldp	@acc[4], @acc[5], [$n_ptr,#8*0]
+	ldp	@acc[6], @acc[7], [$n_ptr,#8*2]
+
+	stp	@acc[0], @acc[1], [$in_ptr,#8*0]	// copy input to |a|
+	stp	@acc[2], @acc[3], [$in_ptr,#8*2]
+	stp	@acc[4], @acc[5], [$in_ptr,#8*4]	// copy modulus to |b|
+	stp	@acc[6], @acc[7], [$in_ptr,#8*6]
+
+	////////////////////////////////////////// first iteration
+	bl	.Lab_approximation_31_256_loaded
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	str	$f0,[$out_ptr,#8*8]		// initialize |u| with |f0|
+
+	mov	$f0, $f1			// |f1|
+	mov	$g0, $g1			// |g1|
+	add	$out_ptr, $out_ptr, #8*4	// pointer to dst |b|
+	bl	__smul_256_n_shift_by_31
+	str	$f0, [$out_ptr,#8*9]		// initialize |v| with |f1|
+
+	////////////////////////////////////////// second iteration
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	$f_, $f0			// corrected |f0|
+	mov	$g_, $g0			// corrected |g0|
+
+	mov	$f0, $f1			// |f1|
+	mov	$g0, $g1			// |g1|
+	add	$out_ptr, $out_ptr, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	ldr	@acc[4], [$in_ptr,#8*8]		// |u|
+	ldr	@acc[5], [$in_ptr,#8*13]	// |v|
+	madd	@acc[0], $f_, @acc[4], xzr	// |u|*|f0|
+	madd	@acc[0], $g_, @acc[5], @acc[0]	// |v|*|g0|
+	str	@acc[0], [$out_ptr,#8*4]
+	asr	@acc[1], @acc[0], #63		// sign extension
+	stp	@acc[1], @acc[1], [$out_ptr,#8*5]
+	stp	@acc[1], @acc[1], [$out_ptr,#8*7]
+
+	madd	@acc[0], $f0, @acc[4], xzr	// |u|*|f1|
+	madd	@acc[0], $g0, @acc[5], @acc[0]	// |v|*|g1|
+	str	@acc[0], [$out_ptr,#8*9]
+	asr	@acc[1], @acc[0], #63		// sign extension
+	stp	@acc[1], @acc[1], [$out_ptr,#8*10]
+	stp	@acc[1], @acc[1], [$out_ptr,#8*12]
+___
+for($i=2; $i<15; $i++) {
+$code.=<<___;
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	$f_, $f0			// corrected |f0|
+	mov	$g_, $g0			// corrected |g0|
+
+	mov	$f0, $f1			// |f1|
+	mov	$g0, $g1			// |g1|
+	add	$out_ptr, $out_ptr, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	$out_ptr, $out_ptr, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	@t[3], @t[3], @t[4]
+	str	@t[3], [$out_ptr,#8*4]
+
+	mov	$f_, $f0			// corrected |f1|
+	mov	$g_, $g0			// corrected |g1|
+	add	$out_ptr, $out_ptr, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+___
+$code.=<<___	if ($i>7);
+	bl	__smul_512x63_tail
+___
+$code.=<<___	if ($i<=7);
+	adc	@t[3], @t[3], @t[4]
+	stp	@t[3], @t[3], [$out_ptr,#8*4]
+	stp	@t[3], @t[3], [$out_ptr,#8*6]
+___
+}
+$code.=<<___;
+	////////////////////////////////////////// two[!] last iterations
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	mov	$cnt, #47			// 31 + 512 % 31
+	//bl	__ab_approximation_62_256	// |a| and |b| are exact,
+	ldr	$a_lo, [$in_ptr,#8*0]		// just load
+	ldr	$b_lo, [$in_ptr,#8*4]
+	bl	__inner_loop_62_256
+
+	mov	$f_, $f1
+	mov	$g_, $g1
+	ldr	$out_ptr, [sp]			// original out_ptr
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	ldr	x30, [x29,#8]
+
+	smulh	@t[1], @acc[3], $g_		// figure out top-most limb
+	ldp	@acc[4], @acc[5], [$nx_ptr,#8*0]
+	adc	@t[4], @t[4], @t[6]
+	ldp	@acc[6], @acc[7], [$nx_ptr,#8*2]
+
+	add	@t[1], @t[1], @t[4]		// @t[1] is 1, 0 or -1
+	asr	@t[0], @t[1], #63		// sign as mask
+
+	and	@t[4],   @acc[4], @t[0]		// add mod<<256 conditionally
+	and	@t[5],   @acc[5], @t[0]
+	adds	@acc[0], @acc[0], @t[4]
+	and	@t[6],   @acc[6], @t[0]
+	adcs	@acc[1], @acc[1], @t[5]
+	and	@t[7],   @acc[7], @t[0]
+	adcs	@acc[2], @acc[2], @t[6]
+	adcs	@acc[3], @t[3],   @t[7]
+	adc	@t[1], @t[1], xzr		// @t[1] is 1, 0 or -1
+
+	neg	@t[0], @t[1]
+	orr	@t[1], @t[1], @t[0]		// excess bit or sign as mask
+	asr	@t[0], @t[0], #63		// excess bit as mask
+
+	and	@acc[4], @acc[4], @t[1]		// mask |mod|
+	and	@acc[5], @acc[5], @t[1]
+	and	@acc[6], @acc[6], @t[1]
+	and	@acc[7], @acc[7], @t[1]
+
+	eor	@acc[4], @acc[4], @t[0]		// conditionally negate |mod|
+	eor	@acc[5], @acc[5], @t[0]
+	adds	@acc[4], @acc[4], @t[0], lsr#63
+	eor	@acc[6], @acc[6], @t[0]
+	adcs	@acc[5], @acc[5], xzr
+	eor	@acc[7], @acc[7], @t[0]
+	adcs	@acc[6], @acc[6], xzr
+	adc	@acc[7], @acc[7], xzr
+
+	adds	@acc[0], @acc[0], @acc[4]	// final adjustment for |mod|<<256
+	adcs	@acc[1], @acc[1], @acc[5]
+	adcs	@acc[2], @acc[2], @acc[6]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*4]
+	adc	@acc[3], @acc[3], @acc[7]
+	stp	@acc[2], @acc[3], [$out_ptr,#8*6]
+
+	add	sp, sp, #$frame
+	ldp	x19, x20, [x29,#16]
+	ldp	x21, x22, [x29,#32]
+	ldp	x23, x24, [x29,#48]
+	ldp	x25, x26, [x29,#64]
+	ldr	x29, [sp],#80
+	autiasp
+	ret
+.size	ct_inverse_mod_256,.-ct_inverse_mod_256
+
+////////////////////////////////////////////////////////////////////////
+.type	__smul_256x63, %function
+.align	5
+__smul_256x63:
+___
+for($j=0; $j<2; $j++) {
+my $f_ = $f_;   $f_ = $g_          if ($j);
+my @acc = @acc; @acc = @acc[4..7]  if ($j);
+my $k = 8*8+8*5*$j;
+$code.=<<___;
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |u| (or |v|)
+	asr	$f1, $f_, #63		// |f_|'s sign as mask (or |g_|'s)
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
+	eor	$f_, $f_, $f1		// conditionally negate |f_| (or |g_|)
+	ldr	@t[3+$j], [$in_ptr,#8*4+$k]
+
+	eor	@acc[0], @acc[0], $f1	// conditionally negate |u| (or |v|)
+	sub	$f_, $f_, $f1
+	eor	@acc[1], @acc[1], $f1
+	adds	@acc[0], @acc[0], $f1, lsr#63
+	eor	@acc[2], @acc[2], $f1
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], $f1
+	adcs	@acc[2], @acc[2], xzr
+	eor	@t[3+$j], @t[3+$j], $f1
+	 umulh	@t[0], @acc[0], $f_
+	adcs	@acc[3], @acc[3], xzr
+	 umulh	@t[1], @acc[1], $f_
+	adcs	@t[3+$j], @t[3+$j], xzr
+	 umulh	@t[2], @acc[2], $f_
+___
+$code.=<<___	if ($j!=0);
+	adc	$g1, xzr, xzr		// used in __smul_512x63_tail
+___
+$code.=<<___;
+	mul	@acc[0], @acc[0], $f_
+	 cmp	$f_, #0
+	mul	@acc[1], @acc[1], $f_
+	 csel	@t[3+$j], @t[3+$j], xzr, ne
+	mul	@acc[2], @acc[2], $f_
+	adds	@acc[1], @acc[1], @t[0]
+	mul	@t[5+$j], @acc[3], $f_
+	adcs	@acc[2], @acc[2], @t[1]
+	adcs	@t[5+$j], @t[5+$j], @t[2]
+___
+$code.=<<___	if ($j==0);
+	adc	@t[7], xzr, xzr
+___
+}
+$code.=<<___;
+	adc	@t[7], @t[7], xzr
+
+	adds	@acc[0], @acc[0], @acc[4]
+	adcs	@acc[1], @acc[1], @acc[5]
+	adcs	@acc[2], @acc[2], @acc[6]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
+	adcs	@t[5],   @t[5],   @t[6]
+	stp	@acc[2], @t[5], [$out_ptr,#8*2]
+
+	ret
+.size	__smul_256x63,.-__smul_256x63
+
+.type	__smul_512x63_tail, %function
+.align	5
+__smul_512x63_tail:
+	umulh	@t[5], @acc[3], $f_
+	ldp	@acc[1], @acc[2], [$in_ptr,#8*18]	// load rest of |v|
+	adc	@t[7], @t[7], xzr
+	ldr	@acc[3], [$in_ptr,#8*20]
+	and	@t[3], @t[3], $f_
+
+	umulh	@acc[7], @acc[7], $g_	// resume |v|*|g1| chain
+
+	sub	@t[5], @t[5], @t[3]	// tie up |u|*|f1| chain
+	asr	@t[6], @t[5], #63
+
+	eor	@acc[1], @acc[1], $f1	// conditionally negate rest of |v|
+	eor	@acc[2], @acc[2], $f1
+	adds	@acc[1], @acc[1], $g1
+	eor	@acc[3], @acc[3], $f1
+	adcs	@acc[2], @acc[2], xzr
+	 umulh	@t[0], @t[4],   $g_
+	adc	@acc[3], @acc[3], xzr
+	 umulh	@t[1], @acc[1], $g_
+	add	@acc[7], @acc[7], @t[7]
+	 umulh	@t[2], @acc[2], $g_
+
+	mul	@acc[0], @t[4],   $g_
+	mul	@acc[1], @acc[1], $g_
+	adds	@acc[0], @acc[0], @acc[7]
+	mul	@acc[2], @acc[2], $g_
+	adcs	@acc[1], @acc[1], @t[0]
+	mul	@t[3],   @acc[3], $g_
+	adcs	@acc[2], @acc[2], @t[1]
+	adcs	@t[3],   @t[3],   @t[2]
+	adc	@t[4], xzr, xzr		// used in the final step
+
+	adds	@acc[0], @acc[0], @t[5]
+	adcs	@acc[1], @acc[1], @t[6]
+	adcs	@acc[2], @acc[2], @t[6]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*4]
+	adcs	@t[3],   @t[3],   @t[6]	// carry is used in the final step
+	stp	@acc[2], @t[3],   [$out_ptr,#8*6]
+
+	ret
+.size	__smul_512x63_tail,.-__smul_512x63_tail
+
+.type	__smul_256_n_shift_by_31, %function
+.align	5
+__smul_256_n_shift_by_31:
+___
+for($j=0; $j<2; $j++) {
+my $f0 = $f0;   $f0 = $g0           if ($j);
+my @acc = @acc; @acc = @acc[4..7]   if ($j);
+my $k = 8*4*$j;
+$code.=<<___;
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |a| (or |b|)
+	asr	@t[5], $f0, #63		// |f0|'s sign as mask (or |g0|'s)
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
+	eor	@t[6], $f0, @t[5]	// conditionally negate |f0| (or |g0|)
+
+	eor	@acc[0], @acc[0], @t[5]	// conditionally negate |a| (or |b|)
+	sub	@t[6], @t[6], @t[5]
+	eor	@acc[1], @acc[1], @t[5]
+	adds	@acc[0], @acc[0], @t[5], lsr#63
+	eor	@acc[2], @acc[2], @t[5]
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], @t[5]
+	 umulh	@t[0], @acc[0], @t[6]
+	adcs	@acc[2], @acc[2], xzr
+	 umulh	@t[1], @acc[1], @t[6]
+	adc	@acc[3], @acc[3], xzr
+	 umulh	@t[2], @acc[2], @t[6]
+	and	@t[5], @t[5], @t[6]
+	 umulh	@t[3+$j], @acc[3], @t[6]
+	neg	@t[5], @t[5]
+
+	mul	@acc[0], @acc[0], @t[6]
+	mul	@acc[1], @acc[1], @t[6]
+	mul	@acc[2], @acc[2], @t[6]
+	adds	@acc[1], @acc[1], @t[0]
+	mul	@acc[3], @acc[3], @t[6]
+	adcs	@acc[2], @acc[2], @t[1]
+	adcs	@acc[3], @acc[3], @t[2]
+	adc	@t[3+$j], @t[3+$j], @t[5]
+___
+}
+$code.=<<___;
+	adds	@acc[0], @acc[0], @acc[4]
+	adcs	@acc[1], @acc[1], @acc[5]
+	adcs	@acc[2], @acc[2], @acc[6]
+	adcs	@acc[3], @acc[3], @acc[7]
+	adc	@acc[4], @t[3],   @t[4]
+
+	extr	@acc[0], @acc[1], @acc[0], #31
+	extr	@acc[1], @acc[2], @acc[1], #31
+	extr	@acc[2], @acc[3], @acc[2], #31
+	asr	@t[4], @acc[4], #63	// result's sign as mask
+	extr	@acc[3], @acc[4], @acc[3], #31
+
+	eor	@acc[0], @acc[0], @t[4]	// ensure the result is positive
+	eor	@acc[1], @acc[1], @t[4]
+	adds	@acc[0], @acc[0], @t[4], lsr#63
+	eor	@acc[2], @acc[2], @t[4]
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], @t[4]
+	adcs	@acc[2], @acc[2], xzr
+	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
+	adc	@acc[3], @acc[3], xzr
+	stp	@acc[2], @acc[3], [$out_ptr,#8*2]
+
+	eor	$f0, $f0, @t[4]		// adjust |f/g| accordingly
+	eor	$g0, $g0, @t[4]
+	sub	$f0, $f0, @t[4]
+	sub	$g0, $g0, @t[4]
+
+	ret
+.size	__smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
+___
+
+{
+my @a = @acc[0..3];
+my @b = @acc[4..7];
+my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]);
+
+$code.=<<___;
+.type	__ab_approximation_31_256, %function
+.align	4
+__ab_approximation_31_256:
+	ldp	@a[2], @a[3], [$in_ptr,#8*2]
+	ldp	@b[2], @b[3], [$in_ptr,#8*6]
+	ldp	@a[0], @a[1], [$in_ptr,#8*0]
+	ldp	@b[0], @b[1], [$in_ptr,#8*4]
+
+.Lab_approximation_31_256_loaded:
+	orr	@t[0], @a[3], @b[3]	// check top-most limbs, ...
+	cmp	@t[0], #0
+	csel	@a[3], @a[3], @a[2], ne
+	csel	@b[3], @b[3], @b[2], ne
+	csel	@a[2], @a[2], @a[1], ne
+	orr	@t[0], @a[3], @b[3]	// and ones before top-most, ...
+	csel	@b[2], @b[2], @b[1], ne
+
+	cmp	@t[0], #0
+	csel	@a[3], @a[3], @a[2], ne
+	csel	@b[3], @b[3], @b[2], ne
+	csel	@a[2], @a[2], @a[0], ne
+	orr	@t[0], @a[3], @b[3]	// and one more, ...
+	csel	@b[2], @b[2], @b[0], ne
+
+	clz	@t[0], @t[0]
+	cmp	@t[0], #64
+	csel	@t[0], @t[0], xzr, ne
+	csel	@a[3], @a[3], @a[2], ne
+	csel	@b[3], @b[3], @b[2], ne
+	neg	@t[1], @t[0]
+
+	lslv	@a[3], @a[3], @t[0]	// align high limbs to the left
+	lslv	@b[3], @b[3], @t[0]
+	lsrv	@a[2], @a[2], @t[1]
+	lsrv	@b[2], @b[2], @t[1]
+	and	@a[2], @a[2], @t[1], asr#6
+	and	@b[2], @b[2], @t[1], asr#6
+	orr	$a_lo, @a[3], @a[2]
+	orr	$b_lo, @b[3], @b[2]
+
+	bfxil	$a_lo, @a[0], #0, #31
+	bfxil	$b_lo, @b[0], #0, #31
+
+	b	__inner_loop_31_256
+	ret
+.size	__ab_approximation_31_256,.-__ab_approximation_31_256
+
+.type	__inner_loop_31_256, %function
+.align	4
+__inner_loop_31_256:
+	mov	$cnt, #31
+	mov	$fg0, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
+	mov	$fg1, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
+	mov	$bias,#0x7FFFFFFF7FFFFFFF
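+
+	// An informal note on the packed representation: |f0| and |g0| live in
+	// the low and high 32-bit halves of $fg0 (likewise |f1|/|g1| in $fg1),
+	// each half offset by 0x7FFFFFFF so it never goes negative.  The
+	// subtraction of $fg1 below cancels the offsets, hence the $bias
+	// re-addition; doubling $fg1 doubles its offsets, hence one $bias is
+	// subtracted back.  The offsets are stripped after the loop.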
+
+.Loop_31_256:
+	sbfx	@t[3], $a_lo, #0, #1	// if |a_| is odd, then we'll be subtracting
+	sub	$cnt, $cnt, #1
+	and	@t[0], $b_lo, @t[3]
+	sub	@t[1], $b_lo, $a_lo	// |b_|-|a_|
+	subs	@t[2], $a_lo, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	mov	@t[0], $fg1
+	csel	$b_lo, $b_lo, $a_lo, hs	// |b_| = |a_|
+	csel	$a_lo, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	$fg1, $fg1, $fg0,    hs	// exchange |fg0| and |fg1|
+	csel	$fg0, $fg0, @t[0],   hs
+	lsr	$a_lo, $a_lo, #1
+	and	@t[0], $fg1, @t[3]
+	and	@t[1], $bias, @t[3]
+	sub	$fg0, $fg0, @t[0]	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	add	$fg1, $fg1, $fg1	// |f1|<<=1
+	add	$fg0, $fg0, @t[1]
+	sub	$fg1, $fg1, $bias
+	cbnz	$cnt, .Loop_31_256
+
+	mov	$bias, #0x7FFFFFFF
+	ubfx	$f0, $fg0, #0, #32
+	ubfx	$g0, $fg0, #32, #32
+	ubfx	$f1, $fg1, #0, #32
+	ubfx	$g1, $fg1, #32, #32
+	sub	$f0, $f0, $bias		// remove bias
+	sub	$g0, $g0, $bias
+	sub	$f1, $f1, $bias
+	sub	$g1, $g1, $bias
+
+	ret
+.size	__inner_loop_31_256,.-__inner_loop_31_256
+
+.type	__inner_loop_62_256, %function
+.align	4
+__inner_loop_62_256:
+	mov	$f0, #1		// |f0|=1
+	mov	$g0, #0		// |g0|=0
+	mov	$f1, #0		// |f1|=0
+	mov	$g1, #1		// |g1|=1
+
+.Loop_62_256:
+	sbfx	@t[3], $a_lo, #0, #1	// if |a_| is odd, then we'll be subtracting
+	sub	$cnt, $cnt, #1
+	and	@t[0], $b_lo, @t[3]
+	sub	@t[1], $b_lo, $a_lo	// |b_|-|a_|
+	subs	@t[2], $a_lo, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	mov	@t[0], $f0
+	csel	$b_lo, $b_lo, $a_lo, hs	// |b_| = |a_|
+	csel	$a_lo, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	mov	@t[1], $g0
+	csel	$f0, $f0, $f1,       hs	// exchange |f0| and |f1|
+	csel	$f1, $f1, @t[0],     hs
+	csel	$g0, $g0, $g1,       hs	// exchange |g0| and |g1|
+	csel	$g1, $g1, @t[1],     hs
+	lsr	$a_lo, $a_lo, #1
+	and	@t[0], $f1, @t[3]
+	and	@t[1], $g1, @t[3]
+	add	$f1, $f1, $f1		// |f1|<<=1
+	add	$g1, $g1, $g1		// |g1|<<=1
+	sub	$f0, $f0, @t[0]		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	$g0, $g0, @t[1]		// |g0|-=|g1| (or |g0-=0| ...)
+	cbnz	$cnt, .Loop_62_256
+
+	ret
+.size	__inner_loop_62_256,.-__inner_loop_62_256
+___
+}
+
+foreach(split("\n",$code)) {
+    s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
+    print $_,"\n";
+}
+close STDOUT;
diff --git a/blst/asm/ct_inverse_mod_256-x86_64.pl b/blst/asm/ct_inverse_mod_256-x86_64.pl
new file mode 100755
index 0000000..24ab545
--- /dev/null
+++ b/blst/asm/ct_inverse_mod_256-x86_64.pl
@@ -0,0 +1,837 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. ~5,300 cycles on Coffee Lake.
+#
+# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
+#                                                       const vec256 modx);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_256(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 31
+    mask = (1 << k) - 1
+
+    for i in range(0, 512 // k - 1):
+        # __ab_approximation_31
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-k-2)) << k)
+            b_ = (b & mask) | ((b >> (n-k-2)) << k)
+
+        # __inner_loop_31
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smulq_256_n_shift_by_31
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smulq_512x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 512 % k + k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 512 % k + k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    mod <<= 512 - mod.bit_length()  # align to the left
+    if v < 0:
+        v += mod
+    if v < 0:
+        v += mod
+    elif v == 1<<512:
+        v -= mod
+
+    return v & (2**512 - 1) # to be reduced % mod
+___
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc = map("%r$_",(8..15));
+my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
+my $cnt = "%edx";
+
+$frame = 8*6+2*512;
+
+$code.=<<___;
+.text
+
+.globl	ct_inverse_mod_256
+.type	ct_inverse_mod_256,\@function,4,"unwind"
+.align	32
+ct_inverse_mod_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	lea	8*6+511(%rsp), %rax	# find closest 512-byte-aligned spot
+	and	\$-512, %rax		# in the frame...
+	mov	$out_ptr, 8*4(%rsp)
+	mov	$nx_ptr,  8*5(%rsp)
+
+	mov	8*0($in_ptr), @acc[0]	# load input
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+
+	mov	8*0($n_ptr), @acc[4]	# load modulus
+	mov	8*1($n_ptr), @acc[5]
+	mov	8*2($n_ptr), @acc[6]
+	mov	8*3($n_ptr), @acc[7]
+
+	mov	@acc[0], 8*0(%rax)	# copy input to |a|
+	mov	@acc[1], 8*1(%rax)
+	mov	@acc[2], 8*2(%rax)
+	mov	@acc[3], 8*3(%rax)
+
+	mov	@acc[4], 8*4(%rax)	# copy modulus to |b|
+	mov	@acc[5], 8*5(%rax)
+	mov	@acc[6], 8*6(%rax)
+	mov	@acc[7], 8*7(%rax)
+	mov	%rax, $in_ptr
+
+	################################# first iteration
+	mov	\$31, $cnt
+	call	__ab_approximation_31_256
+	#mov	$f0, 8*0(%rsp)
+	#mov	$g0, 8*1(%rsp)
+	mov	$f1, 8*2(%rsp)
+	mov	$g1, 8*3(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulq_256_n_shift_by_31
+	#mov	$f0, 8*0(%rsp)		# corrected |f0|
+	#mov	$g0, 8*1(%rsp)		# corrected |g0|
+	mov	$f0, 8*8($out_ptr)	# initialize |u| with |f0|
+
+	mov	8*2(%rsp), $f0		# |f1|
+	mov	8*3(%rsp), $g0		# |g1|
+	lea	8*4($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulq_256_n_shift_by_31
+	#mov	$f0, 8*2(%rsp)		# corrected |f1|
+	#mov	$g0, 8*3(%rsp)		# corrected |g1|
+	mov	$f0, 8*9($out_ptr)	# initialize |v| with |f1|
+
+	################################# second iteration
+	xor	\$256, $in_ptr		# flip-flop pointer to source |a|b|u|v|
+	mov	\$31, $cnt
+	call	__ab_approximation_31_256
+	#mov	$f0, 8*0(%rsp)
+	#mov	$g0, 8*1(%rsp)
+	mov	$f1, 8*2(%rsp)
+	mov	$g1, 8*3(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulq_256_n_shift_by_31
+	mov	$f0, 8*0(%rsp)		# corrected |f0|
+	mov	$g0, 8*1(%rsp)		# corrected |g0|
+
+	mov	8*2(%rsp), $f0		# |f1|
+	mov	8*3(%rsp), $g0		# |g1|
+	lea	8*4($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulq_256_n_shift_by_31
+	#mov	$f0, 8*2(%rsp)		# corrected |f1|
+	#mov	$g0, 8*3(%rsp)		# corrected |g1|
+
+	mov	8*8($in_ptr),  @acc[0]	# |u|
+	mov	8*13($in_ptr), @acc[4]	# |v|
+	mov	@acc[0], @acc[1]
+	imulq	8*0(%rsp), @acc[0]	# |u|*|f0|
+	mov	@acc[4], @acc[5]
+	imulq	8*1(%rsp), @acc[4]	# |v|*|g0|
+	add	@acc[4], @acc[0]
+	mov	@acc[0], 8*4($out_ptr)	# destination |u|
+	sar	\$63, @acc[0]		# sign extension
+	mov	@acc[0], 8*5($out_ptr)
+	mov	@acc[0], 8*6($out_ptr)
+	mov	@acc[0], 8*7($out_ptr)
+	mov	@acc[0], 8*8($out_ptr)
+	lea	8*8($in_ptr), $in_ptr	# make in_ptr "rewindable" with xor
+
+	imulq	$f0, @acc[1]		# |u|*|f1|
+	imulq	$g0, @acc[5]		# |v|*|g1|
+	add	@acc[5], @acc[1]
+	mov	@acc[1], 8*9($out_ptr)	# destination |v|
+	sar	\$63, @acc[1]		# sign extension
+	mov	@acc[1], 8*10($out_ptr)
+	mov	@acc[1], 8*11($out_ptr)
+	mov	@acc[1], 8*12($out_ptr)
+	mov	@acc[1], 8*13($out_ptr)
+___
+for($i=2; $i<15; $i++) {
+my $smul_512x63  = $i>8  ? "__smulq_512x63"
+                         : "__smulq_256x63";
+$code.=<<___;
+	xor	\$256+8*8, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$31, $cnt
+	call	__ab_approximation_31_256
+	#mov	$f0, 8*0(%rsp)
+	#mov	$g0, 8*1(%rsp)
+	mov	$f1, 8*2(%rsp)
+	mov	$g1, 8*3(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulq_256_n_shift_by_31
+	mov	$f0, 8*0(%rsp)		# corrected |f0|
+	mov	$g0, 8*1(%rsp)		# corrected |g0|
+
+	mov	8*2(%rsp), $f0		# |f1|
+	mov	8*3(%rsp), $g0		# |g1|
+	lea	8*4($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulq_256_n_shift_by_31
+	mov	$f0, 8*2(%rsp)		# corrected |f1|
+	mov	$g0, 8*3(%rsp)		# corrected |g1|
+
+	mov	8*0(%rsp), $f0		# |f0|
+	mov	8*1(%rsp), $g0		# |g0|
+	lea	8*8($in_ptr), $in_ptr	# pointer to source |u|v|
+	lea	8*4($out_ptr), $out_ptr	# pointer to destination |u|
+	call	__smulq_256x63
+
+	mov	8*2(%rsp), $f0		# |f1|
+	mov	8*3(%rsp), $g0		# |g1|
+	lea	8*5($out_ptr),$out_ptr	# pointer to destination |v|
+	call	$smul_512x63
+___
+$code.=<<___	if ($i==8);
+	sar	\$63, %rbp		# sign extension
+	mov	%rbp, 8*5($out_ptr)
+	mov	%rbp, 8*6($out_ptr)
+	mov	%rbp, 8*7($out_ptr)
+___
+}
+$code.=<<___;
+	################################# two[!] last iterations in one go
+	xor	\$256+8*8, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$47, $cnt		# 31 + 512 % 31
+	#call	__ab_approximation_31	# |a| and |b| are exact, just load
+	mov	8*0($in_ptr), @acc[0]	# |a_lo|
+	#xor	@acc[1],      @acc[1]	# |a_hi|
+	mov	8*4($in_ptr), @acc[2]	# |b_lo|
+	#xor	@acc[3],      @acc[3]	# |b_hi|
+	call	__inner_loop_62_256
+	#mov	$f0, 8*0(%rsp)
+	#mov	$g0, 8*1(%rsp)
+	#mov	$f1, 8*2(%rsp)
+	#mov	$g1, 8*3(%rsp)
+
+	#mov	8*0(%rsp), $f0		# |f0|
+	#mov	8*1(%rsp), $g0		# |g0|
+	lea	8*8($in_ptr), $in_ptr	# pointer to source |u|v|
+	#lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
+	#call	__smulq_256x63
+
+	#mov	8*2(%rsp), $f0		# |f1|
+	#mov	8*3(%rsp), $g0		# |g1|
+	mov	$f1, $f0
+	mov	$g1, $g0
+	mov	8*4(%rsp), $out_ptr	# original |out_ptr|
+	call	__smulq_512x63
+	adc	%rbp, %rdx		# the excess limb of the result
+
+	mov	8*5(%rsp), $in_ptr	# original |nx_ptr|
+	mov	%rdx, %rax
+	sar	\$63, %rdx		# result's sign as mask
+
+	mov	%rdx, @acc[0]		# mask |modulus|
+	mov	%rdx, @acc[1]
+	and	8*0($in_ptr), @acc[0]
+	mov	%rdx, @acc[2]
+	and	8*1($in_ptr), @acc[1]
+	and	8*2($in_ptr), @acc[2]
+	and	8*3($in_ptr), %rdx
+
+	add	@acc[0], @acc[4]	# conditionally add |modulus|<<256
+	adc	@acc[1], @acc[5]
+	adc	@acc[2], @acc[6]
+	adc	%rdx,    @acc[7]
+	adc	\$0,     %rax
+
+	mov	%rax, %rdx
+	neg	%rax
+	or	%rax, %rdx		# excess bit or sign as mask
+	sar	\$63, %rax		# excess bit as mask
+
+	mov	%rdx, @acc[0]		# mask |modulus|
+	mov	%rdx, @acc[1]
+	and	8*0($in_ptr), @acc[0]
+	mov	%rdx, @acc[2]
+	and	8*1($in_ptr), @acc[1]
+	and	8*2($in_ptr), @acc[2]
+	and	8*3($in_ptr), %rdx
+
+	xor	%rax, @acc[0]		# conditionally negate |modulus|
+	xor	%rcx, %rcx
+	xor	%rax, @acc[1]
+	sub	%rax, %rcx
+	xor	%rax, @acc[2]
+	xor	%rax, %rdx
+	add	%rcx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, %rdx
+
+	add	@acc[0], @acc[4]	# final adjustment for |modulus|<<256
+	adc	@acc[1], @acc[5]
+	adc	@acc[2], @acc[6]
+	adc	%rdx,    @acc[7]
+
+	mov	@acc[4], 8*4($out_ptr)	# store absolute value
+	mov	@acc[5], 8*5($out_ptr)
+	mov	@acc[6], 8*6($out_ptr)
+	mov	@acc[7], 8*7($out_ptr)
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	ct_inverse_mod_256,.-ct_inverse_mod_256
+___
+########################################################################
+# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
+# to the maximum bit-length of the *result*, and "63" - to the maximum
+# bit-length of the |f?| and |g?| single-limb multiplicands. However!
+# The latter should not be taken literally, as they are always chosen so
+# that "bad things" don't happen. For example, there comes a point when
+# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
+# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
+# because past that point |f0| is always 1 and |g0| is always 0. And,
+# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
+# perform full-width |u|*|f1| multiplication, half-width one with sign
+# extension is sufficient...
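+#
+# In terms of the Python reference at the top of this file: the |u| update
+# u = u*f0 + v*g0 always goes through __smulq_256x63, while the |v| update
+# v = u*f1 + v*g1 switches to the wider __smulq_512x63 once |v| has grown
+# past 256 bits (the $i>8 iterations of the loop above).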
+$code.=<<___;
+.type	__smulq_512x63,\@abi-omnipotent
+.align	32
+__smulq_512x63:
+	mov	8*0($in_ptr), @acc[0]	# load |u|
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), %rbp	# sign limb
+
+	mov	$f0, %rbx
+	sar	\$63, $f0		# |f0|'s sign as mask
+	xor	%rax, %rax
+	sub	$f0, %rax		# |f0|'s sign as bit
+
+	xor	$f0, %rbx		# conditionally negate |f0|
+	add	%rax, %rbx
+
+	xor	$f0, @acc[0]		# conditionally negate |u|
+	xor	$f0, @acc[1]
+	xor	$f0, @acc[2]
+	xor	$f0, @acc[3]
+	xor	$f0, %rbp
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, %rbp
+
+	mulq	%rbx			# |u|*|f0|
+	mov	%rax, 8*0($out_ptr)	# offload |u|*|f0|
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<3; $i++) {
+$code.=<<___;
+	mulq	%rbx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	@acc[$i], 8*$i($out_ptr)
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___;
+	and	%rbx, %rbp
+	neg	%rbp
+	mulq	%rbx
+	add	%rax, @acc[3]
+	adc	%rdx, %rbp
+	mov	@acc[3], 8*3($out_ptr)
+
+	mov	8*5($in_ptr), @acc[0]	# load |v|
+	mov	8*6($in_ptr), @acc[1]
+	mov	8*7($in_ptr), @acc[2]
+	mov	8*8($in_ptr), @acc[3]
+	mov	8*9($in_ptr), @acc[4]
+	mov	8*10($in_ptr), @acc[5]
+	mov	8*11($in_ptr), @acc[6]
+	mov	8*12($in_ptr), @acc[7]
+
+	mov	$g0, $f0
+	sar	\$63, $f0		# |g0|'s sign as mask
+	xor	%rax, %rax
+	sub	$f0, %rax		# |g0|'s sign as bit
+
+	xor	$f0, $g0		# conditionally negate |g0|
+	add	%rax, $g0
+
+	xor	$f0, @acc[0]		# conditionally negate |v|
+	xor	$f0, @acc[1]
+	xor	$f0, @acc[2]
+	xor	$f0, @acc[3]
+	xor	$f0, @acc[4]
+	xor	$f0, @acc[5]
+	xor	$f0, @acc[6]
+	xor	$f0, @acc[7]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+	adc	\$0, @acc[6]
+	adc	\$0, @acc[7]
+
+	mulq	$g0
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<7; $i++) {
+$code.=<<___;
+	mulq	$g0
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___;
+	imulq	$g0
+	add	%rax, @acc[7]
+	adc	\$0, %rdx		# used in the final step
+
+	mov	%rbp, %rbx
+	sar	\$63, %rbp		# sign extension
+
+	add	8*0($out_ptr), @acc[0]	# accumulate |u|*|f0|
+	adc	8*1($out_ptr), @acc[1]
+	adc	8*2($out_ptr), @acc[2]
+	adc	8*3($out_ptr), @acc[3]
+	adc	%rbx, @acc[4]
+	adc	%rbp, @acc[5]
+	adc	%rbp, @acc[6]
+	adc	%rbp, @acc[7]
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+	mov	@acc[6], 8*6($out_ptr)
+	mov	@acc[7], 8*7($out_ptr)
+
+	ret
+.size	__smulq_512x63,.-__smulq_512x63
+
+.type	__smulq_256x63,\@abi-omnipotent
+.align	32
+__smulq_256x63:
+___
+for($j=0; $j<2; $j++) {
+my $k = 8*5*$j;
+my @acc=@acc;	@acc=@acc[4..7]	if($j);
+my $top="%rbp";	$top=$g0	if($j);
+$code.=<<___;
+	mov	$k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
+	mov	$k+8*1($in_ptr), @acc[1]
+	mov	$k+8*2($in_ptr), @acc[2]
+	mov	$k+8*3($in_ptr), @acc[3]
+	mov	$k+8*4($in_ptr), $top	# sign/excess limb
+
+	mov	$f0, %rbx
+	sar	\$63, $f0		# |f0|'s sign as mask (or |g0|'s)
+	xor	%rax, %rax
+	sub	$f0, %rax		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	$f0, %rbx		# conditionally negate |f0|
+	add	%rax, %rbx
+
+	xor	$f0, @acc[0]		# conditionally negate |u| (or |v|)
+	xor	$f0, @acc[1]
+	xor	$f0, @acc[2]
+	xor	$f0, @acc[3]
+	xor	$f0, $top
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, $top
+
+	mulq	%rbx
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<3; $i++) {
+$code.=<<___;
+	mulq	%rbx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___;
+	and	%rbx, $top
+	neg	$top
+	mulq	%rbx
+	add	%rax, @acc[3]
+	adc	%rdx, $top
+___
+$code.=<<___	if ($j==0);
+	mov	$g0, $f0
+___
+}
+$code.=<<___;
+	add	@acc[4], @acc[0]	# accumulate |u|*|f0|
+	adc	@acc[5], @acc[1]
+	adc	@acc[6], @acc[2]
+	adc	@acc[7], @acc[3]
+	adc	%rcx, %rbp
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	%rbp,    8*4($out_ptr)
+
+	ret
+.size	__smulq_256x63,.-__smulq_256x63
+___
+########################################################################
+# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
+# the names refers to maximum bit-lengths of |a| and |b|. As already
+# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
+# chosen so that "bad things" don't happen. For example, so that the
+# sum of the products doesn't overflow, and that the final result is
+# never wider than inputs...
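+#
+# In terms of the Python reference above, __smulq_256_n_shift_by_31 computes
+#	t = a*f0 + b*g0		(divisible by 2**31 by construction)
+#	a = abs(t >> 31)
+# and flips the signs of |f0| and |g0| whenever t was negative, so that the
+# caller always continues with a non-negative |a| (and likewise for |b| with
+# |f1|, |g1|).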
+{
+$code.=<<___;
+.type	__smulq_256_n_shift_by_31,\@abi-omnipotent
+.align	32
+__smulq_256_n_shift_by_31:
+	mov	$f0, 8*0($out_ptr)	# offload |f0|
+	mov	$g0, 8*1($out_ptr)	# offload |g0|
+	mov	$f0, %rbp
+___
+for($j=0; $j<2; $j++) {
+my $k = 8*4*$j;
+my @acc=@acc;	@acc=@acc[4..7] if ($j);
+my $f0="%rbp";	$f0=$g0		if ($j);
+$code.=<<___;
+	mov	$k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
+	mov	$k+8*1($in_ptr), @acc[1]
+	mov	$k+8*2($in_ptr), @acc[2]
+	mov	$k+8*3($in_ptr), @acc[3]
+
+	mov	$f0, %rbx
+	sar	\$63, $f0		# |f0|'s sign as mask (or |g0|'s)
+	xor	%rax, %rax
+	sub	$f0, %rax		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	$f0, %rbx		# conditionally negate |f0| (or |g0|)
+	add	%rax, %rbx
+
+	xor	$f0, @acc[0]		# conditionally negate |a| (or |b|)
+	xor	$f0, @acc[1]
+	xor	$f0, @acc[2]
+	xor	$f0, @acc[3]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+
+	mulq	%rbx
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	and	%rbx, $f0
+	neg	$f0
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<3; $i++) {
+$code.=<<___;
+	mulq	%rbx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___;
+	mulq	%rbx
+	add	%rax, @acc[3]
+	adc	%rdx, $f0
+___
+}
+$code.=<<___;
+	add	@acc[4], @acc[0]
+	adc	@acc[5], @acc[1]
+	adc	@acc[6], @acc[2]
+	adc	@acc[7], @acc[3]
+	adc	$g0, %rbp
+
+	mov	8*0($out_ptr), $f0	# restore original |f0|
+	mov	8*1($out_ptr), $g0	# restore original |g0|
+
+	shrd	\$31, @acc[1], @acc[0]
+	shrd	\$31, @acc[2], @acc[1]
+	shrd	\$31, @acc[3], @acc[2]
+	shrd	\$31, %rbp,    @acc[3]
+
+	sar	\$63, %rbp		# sign as mask
+	xor	%rax, %rax
+	sub	%rbp, %rax		# sign as bit
+
+	xor	%rbp, @acc[0]		# conditionally negate the result
+	xor	%rbp, @acc[1]
+	xor	%rbp, @acc[2]
+	xor	%rbp, @acc[3]
+	add	%rax, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+
+	xor	%rbp, $f0		# conditionally negate |f0|
+	xor	%rbp, $g0		# conditionally negate |g0|
+	add	%rax, $f0
+	add	%rax, $g0
+
+	ret
+.size	__smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31
+___
+}
+
+{
+my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
+my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
+my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
+my ($a_, $b_) = ($a_lo, $b_lo);
+{
+my @a = ($a_lo, $t1, $a_hi);
+my @b = ($b_lo, $t2, $b_hi);
+
+$code.=<<___;
+.type	__ab_approximation_31_256,\@abi-omnipotent
+.align	32
+__ab_approximation_31_256:
+	mov	8*3($in_ptr), @a[2]	# load |a| in reverse order
+	mov	8*7($in_ptr), @b[2]	# load |b| in reverse order
+	mov	8*2($in_ptr), @a[1]
+	mov	8*6($in_ptr), @b[1]
+	mov	8*1($in_ptr), @a[0]
+	mov	8*5($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# check top-most limbs, ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	mov	8*0($in_ptr), @a[0]
+	cmovz	@b[0], @b[1]
+	mov	8*4($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# ... and ones before that ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	cmovz	@b[0], @b[1]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0
+	bsr	$t0, %rcx
+	lea	1(%rcx), %rcx
+	cmovz	@a[0], @a[2]
+	cmovz	@b[0], @b[2]
+	cmovz	$t0, %rcx
+	neg	%rcx
+	#and	\$63, %rcx		# debugging artefact
+
+	shldq	%cl, @a[1], @a[2]	# align second limb to the left
+	shldq	%cl, @b[1], @b[2]
+
+	mov	\$0x7FFFFFFF, %eax
+	and	%rax, @a[0]
+	and	%rax, @b[0]
+	not	%rax
+	and	%rax, @a[2]
+	and	%rax, @b[2]
+	or	@a[2], @a[0]
+	or	@b[2], @b[0]
+
+	jmp	__inner_loop_31_256
+
+	ret
+.size	__ab_approximation_31_256,.-__ab_approximation_31_256
+___
+}
+$code.=<<___;
+.type	__inner_loop_31_256,\@abi-omnipotent
+.align	32			# commenting this out punishes Coffee Lake by up to 40%
+__inner_loop_31_256:		################# by Thomas Pornin
+	mov	\$0x7FFFFFFF80000000, $fg0	# |f0|=1, |g0|=0
+	mov	\$0x800000007FFFFFFF, $fg1	# |f1|=0, |g1|=1
+	mov	\$0x7FFFFFFF7FFFFFFF, $bias
+
+.Loop_31_256:
+	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
+	mov	$a_, $t0
+	mov	$b_, $t1
+	mov	$fg0, $t2
+	mov	$fg1, $t3
+	cmovb	$b_, $a_
+	cmovb	$t0, $b_
+	cmovb	$fg1, $fg0
+	cmovb	$t2, $fg1
+
+	sub	$b_, $a_		# |a_|-|b_|
+	sub	$fg1, $fg0		# |f0|-|f1|, |g0|-|g1|
+	add	$bias, $fg0
+
+	test	\$1, $t0		# if |a_| was even, roll back 
+	cmovz	$t0, $a_
+	cmovz	$t1, $b_
+	cmovz	$t2, $fg0
+	cmovz	$t3, $fg1
+
+	shr	\$1, $a_		# |a_|>>=1
+	add	$fg1, $fg1		# |f1|<<=1, |g1|<<=1
+	sub	$bias, $fg1
+	sub	\$1, $cnt
+	jnz	.Loop_31_256
+
+	shr	\$32, $bias
+	mov	%ecx, %edx		# $fg0, $f0
+	mov	${fg1}d, ${f1}d
+	shr	\$32, $g0
+	shr	\$32, $g1
+	sub	$bias, $f0		# remove the bias
+	sub	$bias, $g0
+	sub	$bias, $f1
+	sub	$bias, $g1
+
+	ret
+.size	__inner_loop_31_256,.-__inner_loop_31_256
+
+.type	__inner_loop_62_256,\@abi-omnipotent
+.align	32
+__inner_loop_62_256:
+	mov	$cnt, %r15d
+	mov	\$1, $f0	# |f0|=1
+	xor	$g0, $g0	# |g0|=0
+	xor	$f1, $f1	# |f1|=0
+	mov	$f0, $g1	# |g1|=1
+	mov	$f0, %r14
+
+.Loop_62_256:
+	xor	$t0, $t0
+	test	%r14, $a_lo	# if |a_| is odd, then we'll be subtracting |b_|
+	mov	$b_lo, $t1
+	cmovnz	$b_lo, $t0
+	sub	$a_lo, $t1	# |b_|-|a_|
+	mov	$a_lo, $t2
+	sub	$t0, $a_lo	# |a_|-|b_| (or |a_|-0 if |a_| was even)
+	cmovc	$t1, $a_lo	# borrow means |a_|<|b_|, replace with |b_|-|a_|
+	cmovc	$t2, $b_lo	# |b_| = |a_|
+	mov	$f0, $t0	# exchange |f0| and |f1|
+	cmovc	$f1, $f0
+	cmovc	$t0, $f1
+	mov	$g0, $t1	# exchange |g0| and |g1|
+	cmovc	$g1, $g0
+	cmovc	$t1, $g1
+	xor	$t0, $t0
+	xor	$t1, $t1
+	shr	\$1, $a_lo
+	test	%r14, $t2	# if |a_| was odd, then we'll be subtracting...
+	cmovnz	$f1, $t0
+	cmovnz	$g1, $t1
+	add	$f1, $f1	# |f1|<<=1
+	add	$g1, $g1	# |g1|<<=1
+	sub	$t0, $f0	# |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	$t1, $g0	# |g0|-=|g1| (or |g0-=0| ...)
+	sub	\$1, %r15d
+	jnz	.Loop_62_256
+
+	ret
+.size	__inner_loop_62_256,.-__inner_loop_62_256
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/ct_inverse_mod_384-armv8.pl b/blst/asm/ct_inverse_mod_384-armv8.pl
new file mode 100755
index 0000000..268bf9d
--- /dev/null
+++ b/blst/asm/ct_inverse_mod_384-armv8.pl
@@ -0,0 +1,610 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. Performance is >12x better [on
+# Cortex cores] than a modulus-specific FLT addition chain...
+#
+# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_383(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 62
+    w = 64
+    mask = (1 << w) - 1
+
+    for i in range(0, 766 // k):
+        # __ab_approximation_62
+        n = max(a.bit_length(), b.bit_length())
+        if n < 128:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-w)) << w)
+            b_ = (b & mask) | ((b >> (n-w)) << w)
+
+        # __inner_loop_62
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smul_383_n_shift_by_62
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smul_767x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 766 % k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 766 % k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    if v < 0:
+        v += mod << (768 - mod.bit_length())    # left aligned
+
+    return v & (2**768 - 1) # to be reduced % mod
+___
+
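+# With k = 62 the loop above runs 766//62 = 12 times and the tail handles
+# the remaining 766%62 = 22 division steps; the assembly below matches this
+# with twelve 62-step passes (two unrolled, the $i=2..10 loop and the
+# "iteration before last") plus a final 22-step __inner_loop_62 call.
+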
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
+my @acc=map("x$_",(3..14));
+my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21));
+my $cnt = $n_ptr;
+my @t = map("x$_",(22..28,2));
+my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11];
+
+$frame = 16+2*512;
+
+$code.=<<___;
+.text
+
+.globl	ct_inverse_mod_383
+.type	ct_inverse_mod_383, %function
+.align	5
+ct_inverse_mod_383:
+	paciasp
+	stp	x29, x30, [sp,#-128]!
+	add	x29, sp, #0
+	stp	x19, x20, [sp,#16]
+	stp	x21, x22, [sp,#32]
+	stp	x23, x24, [sp,#48]
+	stp	x25, x26, [sp,#64]
+	stp	x27, x28, [sp,#80]
+	sub	sp, sp, #$frame
+
+	ldp	@t[0],   @acc[1], [$in_ptr,#8*0]
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2]
+	ldp	@acc[4], @acc[5], [$in_ptr,#8*4]
+
+	add	$in_ptr, sp, #16+511	// find closest 512-byte-aligned spot
+	and	$in_ptr, $in_ptr, #-512	// in the frame...
+	stp	$out_ptr, $nx_ptr, [sp]
+
+	ldp	@acc[6], @acc[7], [$n_ptr,#8*0]
+	ldp	@acc[8], @acc[9], [$n_ptr,#8*2]
+	ldp	@acc[10], @acc[11], [$n_ptr,#8*4]
+
+	stp	@t[0],   @acc[1], [$in_ptr,#8*0]	// copy input to |a|
+	stp	@acc[2], @acc[3], [$in_ptr,#8*2]
+	stp	@acc[4], @acc[5], [$in_ptr,#8*4]
+	stp	@acc[6], @acc[7], [$in_ptr,#8*6]	// copy modulus to |b|
+	stp	@acc[8], @acc[9], [$in_ptr,#8*8]
+	stp	@acc[10], @acc[11], [$in_ptr,#8*10]
+
+	////////////////////////////////////////// first iteration
+	mov	$cnt, #62
+	bl	.Lab_approximation_62_loaded
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	str	$f0,[$out_ptr,#8*12]		// initialize |u| with |f0|
+
+	mov	$f0, $f1			// |f1|
+	mov	$g0, $g1			// |g1|
+	add	$out_ptr, $out_ptr, #8*6	// pointer to dst |b|
+	bl	__smul_383_n_shift_by_62
+	str	$f0, [$out_ptr,#8*12]		// initialize |v| with |f1|
+
+	////////////////////////////////////////// second iteration
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	mov	$cnt, #62
+	bl	__ab_approximation_62
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	$f_, $f0			// corrected |f0|
+	mov	$g_, $g0			// corrected |g0|
+
+	mov	$f0, $f1			// |f1|
+	mov	$g0, $g1			// |g1|
+	add	$out_ptr, $out_ptr, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	ldr	@acc[4], [$in_ptr,#8*12]	// |u|
+	ldr	@acc[5], [$in_ptr,#8*18]	// |v|
+	mul	@acc[0], $f_, @acc[4]		// |u|*|f0|
+	smulh	@acc[1], $f_, @acc[4]
+	mul	@acc[2], $g_, @acc[5]		// |v|*|g0|
+	smulh	@acc[3], $g_, @acc[5]
+	adds	@acc[0], @acc[0], @acc[2]
+	adc	@acc[1], @acc[1], @acc[3]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*6]
+	asr	@acc[2], @acc[1], #63		// sign extension
+	stp	@acc[2], @acc[2], [$out_ptr,#8*8]
+	stp	@acc[2], @acc[2], [$out_ptr,#8*10]
+
+	mul	@acc[0], $f0, @acc[4]		// |u|*|f1|
+	smulh	@acc[1], $f0, @acc[4]
+	mul	@acc[2], $g0, @acc[5]		// |v|*|g1|
+	smulh	@acc[3], $g0, @acc[5]
+	adds	@acc[0], @acc[0], @acc[2]
+	adc	@acc[1], @acc[1], @acc[3]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*12]
+	asr	@acc[2], @acc[1], #63		// sign extension
+	stp	@acc[2], @acc[2], [$out_ptr,#8*14]
+	stp	@acc[2], @acc[2], [$out_ptr,#8*16]
+___
+for($i=2; $i<11; $i++) {
+$code.=<<___;
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	mov	$cnt, #62
+	bl	__ab_approximation_62
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	$f_, $f0			// corrected |f0|
+	mov	$g_, $g0			// corrected |g0|
+
+	mov	$f0, $f1			// |f1|
+	mov	$g0, $g1			// |g1|
+	add	$out_ptr, $out_ptr, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	$out_ptr, $out_ptr, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	$f_, $f0			// corrected |f1|
+	mov	$g_, $g0			// corrected |g1|
+	add	$out_ptr, $out_ptr, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+___
+$code.=<<___	if ($i>5);
+	bl	__smul_767x63_tail
+___
+$code.=<<___	if ($i==5);
+	asr	@t[5], @t[5], #63		// sign extension
+	stp	@t[5], @t[5], [$out_ptr,#8*6]
+	stp	@t[5], @t[5], [$out_ptr,#8*8]
+	stp	@t[5], @t[5], [$out_ptr,#8*10]
+___
+}
+$code.=<<___;
+	////////////////////////////////////////// iteration before last
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	mov	$cnt, #62
+	//bl	__ab_approximation_62		// |a| and |b| are exact,
+	ldp	$a_lo, $a_hi, [$in_ptr,#8*0]	// just load
+	ldp	$b_lo, $b_hi, [$in_ptr,#8*6]
+	bl	__inner_loop_62
+
+	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
+	str	$a_lo, [$out_ptr,#8*0]
+	str	$b_lo, [$out_ptr,#8*6]
+
+	mov	$f_, $f0			// exact |f0|
+	mov	$g_, $g0			// exact |g0|
+	mov	$f0, $f1
+	mov	$g0, $g1
+	add	$out_ptr, $out_ptr, #8*12	// pointer to dst |u|
+	bl	__smul_383x63
+
+	mov	$f_, $f0			// exact |f1|
+	mov	$g_, $g0			// exact |g1|
+	add	$out_ptr, $out_ptr, #8*6	// pointer to dst |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+
+	////////////////////////////////////////// last iteration
+	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
+	mov	$cnt, #22			// 766 % 62
+	//bl	__ab_approximation_62		// |a| and |b| are exact,
+	ldr	$a_lo, [$in_ptr,#8*0]		// just load
+	eor	$a_hi, $a_hi, $a_hi
+	ldr	$b_lo, [$in_ptr,#8*6]
+	eor	$b_hi, $b_hi, $b_hi
+	bl	__inner_loop_62
+
+	mov	$f_, $f1
+	mov	$g_, $g1
+	ldp	$out_ptr, $f0, [sp]		// original out_ptr and n_ptr
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	ldr	x30, [x29,#8]
+
+	asr	@t[0], @acc[5], #63		// sign as mask
+	ldp	@acc[6], @acc[7], [$f0,#8*0]
+	ldp	@acc[8], @acc[9], [$f0,#8*2]
+	ldp	@acc[10], @acc[11], [$f0,#8*4]
+
+	and	@acc[6], @acc[6], @t[0]		// add mod<<384 conditionally
+	and	@acc[7], @acc[7], @t[0]
+	adds	@acc[0], @acc[0], @acc[6]
+	and	@acc[8], @acc[8], @t[0]
+	adcs	@acc[1], @acc[1], @acc[7]
+	and	@acc[9], @acc[9], @t[0]
+	adcs	@acc[2], @acc[2], @acc[8]
+	and	@acc[10], @acc[10], @t[0]
+	adcs	@acc[3], @acc[3], @acc[9]
+	and	@acc[11], @acc[11], @t[0]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*6]
+	adcs	@acc[4], @acc[4], @acc[10]
+	stp	@acc[2], @acc[3], [$out_ptr,#8*8]
+	adc	@acc[5], @acc[5], @acc[11]
+	stp	@acc[4], @acc[5], [$out_ptr,#8*10]
+
+	add	sp, sp, #$frame
+	ldp	x19, x20, [x29,#16]
+	ldp	x21, x22, [x29,#32]
+	ldp	x23, x24, [x29,#48]
+	ldp	x25, x26, [x29,#64]
+	ldp	x27, x28, [x29,#80]
+	ldr	x29, [sp],#128
+	autiasp
+	ret
+.size	ct_inverse_mod_383,.-ct_inverse_mod_383
+
+////////////////////////////////////////////////////////////////////////
+// see corresponding commentary in ctx_inverse_mod_384-x86_64...
+.type	__smul_383x63, %function
+.align	5
+__smul_383x63:
+___
+for($j=0; $j<2; $j++) {
+my $f_ = $f_;   $f_ = $g_          if ($j);
+my @acc = @acc; @acc = @acc[6..11] if ($j);
+my $k = 8*12+8*6*$j;
+$code.=<<___;
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |u| (or |v|)
+	asr	$f1, $f_, #63		// |f_|'s sign as mask (or |g_|'s)
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
+	eor	$f_, $f_, $f1		// conditionally negate |f_| (or |g_|)
+	ldp	@acc[4], @acc[5], [$in_ptr,#8*4+$k]
+
+	eor	@acc[0], @acc[0], $f1	// conditionally negate |u| (or |v|)
+	sub	$f_, $f_, $f1
+	eor	@acc[1], @acc[1], $f1
+	adds	@acc[0], @acc[0], $f1, lsr#63
+	eor	@acc[2], @acc[2], $f1
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], $f1
+	adcs	@acc[2], @acc[2], xzr
+	eor	@acc[4], @acc[4], $f1
+	adcs	@acc[3], @acc[3], xzr
+	 umulh	@t[0], @acc[0], $f_
+	eor	@acc[5], @acc[5], $f1
+	 umulh	@t[1], @acc[1], $f_
+	adcs	@acc[4], @acc[4], xzr
+	 umulh	@t[2], @acc[2], $f_
+	adcs	@acc[5], @acc[5], xzr
+	 umulh	@t[3], @acc[3], $f_
+___
+$code.=<<___	if ($j);
+	adc	$g1, xzr, xzr		// used in __smul_767x63_tail
+___
+$code.=<<___;
+	umulh	@t[4], @acc[4], $f_
+	mul	@acc[0], @acc[0], $f_
+	mul	@acc[1], @acc[1], $f_
+	mul	@acc[2], @acc[2], $f_
+	adds	@acc[1], @acc[1], @t[0]
+	mul	@acc[3], @acc[3], $f_
+	adcs	@acc[2], @acc[2], @t[1]
+	mul	@acc[4], @acc[4], $f_
+	adcs	@acc[3], @acc[3], @t[2]
+	mul	@t[5+$j],@acc[5], $f_
+	adcs	@acc[4], @acc[4], @t[3]
+	adcs	@t[5+$j],@t[5+$j],@t[4]
+___
+$code.=<<___	if ($j==0);
+	adc	@t[7], xzr, xzr
+___
+}
+$code.=<<___;
+	adc	@t[7], @t[7], xzr
+
+	adds	@acc[0], @acc[0], @acc[6]
+	adcs	@acc[1], @acc[1], @acc[7]
+	adcs	@acc[2], @acc[2], @acc[8]
+	adcs	@acc[3], @acc[3], @acc[9]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
+	adcs	@acc[4], @acc[4], @acc[10]
+	stp	@acc[2], @acc[3], [$out_ptr,#8*2]
+	adcs	@t[5],   @t[5],   @t[6]
+	stp	@acc[4], @t[5],   [$out_ptr,#8*4]
+	adc	@t[6],   @t[7],   xzr	// used in __smul_767x63_tail
+
+	ret
+.size	__smul_383x63,.-__smul_383x63
+
+.type	__smul_767x63_tail, %function
+.align	5
+__smul_767x63_tail:
+	smulh	@t[5],   @acc[5], $f_
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*24]	// load rest of |v|
+	umulh	@acc[11],@acc[11], $g_
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*26]
+	ldp	@acc[4], @acc[5], [$in_ptr,#8*28]
+
+	eor	@acc[0], @acc[0], $f1	// conditionally negate rest of |v|
+	eor	@acc[1], @acc[1], $f1
+	eor	@acc[2], @acc[2], $f1
+	adds	@acc[0], @acc[0], $g1
+	eor	@acc[3], @acc[3], $f1
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[4], @acc[4], $f1
+	adcs	@acc[2], @acc[2], xzr
+	eor	@acc[5], @acc[5], $f1
+	adcs	@acc[3], @acc[3], xzr
+	 umulh	@t[0], @acc[0], $g_
+	adcs	@acc[4], @acc[4], xzr
+	 umulh	@t[1], @acc[1], $g_
+	adc	@acc[5], @acc[5], xzr
+
+	umulh	@t[2], @acc[2], $g_
+	 add	@acc[11], @acc[11], @t[6]
+	umulh	@t[3], @acc[3], $g_
+	 asr	@t[6], @t[5], #63
+	umulh	@t[4], @acc[4], $g_
+	mul	@acc[0], @acc[0], $g_
+	mul	@acc[1], @acc[1], $g_
+	mul	@acc[2], @acc[2], $g_
+	adds	@acc[0], @acc[0], @acc[11]
+	mul	@acc[3], @acc[3], $g_
+	adcs	@acc[1], @acc[1], @t[0]
+	mul	@acc[4], @acc[4], $g_
+	adcs	@acc[2], @acc[2], @t[1]
+	mul	@acc[5], @acc[5], $g_
+	adcs	@acc[3], @acc[3], @t[2]
+	adcs	@acc[4], @acc[4], @t[3]
+	adc	@acc[5], @acc[5], @t[4]
+
+	adds	@acc[0], @acc[0], @t[5]
+	adcs	@acc[1], @acc[1], @t[6]
+	adcs	@acc[2], @acc[2], @t[6]
+	adcs	@acc[3], @acc[3], @t[6]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*6]
+	adcs	@acc[4], @acc[4], @t[6]
+	stp	@acc[2], @acc[3], [$out_ptr,#8*8]
+	adc	@acc[5], @acc[5], @t[6]
+	stp	@acc[4], @acc[5], [$out_ptr,#8*10]
+
+	ret
+.size	__smul_767x63_tail,.-__smul_767x63_tail
+
+.type	__smul_383_n_shift_by_62, %function
+.align	5
+__smul_383_n_shift_by_62:
+___
+for($j=0; $j<2; $j++) {
+my $f0 = $f0;   $f0 = $g0           if ($j);
+my @acc = @acc; @acc = @acc[6..11]  if ($j);
+my $k = 8*6*$j;
+$code.=<<___;
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |a| (or |b|)
+	asr	@t[6], $f0, #63		// |f0|'s sign as mask (or |g0|'s)
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
+	eor	@t[7], $f0, @t[6]	// conditionally negate |f0| (or |g0|)
+	ldp	@acc[4], @acc[5], [$in_ptr,#8*4+$k]
+
+	eor	@acc[0], @acc[0], @t[6]	// conditionally negate |a| (or |b|)
+	sub	@t[7], @t[7], @t[6]
+	eor	@acc[1], @acc[1], @t[6]
+	adds	@acc[0], @acc[0], @t[6], lsr#63
+	eor	@acc[2], @acc[2], @t[6]
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], @t[6]
+	adcs	@acc[2], @acc[2], xzr
+	eor	@acc[4], @acc[4], @t[6]
+	 umulh	@t[0], @acc[0], @t[7]
+	adcs	@acc[3], @acc[3], xzr
+	 umulh	@t[1], @acc[1], @t[7]
+	eor	@acc[5], @acc[5], @t[6]
+	 umulh	@t[2], @acc[2], @t[7]
+	adcs	@acc[4], @acc[4], xzr
+	 umulh	@t[3], @acc[3], @t[7]
+	adc	@acc[5], @acc[5], xzr
+
+	umulh	@t[4], @acc[4], @t[7]
+	smulh	@t[5+$j], @acc[5], @t[7]
+	mul	@acc[0], @acc[0], @t[7]
+	mul	@acc[1], @acc[1], @t[7]
+	mul	@acc[2], @acc[2], @t[7]
+	adds	@acc[1], @acc[1], @t[0]
+	mul	@acc[3], @acc[3], @t[7]
+	adcs	@acc[2], @acc[2], @t[1]
+	mul	@acc[4], @acc[4], @t[7]
+	adcs	@acc[3], @acc[3], @t[2]
+	mul	@acc[5], @acc[5], @t[7]
+	adcs	@acc[4], @acc[4], @t[3]
+	adcs	@acc[5], @acc[5] ,@t[4]
+	adc	@t[5+$j], @t[5+$j], xzr
+___
+}
+$code.=<<___;
+	adds	@acc[0], @acc[0], @acc[6]
+	adcs	@acc[1], @acc[1], @acc[7]
+	adcs	@acc[2], @acc[2], @acc[8]
+	adcs	@acc[3], @acc[3], @acc[9]
+	adcs	@acc[4], @acc[4], @acc[10]
+	adcs	@acc[5], @acc[5], @acc[11]
+	adc	@acc[6], @t[5],   @t[6]
+
+	extr	@acc[0], @acc[1], @acc[0], #62
+	extr	@acc[1], @acc[2], @acc[1], #62
+	extr	@acc[2], @acc[3], @acc[2], #62
+	asr	@t[6], @acc[6], #63
+	extr	@acc[3], @acc[4], @acc[3], #62
+	extr	@acc[4], @acc[5], @acc[4], #62
+	extr	@acc[5], @acc[6], @acc[5], #62
+
+	eor	@acc[0], @acc[0], @t[6]
+	eor	@acc[1], @acc[1], @t[6]
+	adds	@acc[0], @acc[0], @t[6], lsr#63
+	eor	@acc[2], @acc[2], @t[6]
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], @t[6]
+	adcs	@acc[2], @acc[2], xzr
+	eor	@acc[4], @acc[4], @t[6]
+	adcs	@acc[3], @acc[3], xzr
+	eor	@acc[5], @acc[5], @t[6]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
+	adcs	@acc[4], @acc[4], xzr
+	stp	@acc[2], @acc[3], [$out_ptr,#8*2]
+	adc	@acc[5], @acc[5], xzr
+	stp	@acc[4], @acc[5], [$out_ptr,#8*4]
+
+	eor	$f0, $f0, @t[6]
+	eor	$g0, $g0, @t[6]
+	sub	$f0, $f0, @t[6]
+	sub	$g0, $g0, @t[6]
+
+	ret
+.size	__smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
+___
+
+{
+my @a = @acc[0..5];
+my @b = @acc[6..11];
+
+$code.=<<___;
+.type	__ab_approximation_62, %function
+.align	4
+__ab_approximation_62:
+	ldp	@a[4], @a[5], [$in_ptr,#8*4]
+	ldp	@b[4], @b[5], [$in_ptr,#8*10]
+	ldp	@a[2], @a[3], [$in_ptr,#8*2]
+	ldp	@b[2], @b[3], [$in_ptr,#8*8]
+
+.Lab_approximation_62_loaded:
+	orr	@t[0], @a[5], @b[5]	// check top-most limbs, ...
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[3], ne
+	orr	@t[0], @a[5], @b[5]	// ... ones before top-most, ...
+	csel	@b[4], @b[4], @b[3], ne
+
+	ldp	@a[0], @a[1], [$in_ptr,#8*0]
+	ldp	@b[0], @b[1], [$in_ptr,#8*6]
+
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[2], ne
+	orr	@t[0], @a[5], @b[5]	// ... and ones before that ...
+	csel	@b[4], @b[4], @b[2], ne
+
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[1], ne
+	orr	@t[0], @a[5], @b[5]
+	csel	@b[4], @b[4], @b[1], ne
+
+	clz	@t[0], @t[0]
+	cmp	@t[0], #64
+	csel	@t[0], @t[0], xzr, ne
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	neg	@t[1], @t[0]
+
+	lslv	@a[5], @a[5], @t[0]	// align high limbs to the left
+	lslv	@b[5], @b[5], @t[0]
+	lsrv	@a[4], @a[4], @t[1]
+	lsrv	@b[4], @b[4], @t[1]
+	and	@a[4], @a[4], @t[1], asr#6
+	and	@b[4], @b[4], @t[1], asr#6
+	orr	@a[5], @a[5], @a[4]
+	orr	@b[5], @b[5], @b[4]
+
+	b	__inner_loop_62
+	ret
+.size	__ab_approximation_62,.-__ab_approximation_62
+___
+}
+$code.=<<___;
+.type	__inner_loop_62, %function
+.align	4
+__inner_loop_62:
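+	// One pass of 62 constant-time binary-Euclid steps on the two-limb
+	// approximations |a_|,|b_|: each step conditionally swaps/subtracts and
+	// records itself in |f0|,|g0|,|f1|,|g1|, which the caller then applies
+	// to the full-width |a|,|b| (and |u|,|v|).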
+	mov	$f0, #1		// |f0|=1
+	mov	$g0, #0		// |g0|=0
+	mov	$f1, #0		// |f1|=0
+	mov	$g1, #1		// |g1|=1
+
+.Loop_62:
+	sbfx	@t[6], $a_lo, #0, #1	// if |a_| is odd, then we'll be subtracting
+	sub	$cnt, $cnt, #1
+	subs	@t[2], $b_lo, $a_lo	// |b_|-|a_|
+	and	@t[0], $b_lo, @t[6]
+	sbc	@t[3], $b_hi, $a_hi
+	and	@t[1], $b_hi, @t[6]
+	subs	@t[4], $a_lo, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	mov	@t[0], $f0
+	sbcs	@t[5], $a_hi, @t[1]
+	mov	@t[1], $g0
+	csel	$b_lo, $b_lo, $a_lo, hs	// |b_| = |a_|
+	csel	$b_hi, $b_hi, $a_hi, hs
+	csel	$a_lo, @t[4], @t[2], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	$a_hi, @t[5], @t[3], hs
+	csel	$f0, $f0, $f1,       hs	// exchange |f0| and |f1|
+	csel	$f1, $f1, @t[0],     hs
+	csel	$g0, $g0, $g1,       hs	// exchange |g0| and |g1|
+	csel	$g1, $g1, @t[1],     hs
+	extr	$a_lo, $a_hi, $a_lo, #1
+	lsr	$a_hi, $a_hi, #1
+	and	@t[0], $f1, @t[6]
+	and	@t[1], $g1, @t[6]
+	add	$f1, $f1, $f1		// |f1|<<=1
+	add	$g1, $g1, $g1		// |g1|<<=1
+	sub	$f0, $f0, @t[0]		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	$g0, $g0, @t[1]		// |g0|-=|g1| (or |g0-=0| ...)
+	cbnz	$cnt, .Loop_62
+
+	ret
+.size	__inner_loop_62,.-__inner_loop_62
+___
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/ct_is_square_mod_384-armv8.pl b/blst/asm/ct_is_square_mod_384-armv8.pl
new file mode 100755
index 0000000..dcf3ff8
--- /dev/null
+++ b/blst/asm/ct_is_square_mod_384-armv8.pl
@@ -0,0 +1,398 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast quadratic residue test as suggested in
+# https://eprint.iacr.org/2020/972. Performance is >12x better [on
+# Cortex cores] than a modulus-specific Legendre symbol addition chain...
+#
+# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
+#
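+# A minimal caller sketch (for illustration only; it assumes vec384 is six
+# 64-bit limbs, least-significant limb first, as the loads below suggest,
+# and that |mod| is odd):
+#
+#     vec384 x, p;
+#     /* ... fill x with the candidate and p with the modulus ... */
+#     bool is_qr = ct_is_square_mod_384(x, p);  /* non-zero iff x is a square mod p */
+#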
+$python_ref.=<<'___';
+def ct_is_square_mod_384(inp, mod):
+    a = inp
+    b = mod
+    L = 0   # only least significant bit, adding 1 makes up for sign change
+
+    k = 30
+    w = 32
+    mask = (1 << w) - 1
+
+    for i in range(0, 768 // k - 1):
+        # __ab_approximation_30
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-w)) << w)
+            b_ = (b & mask) | ((b >> (n-w)) << w)
+
+        # __inner_loop_30
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                    L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
+                                        # tell the whole story
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+            L += (b_ + 2) >> 2          # if |b|%8 is 3 or 5 [out of 1,3,5,7]
+
+        # __smulq_384_n_shift_by_30
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if b < 0:
+            b = -b
+        if a < 0:
+            a = -a
+            L += (b % 4) >> 1           # |b| is always odd, the second bit
+                                        # tells the whole story
+
+    if True:
+        for j in range(0, 768 % k + k):
+            if a & 1:
+                if a < b:
+                    a, b = b, a
+                    L += (a & b) >> 1   # |a| and |b| are both odd, second bits
+                                        # tell the whole story
+                a = a-b
+            a = a >> 1
+            L += (b + 2) >> 2           # if |b|%8 is 3 or 5 [out of 1,3,5,7]
+
+    return (L & 1) ^ 1
+___
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2));
+my @acc=map("x$_",(3..14));
+my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20));
+my @t = map("x$_",(21..28));
+my ($a_, $b_) = @acc[5,11];
+
+$frame = 2*256;
+
+$code.=<<___;
+.text
+
+.globl	ct_is_square_mod_384
+.type	ct_is_square_mod_384, %function
+.align	5
+ct_is_square_mod_384:
+	paciasp
+	stp	x29, x30, [sp,#-128]!
+	add	x29, sp, #0
+	stp	x19, x20, [sp,#16]
+	stp	x21, x22, [sp,#32]
+	stp	x23, x24, [sp,#48]
+	stp	x25, x26, [sp,#64]
+	stp	x27, x28, [sp,#80]
+	sub	sp, sp, #$frame
+
+	ldp	@acc[0], @acc[1], [x0,#8*0]		// load input
+	ldp	@acc[2], @acc[3], [x0,#8*2]
+	ldp	@acc[4], @acc[5], [x0,#8*4]
+
+	add	$in_ptr, sp, #255	// find closest 256-byte-aligned spot
+	and	$in_ptr, $in_ptr, #-256	// in the frame...
+
+	ldp	@acc[6], @acc[7], [x1,#8*0]		// load modulus
+	ldp	@acc[8], @acc[9], [x1,#8*2]
+	ldp	@acc[10], @acc[11], [x1,#8*4]
+
+	stp	@acc[0], @acc[1], [$in_ptr,#8*6]	// copy input to |a|
+	stp	@acc[2], @acc[3], [$in_ptr,#8*8]
+	stp	@acc[4], @acc[5], [$in_ptr,#8*10]
+	stp	@acc[6], @acc[7], [$in_ptr,#8*0]	// copy modulus to |b|
+	stp	@acc[8], @acc[9], [$in_ptr,#8*2]
+	stp	@acc[10], @acc[11], [$in_ptr,#8*4]
+
+	eor	$L, $L, $L			// init the Legendre symbol
+	mov	$cnt, #24			// 24 is 768/30-1
+	b	.Loop_is_square
+
+.align	4
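+	// 24 passes of 30 steps plus the final 48-step loop give 24*30+48 = 768
+	// steps, matching the combined 768-bit size of the two operands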
+.Loop_is_square:
+	bl	__ab_approximation_30
+	sub	$cnt, $cnt, #1
+
+	eor	$out_ptr, $in_ptr, #128		// pointer to dst |b|
+	bl	__smul_384_n_shift_by_30
+
+	mov	$f1, $f0			// |f0|
+	mov	$g1, $g0			// |g0|
+	add	$out_ptr, $out_ptr, #8*6	// pointer to dst |a|
+	bl	__smul_384_n_shift_by_30
+
+	ldp	@acc[6], @acc[7], [$out_ptr,#-8*6]
+	eor	$in_ptr, $in_ptr, #128		// flip-flop src |a|b|
+	and	@t[6], @t[6], @acc[6]		// if |a| was negative,
+	add	$L, $L, @t[6], lsr#1		// adjust |L|
+
+	cbnz	$cnt, .Loop_is_square
+
+	////////////////////////////////////////// last iteration
+	//bl	__ab_approximation_30		// |a| and |b| are exact,
+	//ldr	$a_, [$in_ptr,#8*6]		// just load
+	mov	$b_, @acc[6]			// ldr	$b_, [$in_ptr,#8*0]
+	mov	$cnt, #48			// 48 is 768%30 + 30
+	bl	__inner_loop_48
+	ldr	x30, [x29,#8]
+
+	and	x0, $L, #1
+	eor	x0, x0, #1
+
+	add	sp, sp, #$frame
+	ldp	x19, x20, [x29,#16]
+	ldp	x21, x22, [x29,#32]
+	ldp	x23, x24, [x29,#48]
+	ldp	x25, x26, [x29,#64]
+	ldp	x27, x28, [x29,#80]
+	ldr	x29, [sp],#128
+	autiasp
+	ret
+.size	ct_is_square_mod_384,.-ct_is_square_mod_384
+
+.type	__smul_384_n_shift_by_30, %function
+.align	5
+__smul_384_n_shift_by_30:
+___
+for($j=0; $j<2; $j++) {
+my $fx = $g1;   $fx = $f1           if ($j);
+my @acc = @acc; @acc = @acc[6..11]  if ($j);
+my $k = 8*6*$j;
+$code.=<<___;
+	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |b| (or |a|)
+	asr	@t[6], $fx, #63		// |g1|'s sign as mask (or |f1|'s)
+	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
+	eor	$fx, $fx, @t[6]		// conditionally negate |g1| (or |f1|)
+	ldp	@acc[4], @acc[5], [$in_ptr,#8*4+$k]
+
+	eor	@acc[0], @acc[0], @t[6]	// conditionally negate |b| (or |a|)
+	sub	$fx, $fx, @t[6]
+	eor	@acc[1], @acc[1], @t[6]
+	adds	@acc[0], @acc[0], @t[6], lsr#63
+	eor	@acc[2], @acc[2], @t[6]
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], @t[6]
+	adcs	@acc[2], @acc[2], xzr
+	eor	@acc[4], @acc[4], @t[6]
+	 umulh	@t[0], @acc[0], $fx
+	adcs	@acc[3], @acc[3], xzr
+	 umulh	@t[1], @acc[1], $fx
+	eor	@acc[5], @acc[5], @t[6]
+	 umulh	@t[2], @acc[2], $fx
+	adcs	@acc[4], @acc[4], xzr
+	 umulh	@t[3], @acc[3], $fx
+	adc	@acc[5], @acc[5], xzr
+
+	umulh	@t[4], @acc[4], $fx
+	and	@t[7], $fx, @t[6]
+	umulh	@t[5+$j], @acc[5], $fx
+	neg	@t[7], @t[7]
+	mul	@acc[0], @acc[0], $fx
+	mul	@acc[1], @acc[1], $fx
+	mul	@acc[2], @acc[2], $fx
+	adds	@acc[1], @acc[1], @t[0]
+	mul	@acc[3], @acc[3], $fx
+	adcs	@acc[2], @acc[2], @t[1]
+	mul	@acc[4], @acc[4], $fx
+	adcs	@acc[3], @acc[3], @t[2]
+	mul	@acc[5], @acc[5], $fx
+	adcs	@acc[4], @acc[4], @t[3]
+	adcs	@acc[5], @acc[5] ,@t[4]
+	adc	@t[5+$j], @t[5+$j], @t[7]
+___
+}
+$code.=<<___;
+	adds	@acc[0], @acc[0], @acc[6]
+	adcs	@acc[1], @acc[1], @acc[7]
+	adcs	@acc[2], @acc[2], @acc[8]
+	adcs	@acc[3], @acc[3], @acc[9]
+	adcs	@acc[4], @acc[4], @acc[10]
+	adcs	@acc[5], @acc[5], @acc[11]
+	adc	@acc[6], @t[5],   @t[6]
+
+	extr	@acc[0], @acc[1], @acc[0], #30
+	extr	@acc[1], @acc[2], @acc[1], #30
+	extr	@acc[2], @acc[3], @acc[2], #30
+	asr	@t[6], @acc[6], #63
+	extr	@acc[3], @acc[4], @acc[3], #30
+	extr	@acc[4], @acc[5], @acc[4], #30
+	extr	@acc[5], @acc[6], @acc[5], #30
+
+	eor	@acc[0], @acc[0], @t[6]
+	eor	@acc[1], @acc[1], @t[6]
+	adds	@acc[0], @acc[0], @t[6], lsr#63
+	eor	@acc[2], @acc[2], @t[6]
+	adcs	@acc[1], @acc[1], xzr
+	eor	@acc[3], @acc[3], @t[6]
+	adcs	@acc[2], @acc[2], xzr
+	eor	@acc[4], @acc[4], @t[6]
+	adcs	@acc[3], @acc[3], xzr
+	eor	@acc[5], @acc[5], @t[6]
+	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
+	adcs	@acc[4], @acc[4], xzr
+	stp	@acc[2], @acc[3], [$out_ptr,#8*2]
+	adc	@acc[5], @acc[5], xzr
+	stp	@acc[4], @acc[5], [$out_ptr,#8*4]
+
+	ret
+.size	__smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
+___
+
+{
+my @a = @acc[0..5];
+my @b = @acc[6..11];
+my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]);
+
+$code.=<<___;
+.type	__ab_approximation_30, %function
+.align	4
+__ab_approximation_30:
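+	// Compose 64-bit approximations of |a| and |b|: the 32 least-significant
+	// bits combined with the 32 top-most non-zero bits, as in
+	// (a & mask) | ((a >> (n-w)) << w) from the Python reference above.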
+	ldp	@b[4], @b[5], [$in_ptr,#8*4]	// |a| is still in registers
+	ldp	@b[2], @b[3], [$in_ptr,#8*2]
+
+	orr	@t[0], @a[5], @b[5]	// check top-most limbs, ...
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[3], ne
+	orr	@t[0], @a[5], @b[5]	// ... ones before top-most, ...
+	csel	@b[4], @b[4], @b[3], ne
+
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[2], ne
+	orr	@t[0], @a[5], @b[5]	// ... and ones before that ...
+	csel	@b[4], @b[4], @b[2], ne
+
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[1], ne
+	orr	@t[0], @a[5], @b[5]	// and one more, ...
+	csel	@b[4], @b[4], @b[1], ne
+
+	cmp	@t[0], #0
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	csel	@a[4], @a[4], @a[0], ne
+	orr	@t[0], @a[5], @b[5]
+	csel	@b[4], @b[4], @b[0], ne
+
+	clz	@t[0], @t[0]
+	cmp	@t[0], #64
+	csel	@t[0], @t[0], xzr, ne
+	csel	@a[5], @a[5], @a[4], ne
+	csel	@b[5], @b[5], @b[4], ne
+	neg	@t[1], @t[0]
+
+	lslv	@a[5], @a[5], @t[0]	// align high limbs to the left
+	lslv	@b[5], @b[5], @t[0]
+	lsrv	@a[4], @a[4], @t[1]
+	lsrv	@b[4], @b[4], @t[1]
+	and	@a[4], @a[4], @t[1], asr#6
+	and	@b[4], @b[4], @t[1], asr#6
+	orr	$a_, @a[5], @a[4]
+	orr	$b_, @b[5], @b[4]
+
+	bfxil	$a_, @a[0], #0, #32
+	bfxil	$b_, @b[0], #0, #32
+
+	b	__inner_loop_30
+	ret
+.size	__ab_approximation_30,.-__ab_approximation_30
+
+.type	__inner_loop_30, %function
+.align	4
+__inner_loop_30:
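+	// |f0|,|g0| share one register (low/high 32-bit halves of |fg0|), as do
+	// |f1|,|g1| in |fg1|; each half carries a 0x7FFFFFFF bias so it stays
+	// non-negative and a single 64-bit add/sub updates both halves without
+	// carries crossing the boundary. The bias is stripped after the loop.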
+	mov	$cnt, #30
+	mov	$fg0, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
+	mov	$fg1, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
+	mov	$bias,#0x7FFFFFFF7FFFFFFF
+
+.Loop_30:
+	sbfx	@t[3], $a_, #0, #1	// if |a_| is odd, then we'll be subtracting
+	 and	@t[4], $a_, $b_
+	sub	$cnt, $cnt, #1
+	and	@t[0], $b_, @t[3]
+
+	sub	@t[1], $b_, $a_		// |b_|-|a_|
+	subs	@t[2], $a_, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	 add	@t[4], $L, @t[4], lsr#1	// L + (a_ & b_) >> 1
+	mov	@t[0], $fg1
+	csel	$b_, $b_, $a_, hs	// |b_| = |a_|
+	csel	$a_, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	$fg1, $fg1, $fg0,  hs	// exchange |fg0| and |fg1|
+	csel	$fg0, $fg0, @t[0], hs
+	 csel	$L,   $L,   @t[4], hs
+	lsr	$a_, $a_, #1
+	and	@t[0], $fg1, @t[3]
+	and	@t[1], $bias, @t[3]
+	 add	$t[2], $b_, #2
+	sub	$fg0, $fg0, @t[0]	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	add	$fg1, $fg1, $fg1	// |f1|<<=1
+	 add	$L, $L, $t[2], lsr#2	// "negate" |L| if |b|%8 is 3 or 5
+	add	$fg0, $fg0, @t[1]
+	sub	$fg1, $fg1, $bias
+
+	cbnz	$cnt, .Loop_30
+
+	mov	$bias, #0x7FFFFFFF
+	ubfx	$f0, $fg0, #0, #32
+	ubfx	$g0, $fg0, #32, #32
+	ubfx	$f1, $fg1, #0, #32
+	ubfx	$g1, $fg1, #32, #32
+	sub	$f0, $f0, $bias		// remove the bias
+	sub	$g0, $g0, $bias
+	sub	$f1, $f1, $bias
+	sub	$g1, $g1, $bias
+
+	ret
+.size	__inner_loop_30,.-__inner_loop_30
+___
+}
+
+$code.=<<___;
+.type	__inner_loop_48, %function
+.align	4
+__inner_loop_48:
+.Loop_48:
+	sbfx	@t[3], $a_, #0, #1	// if |a_| is odd, then we'll be subtracting
+	 and	@t[4], $a_, $b_
+	sub	$cnt, $cnt, #1
+	and	@t[0], $b_, @t[3]
+	sub	@t[1], $b_, $a_		// |b_|-|a_|
+	subs	@t[2], $a_, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	 add	@t[4], $L, @t[4], lsr#1
+	csel	$b_, $b_, $a_, hs	// |b_| = |a_|
+	csel	$a_, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	 csel	$L,   $L,   @t[4], hs
+	 add	$t[2], $b_, #2
+	lsr	$a_, $a_, #1
+	 add	$L, $L, $t[2], lsr#2	// "negate" |L| if |b|%8 is 3 or 5
+
+	cbnz	$cnt, .Loop_48
+
+	ret
+.size	__inner_loop_48,.-__inner_loop_48
+___
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/ct_is_square_mod_384-x86_64.pl b/blst/asm/ct_is_square_mod_384-x86_64.pl
new file mode 100755
index 0000000..40016ed
--- /dev/null
+++ b/blst/asm/ct_is_square_mod_384-x86_64.pl
@@ -0,0 +1,494 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast quadratic residue test as suggested in
+# https://eprint.iacr.org/2020/972. Performance is >5x better than
+# a modulus-specific Legendre symbol addition chain...
+#
+# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
+#
+$python_ref.=<<'___';
+def ct_is_square_mod_384(inp, mod):
+    a = inp
+    b = mod
+    L = 0   # only least significant bit, adding 1 makes up for sign change
+
+    k = 30
+    w = 32
+    mask = (1 << w) - 1
+
+    for i in range(0, 768 // k - 1):
+        # __ab_approximation_30
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-w)) << w)
+            b_ = (b & mask) | ((b >> (n-w)) << w)
+
+        # __inner_loop_30
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                    L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
+                                        # tell the whole story
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+            L += (b_ + 2) >> 2          # if |b|%8 is 3 or 5 [out of 1,3,5,7]
+
+        # __smulq_384_n_shift_by_30
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if b < 0:
+            b = -b
+        if a < 0:
+            a = -a
+            L += (b % 4) >> 1           # |b| is always odd, the second bit
+                                        # tells the whole story
+
+    if True:
+        for j in range(0, 768 % k + k):
+            if a & 1:
+                if a < b:
+                    a, b = b, a
+                    L += (a & b) >> 1   # |a| and |b| are both odd, second bits
+                                        # tell the whole story
+                a = a-b
+            a = a >> 1
+            L += (b + 2) >> 2           # if |b|%8 is 3 or 5 [out of 1,3,5,7]
+
+    return (L & 1) ^ 1
+___
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+my ($out_ptr, $in_ptr) = ("%rdi", "%rsi");
+my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx");
+my @acc=map("%r$_",(8..15));
+my $L = "%rbp";
+
+$frame = 8*3+2*256;
+
+$code.=<<___;
+.text
+
+.globl	ct_is_square_mod_384
+.type	ct_is_square_mod_384,\@function,2,"unwind"
+.align	32
+ct_is_square_mod_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	lea	8*3+255(%rsp), %rax	# find closest 256-byte-aligned spot
+	and	\$-256, %rax		# in the frame...
+
+	mov	8*0(%rdi), @acc[0]	# load input
+	mov	8*1(%rdi), @acc[1]
+	mov	8*2(%rdi), @acc[2]
+	mov	8*3(%rdi), @acc[3]
+	mov	8*4(%rdi), @acc[4]
+	mov	8*5(%rdi), @acc[5]
+
+	mov	8*0(%rsi), @acc[6]	# load modulus
+	mov	8*1(%rsi), @acc[7]
+	mov	8*2(%rsi), %rbx
+	mov	8*3(%rsi), %rcx
+	mov	8*4(%rsi), %rdx
+	mov	8*5(%rsi), %rdi
+	mov	%rax, $in_ptr		# pointer to source |a|b|
+
+	mov	@acc[0], 8*0(%rax)	# copy input to |a|
+	mov	@acc[1], 8*1(%rax)
+	mov	@acc[2], 8*2(%rax)
+	mov	@acc[3], 8*3(%rax)
+	mov	@acc[4], 8*4(%rax)
+	mov	@acc[5], 8*5(%rax)
+
+	mov	@acc[6], 8*6(%rax)	# copy modulus to |b|
+	mov	@acc[7], 8*7(%rax)
+	mov	%rbx,    8*8(%rax)
+	mov	%rcx,    8*9(%rax)
+	mov	%rdx,    8*10(%rax)
+	mov	%rdi,    8*11(%rax)
+
+	xor	$L, $L			# initialize the Legendre symbol
+	mov	\$24, %ecx		# 24 is 768/30-1
+	jmp	.Loop_is_square
+
+.align	32
+.Loop_is_square:
+	mov	%ecx, 8*2(%rsp)		# offload loop counter
+
+	call	__ab_approximation_30
+	mov	$f0, 8*0(%rsp)		# offload |f0| and |g0|
+	mov	$g0, 8*1(%rsp)
+
+	mov	\$128+8*6, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |b|
+	call	__smulq_384_n_shift_by_30
+
+	mov	8*0(%rsp), $f1		# pop |f0| and |g0|
+	mov	8*1(%rsp), $g1
+	lea	-8*6($out_ptr),$out_ptr	# pointer to destination |a|
+	call	__smulq_384_n_shift_by_30
+
+	mov	8*2(%rsp), %ecx		# re-load loop counter
+	xor	\$128, $in_ptr		# flip-flop pointer to source |a|b|
+
+	and	8*6($out_ptr), @acc[6]	# if |a| was negative, adjust |L|
+	shr	\$1, @acc[6]
+	add	@acc[6], $L
+
+	sub	\$1, %ecx
+	jnz	.Loop_is_square
+
+	################################# last iteration
+	#call	__ab_approximation_30	# |a| and |b| are exact, just load
+	#mov	8*0($in_ptr), @acc[0]	# |a_|
+	mov	8*6($in_ptr), @acc[1]	# |b_|
+	call	__inner_loop_48		# 48 is 768%30+30
+
+	mov	\$1, %rax
+	and	$L,  %rax
+	xor	\$1, %rax		# return value
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	ct_is_square_mod_384,.-ct_is_square_mod_384
+
+.type	__smulq_384_n_shift_by_30,\@abi-omnipotent
+.align	32
+__smulq_384_n_shift_by_30:
+___
+for($j=0; $j<2; $j++) {
+$code.=<<___;
+	mov	8*0($in_ptr), @acc[0]	# load |a| (or |b|)
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	%rdx, %rbx		# |f1| (or |g1|)
+	sar	\$63, %rdx		# |f1|'s sign as mask (or |g1|'s)
+	xor	%rax, %rax
+	sub	%rdx, %rax		# |f1|'s sign as bit (or |g1|'s)
+
+	xor	%rdx, %rbx		# conditionally negate |f1| (or |g1|)
+	add	%rax, %rbx
+
+	xor	%rdx, @acc[0]		# conditionally negate |a| (or |b|)
+	xor	%rdx, @acc[1]
+	xor	%rdx, @acc[2]
+	xor	%rdx, @acc[3]
+	xor	%rdx, @acc[4]
+	xor	%rdx, @acc[5]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mov	%rdx, @acc[6+$j]
+	and	%rbx, @acc[6+$j]
+	mulq	%rbx			# |a|*|f1| (or |b|*|g1|)
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<5; $i++) {
+$code.=<<___;
+	mulq	%rbx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___;
+	neg	@acc[6+$j]
+	mulq	%rbx
+	add	%rax, @acc[5]
+	adc	%rdx, @acc[6+$j]
+___
+$code.=<<___	if ($j==0);
+	lea	8*6($in_ptr), $in_ptr	# pointer to |b|
+	mov	$g1, %rdx
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+___
+}
+$code.=<<___;
+	lea	-8*6($in_ptr), $in_ptr	# restore original in_ptr
+
+	add	8*0($out_ptr), @acc[0]
+	adc	8*1($out_ptr), @acc[1]
+	adc	8*2($out_ptr), @acc[2]
+	adc	8*3($out_ptr), @acc[3]
+	adc	8*4($out_ptr), @acc[4]
+	adc	8*5($out_ptr), @acc[5]
+	adc	@acc[7],       @acc[6]
+
+	shrd	\$30, @acc[1], @acc[0]
+	shrd	\$30, @acc[2], @acc[1]
+	shrd	\$30, @acc[3], @acc[2]
+	shrd	\$30, @acc[4], @acc[3]
+	shrd	\$30, @acc[5], @acc[4]
+	shrd	\$30, @acc[6], @acc[5]
+
+	sar	\$63, @acc[6]		# sign as mask
+	xor	%rbx, %rbx
+	sub	@acc[6], %rbx		# sign as bit
+
+	xor	@acc[6], @acc[0]	# conditionally negate the result
+	xor	@acc[6], @acc[1]
+	xor	@acc[6], @acc[2]
+	xor	@acc[6], @acc[3]
+	xor	@acc[6], @acc[4]
+	xor	@acc[6], @acc[5]
+	add	%rbx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+
+	ret
+.size	__smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
+___
+{
+my ($a_, $b_) = @acc[0..1];
+my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15));
+my ($fg0, $fg1, $bias) = ($g0, $g1, $t5);
+my $cnt = "%edi";
+{
+my @a = @acc[0..5];
+my @b = (@a[1..3], $t4, $t5, $g0);
+
+$code.=<<___;
+.type	__ab_approximation_30,\@abi-omnipotent
+.align	32
+__ab_approximation_30:
+	mov	8*11($in_ptr), @b[5]	# load |b| in reverse order
+	mov	8*10($in_ptr), @b[4]
+	mov	8*9($in_ptr),  @b[3]
+
+	mov	@a[5], %rax
+	or	@b[5], %rax		# check top-most limbs, ...
+	cmovz	@a[4], @a[5]
+	cmovz	@b[4], @b[5]
+	cmovz	@a[3], @a[4]
+	mov	8*8($in_ptr), @b[2]
+	cmovz	@b[3], @b[4]
+
+	mov	@a[5], %rax
+	or	@b[5], %rax		# ... ones before top-most, ...
+	cmovz	@a[4], @a[5]
+	cmovz	@b[4], @b[5]
+	cmovz	@a[2], @a[4]
+	mov	8*7($in_ptr), @b[1]
+	cmovz	@b[2], @b[4]
+
+	mov	@a[5], %rax
+	or	@b[5], %rax		# ... and ones before that ...
+	cmovz	@a[4], @a[5]
+	cmovz	@b[4], @b[5]
+	cmovz	@a[1], @a[4]
+	mov	8*6($in_ptr), @b[0]
+	cmovz	@b[1], @b[4]
+
+	mov	@a[5], %rax
+	or	@b[5], %rax		# ... and ones before that ...
+	cmovz	@a[4], @a[5]
+	cmovz	@b[4], @b[5]
+	cmovz	@a[0], @a[4]
+	cmovz	@b[0], @b[4]
+
+	mov	@a[5], %rax
+	or	@b[5], %rax
+	bsr	%rax, %rcx
+	lea	1(%rcx), %rcx
+	cmovz	@a[0], @a[5]
+	cmovz	@b[0], @b[5]
+	cmovz	%rax, %rcx
+	neg	%rcx
+	#and	\$63, %rcx		# debugging artefact
+
+	shldq	%cl, @a[4], @a[5]	# align second limb to the left
+	shldq	%cl, @b[4], @b[5]
+
+	mov	\$0xFFFFFFFF00000000, %rax
+	mov	@a[0]d, ${a_}d
+	mov	@b[0]d, ${b_}d
+	and	%rax, @a[5]
+	and	%rax, @b[5]
+	or	@a[5], ${a_}
+	or	@b[5], ${b_}
+
+	jmp	__inner_loop_30
+
+	ret
+.size	__ab_approximation_30,.-__ab_approximation_30
+___
+}
+$code.=<<___;
+.type	__inner_loop_30,\@abi-omnipotent
+.align	32
+__inner_loop_30:		################# by Thomas Pornin
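+	# |f0|,|g0| and |f1|,|g1| are packed into the two halves of single
+	# registers, each half biased by 0x7FFFFFFF so one 64-bit add/sub
+	# updates both values at once (see "remove the bias" below)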
+	mov	\$0x7FFFFFFF80000000, $fg0	# |f0|=1, |g0|=0
+	mov	\$0x800000007FFFFFFF, $fg1	# |f1|=0, |g1|=1
+	lea	-1($fg0), $bias			# 0x7FFFFFFF7FFFFFFF
+	mov	\$30, $cnt
+
+.Loop_30:
+	 mov	$a_, %rax
+	 and	$b_, %rax
+	 shr	\$1, %rax		# (a_ & b_) >> 1
+
+	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
+	mov	$a_, $t0
+	mov	$b_, $t1
+	 lea	(%rax,$L), %rax		# pre-"negate" |L|
+	mov	$fg0, $t2
+	mov	$fg1, $t3
+	 mov	$L,   $t4
+	cmovb	$b_, $a_
+	cmovb	$t0, $b_
+	cmovb	$fg1, $fg0
+	cmovb	$t2, $fg1
+	 cmovb	%rax, $L
+
+	sub	$b_, $a_		# |a_|-|b_|
+	sub	$fg1, $fg0		# |f0|-|f1|, |g0|-|g1|
+	add	$bias, $fg0
+
+	test	\$1, $t0		# if |a_| was even, roll back 
+	cmovz	$t0, $a_
+	cmovz	$t1, $b_
+	cmovz	$t2, $fg0
+	cmovz	$t3, $fg1
+	cmovz	$t4, $L
+
+	 lea	2($b_), %rax
+	shr	\$1, $a_		# |a_|>>=1
+	 shr	\$2, %rax
+	add	$fg1, $fg1		# |f1|<<=1, |g1|<<=1
+	 lea	(%rax,$L), $L		# "negate" |L| if |b|%8 is 3 or 5
+	sub	$bias, $fg1
+
+	sub	\$1, $cnt
+	jnz	.Loop_30
+
+	shr	\$32, $bias
+	mov	%ebx, %eax		# $fg0 -> $f0
+	shr	\$32, $g0
+	mov	%ecx, %edx		# $fg1 -> $f1
+	shr	\$32, $g1
+	sub	$bias, $f0		# remove the bias
+	sub	$bias, $g0
+	sub	$bias, $f1
+	sub	$bias, $g1
+
+	ret
+.size	__inner_loop_30,.-__inner_loop_30
+
+.type	__inner_loop_48,\@abi-omnipotent
+.align	32
+__inner_loop_48:
+	mov	\$48, $cnt		# 48 is 768%30+30
+
+.Loop_48:
+	 mov	$a_, %rax
+	 and	$b_, %rax
+	 shr	\$1, %rax		# (a_ & b_) >> 1
+
+	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
+	mov	$a_, $t0
+	mov	$b_, $t1
+	 lea	(%rax,$L), %rax
+	 mov	$L,  $t2
+	cmovb	$b_, $a_
+	cmovb	$t0, $b_
+	 cmovb	%rax, $L
+
+	sub	$b_, $a_		# |a_|-|b_|
+
+	test	\$1, $t0		# if |a_| was even, roll back 
+	cmovz	$t0, $a_
+	cmovz	$t1, $b_
+	cmovz	$t2, $L
+
+	 lea	2($b_), %rax
+	shr	\$1, $a_		# |a_|>>=1
+	 shr	\$2, %rax
+	 add	%rax, $L		# "negate" |L| if |b|%8 is 3 or 5
+
+	sub	\$1, $cnt
+	jnz	.Loop_48
+
+	ret
+.size	__inner_loop_48,.-__inner_loop_48
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/ctq_inverse_mod_384-x86_64.pl b/blst/asm/ctq_inverse_mod_384-x86_64.pl
new file mode 100755
index 0000000..2be39d8
--- /dev/null
+++ b/blst/asm/ctq_inverse_mod_384-x86_64.pl
@@ -0,0 +1,886 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. Performance is >5x better than
+# a modulus-specific FLT addition chain...
+#
+# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_383(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 62
+    w = 64
+    mask = (1 << w) - 1
+
+    for i in range(0, 766 // k):
+        # __ab_approximation_62
+        n = max(a.bit_length(), b.bit_length())
+        if n < 128:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-w)) << w)
+            b_ = (b & mask) | ((b >> (n-w)) << w)
+
+        # __inner_loop_62
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smulq_383_n_shift_by_62
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smulq_767x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 766 % k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 766 % k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    if v < 0:
+        v += mod << (768 - mod.bit_length())    # left aligned
+
+    return v & (2**768 - 1) # to be reduced % mod
+___
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
+my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
+my $cnt = "%edi";
+
+$frame = 8*11+2*512;
+
+$code.=<<___;
+.text
+
+.globl	ct_inverse_mod_383
+.type	ct_inverse_mod_383,\@function,4,"unwind"
+.align	32
+ct_inverse_mod_383:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	lea	8*11+511(%rsp), %rax	# find closest 512-byte-aligned spot
+	and	\$-512, %rax		# in the frame...
+	mov	$out_ptr, 8*4(%rsp)
+	mov	$nx_ptr, 8*5(%rsp)
+
+	mov	8*0($in_ptr), @acc[0]	# load input
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	8*0($n_ptr), @acc[6]	# load modulus
+	mov	8*1($n_ptr), @acc[7]
+	mov	8*2($n_ptr), @acc[8]
+	mov	8*3($n_ptr), @acc[9]
+	mov	8*4($n_ptr), @acc[10]
+	mov	8*5($n_ptr), @acc[11]
+
+	mov	@acc[0], 8*0(%rax)	# copy input to |a|
+	mov	@acc[1], 8*1(%rax)
+	mov	@acc[2], 8*2(%rax)
+	mov	@acc[3], 8*3(%rax)
+	mov	@acc[4], 8*4(%rax)
+	mov	@acc[5], 8*5(%rax)
+
+	mov	@acc[6], 8*6(%rax)	# copy modulus to |b|
+	mov	@acc[7], 8*7(%rax)
+	mov	@acc[8], 8*8(%rax)
+	mov	@acc[9], 8*9(%rax)
+	mov	@acc[10], 8*10(%rax)
+	mov	%rax, $in_ptr		# pointer to source |a|b|1|0|
+	mov	@acc[11], 8*11(%rax)
+
+	################################# first iteration
+	mov	\$62, $cnt
+	call	__ab_approximation_62
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulq_383_n_shift_by_62
+	#mov	$f0, 8*7(%rsp)		# corrected |f0|
+	#mov	$g0, 8*8(%rsp)		# corrected |g0|
+	mov	$f0, 8*12($out_ptr)	# initialize |u| with |f0|
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulq_383_n_shift_by_62
+	#mov	$f0, 8*9(%rsp)		# corrected |f1|
+	#mov	$g0, 8*10(%rsp)		# corrected |g1|
+	mov	$f0, 8*12($out_ptr)	# initialize |v| with |f1|
+
+	################################# second iteration
+	xor	\$256, $in_ptr		# flip-flop pointer to source |a|b|u|v|
+	mov	\$62, $cnt
+	call	__ab_approximation_62
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulq_383_n_shift_by_62
+	mov	$f0, 8*7(%rsp)		# corrected |f0|
+	mov	$g0, 8*8(%rsp)		# corrected |g0|
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulq_383_n_shift_by_62
+	#mov	$f0, 8*9(%rsp)		# corrected |f1|
+	#mov	$g0, 8*10(%rsp)		# corrected |g1|
+
+	mov	8*12($in_ptr), %rax	# |u|
+	mov	8*18($in_ptr), @acc[3]	# |v|
+	mov	$f0, %rbx
+	mov	%rax, @acc[2]
+	imulq	8*7(%rsp)		# |u|*|f0|
+	mov	%rax, @acc[0]
+	mov	@acc[3], %rax
+	mov	%rdx, @acc[1]
+	imulq	8*8(%rsp)		# |v|*|g0|
+	add	%rax, @acc[0]
+	adc	%rdx, @acc[1]
+	mov	@acc[0], 8*6($out_ptr)	# destination |u|
+	mov	@acc[1], 8*7($out_ptr)
+	sar	\$63, @acc[1]		# sign extension
+	mov	@acc[1], 8*8($out_ptr)
+	mov	@acc[1], 8*9($out_ptr)
+	mov	@acc[1], 8*10($out_ptr)
+	mov	@acc[1], 8*11($out_ptr)
+	lea	8*12($in_ptr),$in_ptr	# make in_ptr "rewindable" with xor
+
+	mov	@acc[2], %rax
+	imulq	%rbx			# |u|*|f1|
+	mov	%rax, @acc[0]
+	mov	@acc[3], %rax
+	mov	%rdx, @acc[1]
+	imulq	%rcx			# |v|*|g1|
+	add	%rax, @acc[0]
+	adc	%rdx, @acc[1]
+	mov	@acc[0], 8*12($out_ptr)	# destination |v|
+	mov	@acc[1], 8*13($out_ptr)
+	sar	\$63, @acc[1]		# sign extension
+	mov	@acc[1], 8*14($out_ptr)
+	mov	@acc[1], 8*15($out_ptr)
+	mov	@acc[1], 8*16($out_ptr)
+	mov	@acc[1], 8*17($out_ptr)
+___
+for($i=2; $i<11; $i++) {
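+# |u| and |v| grow by roughly 62 bits per iteration while |a| and |b| shrink;
+# once |v| can exceed 383 bits ($i>5) the wider __smulq_767x63 is used, and
+# the $i==5 block below first sign-extends |v| to the full 768-bit width.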
+my $smul_767x63  = $i>5 ? "__smulq_767x63"
+                        : "__smulq_383x63";
+$code.=<<___;
+	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$62, $cnt
+	call	__ab_approximation_62
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulq_383_n_shift_by_62
+	mov	$f0, 8*7(%rsp)		# corrected |f0|
+	mov	$g0, 8*8(%rsp)		# corrected |g0|
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulq_383_n_shift_by_62
+	mov	$f0, 8*9(%rsp)		# corrected |f1|
+	mov	$g0, 8*10(%rsp)		# corrected |g1|
+
+	mov	8*7(%rsp), $f0		# |f0|
+	mov	8*8(%rsp), $g0		# |g0|
+	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
+	call	__smulq_383x63
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr),$out_ptr	# pointer to destination |v|
+	call	$smul_767x63
+___
+$code.=<<___	if ($i==5);
+	sar	\$63, @acc[5]		# sign extension
+	mov	@acc[5], 8*6($out_ptr)
+	mov	@acc[5], 8*7($out_ptr)
+	mov	@acc[5], 8*8($out_ptr)
+	mov	@acc[5], 8*9($out_ptr)
+	mov	@acc[5], 8*10($out_ptr)
+	mov	@acc[5], 8*11($out_ptr)
+___
+}
+$code.=<<___;
+	################################# iteration before last
+	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$62, $cnt
+	#call	__ab_approximation_62	# |a| and |b| are exact, just load
+	mov	8*0($in_ptr), @acc[0]	# |a_lo|
+	mov	8*1($in_ptr), @acc[1]	# |a_hi|
+	mov	8*6($in_ptr), @acc[2]	# |b_lo|
+	mov	8*7($in_ptr), @acc[3]	# |b_hi|
+	call	__inner_loop_62
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[2], 8*6($out_ptr)
+
+	#mov	8*7(%rsp), $f0		# |f0|
+	#mov	8*8(%rsp), $g0		# |g0|
+	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
+	lea	8*12($out_ptr),$out_ptr	# pointer to destination |u|
+	call	__smulq_383x63
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr),$out_ptr	# pointer to destination |v|
+	call	__smulq_767x63
+
+	################################# last iteration
+	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$22, $cnt		# 766 % 62
+	#call	__ab_approximation_62	# |a| and |b| are exact, just load
+	mov	8*0($in_ptr), @acc[0]	# |a_lo|
+	xor	@acc[1],      @acc[1]	# |a_hi|
+	mov	8*6($in_ptr), @acc[2]	# |b_lo|
+	xor	@acc[3],   @acc[3]	# |b_hi|
+	call	__inner_loop_62
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	#mov	$f1, 8*9(%rsp)
+	#mov	$g1, 8*10(%rsp)
+
+	#mov	8*7(%rsp), $f0		# |f0|
+	#mov	8*8(%rsp), $g0		# |g0|
+	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
+	#lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
+	#call	__smulq_383x63
+
+	#mov	8*9(%rsp), $f0		# |f1|
+	#mov	8*10(%rsp), $g0		# |g1|
+	mov	$f1, $f0
+	mov	$g1, $g0
+	mov	8*4(%rsp), $out_ptr	# original out_ptr
+	call	__smulq_767x63
+
+	mov	8*5(%rsp), $in_ptr	# original n_ptr
+	mov	%rax, %rdx		# top limb of the result
+	sar	\$63, %rax		# result's sign as mask
+
+	mov	%rax, @acc[0]		# mask |modulus|
+	mov	%rax, @acc[1]
+	mov	%rax, @acc[2]
+	and	8*0($in_ptr), @acc[0]
+	and	8*1($in_ptr), @acc[1]
+	mov	%rax, @acc[3]
+	and	8*2($in_ptr), @acc[2]
+	and	8*3($in_ptr), @acc[3]
+	mov	%rax, @acc[4]
+	and	8*4($in_ptr), @acc[4]
+	and	8*5($in_ptr), %rax
+
+	add	@acc[0], @acc[6]	# conditionally add |modulus|<<384
+	adc	@acc[1], @acc[7]
+	adc	@acc[2], @acc[8]
+	adc	@acc[3], @acc[9]
+	adc	@acc[4], %rcx
+	adc	%rax,    %rdx
+
+	mov	@acc[6], 8*6($out_ptr)	# store absolute value
+	mov	@acc[7], 8*7($out_ptr)
+	mov	@acc[8], 8*8($out_ptr)
+	mov	@acc[9], 8*9($out_ptr)
+	mov	%rcx,    8*10($out_ptr)
+	mov	%rdx,    8*11($out_ptr)
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	ct_inverse_mod_383,.-ct_inverse_mod_383
+___
+########################################################################
+# see corresponding commentary in ctx_inverse_mod_384-x86_64...
+{
+my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
+my $fx = @acc[9];
+
+$code.=<<___;
+.type	__smulq_767x63,\@abi-omnipotent
+.align	32
+__smulq_767x63:
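+	# computes |u|*|f0| + |v|*|g0| with 6-limb |u| and 12-limb |v|;
+	# the 12-limb signed result overwrites the first 12 limbs at the
+	# output pointer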
+	mov	8*0($in_ptr), @acc[0]	# load |u|
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	$f0, $fx
+	sar	\$63, $f0		# |f0|'s sign as mask
+	xor	%rax, %rax
+	sub	$f0, %rax		# |f0|'s sign as bit
+
+	mov	$out_ptr, 8*1(%rsp)
+	mov	$in_ptr, 8*2(%rsp)
+	lea	8*6($in_ptr), $in_ptr	# pointer to |v|
+
+	xor	$f0, $fx		# conditionally negate |f0|
+	add	%rax, $fx
+
+	xor	$f0, @acc[0]		# conditionally negate |u|
+	xor	$f0, @acc[1]
+	xor	$f0, @acc[2]
+	xor	$f0, @acc[3]
+	xor	$f0, @acc[4]
+	xor	$f0, @acc[5]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mulq	$fx			# |u|*|f0|
+	mov	%rax, 8*0($out_ptr)	# offload |u|*|f0|
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<5; $i++) {
+$code.=<<___;
+	mulq	$fx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+	mov	@acc[$i], 8*$i($out_ptr)
+___
+}
+$code.=<<___;
+	imulq	$fx
+	add	%rax, @acc[$i]
+	adc	\$0, %rdx
+
+	mov	@acc[5], 8*5($out_ptr)
+	mov	%rdx, 8*6($out_ptr)
+	sar	\$63, %rdx		# sign extension
+	mov	%rdx, 8*7($out_ptr)
+___
+{
+my $fx=$in_ptr;
+$code.=<<___;
+	mov	$g0, $f0		# load |g0|
+
+	mov	8*0($in_ptr), @acc[0]	# load |v|
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+	mov	8*6($in_ptr), @acc[6]
+	mov	8*7($in_ptr), @acc[7]
+	mov	8*8($in_ptr), @acc[8]
+	mov	8*9($in_ptr), @acc[9]
+	mov	8*10($in_ptr), @acc[10]
+	mov	8*11($in_ptr), @acc[11]
+
+	mov	$f0, $fx		# overrides in_ptr
+	sar	\$63, $f0		# |g0|'s sign as mask
+	xor	%rax, %rax
+	sub	$f0, %rax		# |g0|'s sign as bit
+
+	xor	$f0, $fx		# conditionally negate |g0|
+	add	%rax, $fx
+
+	xor	$f0, @acc[0]		# conditionally negate |v|
+	xor	$f0, @acc[1]
+	xor	$f0, @acc[2]
+	xor	$f0, @acc[3]
+	xor	$f0, @acc[4]
+	xor	$f0, @acc[5]
+	xor	$f0, @acc[6]
+	xor	$f0, @acc[7]
+	xor	$f0, @acc[8]
+	xor	$f0, @acc[9]
+	xor	$f0, @acc[10]
+	xor	$f0, @acc[11]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+	adc	\$0, @acc[6]
+	adc	\$0, @acc[7]
+	adc	\$0, @acc[8]
+	adc	\$0, @acc[9]
+	adc	\$0, @acc[10]
+	adc	\$0, @acc[11]
+
+	mulq	$fx			# |v|*|g0|
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<11; $i++) {
+$code.=<<___;
+	mulq	$fx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___;
+	mov	8*1(%rsp), %rdx		# out_ptr
+	imulq	$fx, %rax
+	mov	8*2(%rsp), $in_ptr	# restore original in_ptr
+	add	@acc[11], %rax
+
+	add	8*0(%rdx), @acc[0]	# accumulate |u|*|f0|
+	adc	8*1(%rdx), @acc[1]
+	adc	8*2(%rdx), @acc[2]
+	adc	8*3(%rdx), @acc[3]
+	adc	8*4(%rdx), @acc[4]
+	adc	8*5(%rdx), @acc[5]
+	adc	8*6(%rdx), @acc[6]
+	mov	8*7(%rdx), @acc[11]	# sign extension
+	adc	@acc[11], @acc[7]
+	adc	@acc[11], @acc[8]
+	adc	@acc[11], @acc[9]
+	adc	@acc[11], @acc[10]
+	adc	@acc[11], %rax
+
+	mov	%rdx, $out_ptr		# restore original out_ptr
+
+	mov	@acc[0], 8*0(%rdx)
+	mov	@acc[1], 8*1(%rdx)
+	mov	@acc[2], 8*2(%rdx)
+	mov	@acc[3], 8*3(%rdx)
+	mov	@acc[4], 8*4(%rdx)
+	mov	@acc[5], 8*5(%rdx)
+	mov	@acc[6], 8*6(%rdx)
+	mov	@acc[7], 8*7(%rdx)
+	mov	@acc[8], 8*8(%rdx)
+	mov	@acc[9], 8*9(%rdx)
+	mov	@acc[10], 8*10(%rdx)
+	mov	%rax,     8*11(%rdx)
+
+	ret
+.size	__smulq_767x63,.-__smulq_767x63
+___
+}
+$code.=<<___;
+.type	__smulq_383x63,\@abi-omnipotent
+.align	32
+__smulq_383x63:
+___
+for($j=0; $j<2; $j++) {
+$code.=<<___;
+	mov	8*0($in_ptr), @acc[0]	# load |u| (or |v|)
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	%rdx, $fx
+	sar	\$63, %rdx		# |f0|'s sign as mask (or |g0|'s)
+	xor	%rax, %rax
+	sub	%rdx, %rax		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	%rdx, $fx		# conditionally negate |f0|
+	add	%rax, $fx
+
+	xor	%rdx, @acc[0]		# conditionally negate |u| (or |v|)
+	xor	%rdx, @acc[1]
+	xor	%rdx, @acc[2]
+	xor	%rdx, @acc[3]
+	xor	%rdx, @acc[4]
+	xor	%rdx, @acc[5]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mulq	$fx			# |u|*|f0| (or |v|*|g0|)
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<5; $i++) {
+$code.=<<___;
+	mulq	$fx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___	if ($j==0);
+	imulq	$fx, %rax
+	add	%rax, @acc[$i]
+
+	lea	8*6($in_ptr), $in_ptr	# pointer to |v|
+	mov	$g0, %rdx
+
+	mov	@acc[0], 8*0($out_ptr)	# offload |u|*|f0|
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+___
+}
+$code.=<<___;
+	imulq	$fx, %rax
+	add	%rax, @acc[$i]
+
+	lea	-8*6($in_ptr), $in_ptr	# restore original in_ptr
+
+	add	8*0($out_ptr), @acc[0]	# accumulate |u|*|f0|
+	adc	8*1($out_ptr), @acc[1]
+	adc	8*2($out_ptr), @acc[2]
+	adc	8*3($out_ptr), @acc[3]
+	adc	8*4($out_ptr), @acc[4]
+	adc	8*5($out_ptr), @acc[5]
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+
+	ret
+.size	__smulq_383x63,.-__smulq_383x63
+___
+{
+$code.=<<___;
+.type	__smulq_383_n_shift_by_62,\@abi-omnipotent
+.align	32
+__smulq_383_n_shift_by_62:
+	mov	$f0, @acc[8]
+___
+my $f0 = @acc[8];
+for($j=0; $j<2; $j++) {
+$code.=<<___;
+	mov	8*0($in_ptr), @acc[0]	# load |a| (or |b|)
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	%rdx, $fx
+	sar	\$63, %rdx		# |f0|'s sign as mask (or |g0|'s)
+	xor	%rax, %rax
+	sub	%rdx, %rax		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	%rdx, $fx		# conditionally negate |f0| (or |g0|)
+	add	%rax, $fx
+
+	xor	%rdx, @acc[0]		# conditionally negate |a| (or |b|)
+	xor	%rdx, @acc[1]
+	xor	%rdx, @acc[2]
+	xor	%rdx, @acc[3]
+	xor	%rdx, @acc[4]
+	xor	%rdx, @acc[5]
+	add	@acc[0], %rax
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mulq	$fx			# |a|*|f0| (or |b|*|g0|)
+	mov	%rax, @acc[0]
+	mov	@acc[1], %rax
+	mov	%rdx, @acc[1]
+___
+for($i=1; $i<5; $i++) {
+$code.=<<___;
+	mulq	$fx
+	add	%rax, @acc[$i]
+	mov	@acc[$i+1], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[$i+1]
+___
+}
+$code.=<<___	if ($j==0);
+	imulq	$fx
+	add	%rax, @acc[$i]
+	adc	\$0, %rdx
+
+	lea	8*6($in_ptr), $in_ptr	# pointer to |b|
+	mov	%rdx, @acc[6]
+	mov	$g0, %rdx
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+___
+}
+$code.=<<___;
+	imulq	$fx
+	add	%rax, @acc[$i]
+	adc	\$0, %rdx
+
+	lea	-8*6($in_ptr), $in_ptr	# restore original in_ptr
+
+	add	8*0($out_ptr), @acc[0]
+	adc	8*1($out_ptr), @acc[1]
+	adc	8*2($out_ptr), @acc[2]
+	adc	8*3($out_ptr), @acc[3]
+	adc	8*4($out_ptr), @acc[4]
+	adc	8*5($out_ptr), @acc[5]
+	adc	%rdx,          @acc[6]
+	mov	$f0, %rdx
+
+	shrd	\$62, @acc[1], @acc[0]
+	shrd	\$62, @acc[2], @acc[1]
+	shrd	\$62, @acc[3], @acc[2]
+	shrd	\$62, @acc[4], @acc[3]
+	shrd	\$62, @acc[5], @acc[4]
+	shrd	\$62, @acc[6], @acc[5]
+
+	sar	\$63, @acc[6]		# sign as mask
+	xor	$fx, $fx
+	sub	@acc[6], $fx		# sign as bit
+
+	xor	@acc[6], @acc[0]	# conditionally negate the result
+	xor	@acc[6], @acc[1]
+	xor	@acc[6], @acc[2]
+	xor	@acc[6], @acc[3]
+	xor	@acc[6], @acc[4]
+	xor	@acc[6], @acc[5]
+	add	$fx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+
+	xor	@acc[6], %rdx		# conditionally negate |f0|
+	xor	@acc[6], $g0		# conditionally negate |g0|
+	add	$fx, %rdx
+	add	$fx, $g0
+
+	ret
+.size	__smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62
+___
+} }
+
+{
+my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
+my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi");
+{
+my @a = ($a_lo, $t1, $a_hi);
+my @b = ($b_lo, $t2, $b_hi);
+
+$code.=<<___;
+.type	__ab_approximation_62,\@abi-omnipotent
+.align	32
+__ab_approximation_62:
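+	# build 128-bit approximations of |a| and |b|: the least-significant limb
+	# stays exact and the 64 top-most non-zero bits are left-aligned into the
+	# high word, cf. (a & mask) | ((a >> (n-w)) << w) with w=64 in the Python
+	# reference above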
+	mov	8*5($in_ptr), @a[2]	# load |a| in reverse order
+	mov	8*11($in_ptr), @b[2]	# load |b| in reverse order
+	mov	8*4($in_ptr), @a[1]
+	mov	8*10($in_ptr), @b[1]
+	mov	8*3($in_ptr), @a[0]
+	mov	8*9($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# check top-most limbs, ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	cmovz	@b[0], @b[1]
+	mov	8*2($in_ptr), @a[0]
+	mov	8*8($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# ... ones before top-most, ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	cmovz	@b[0], @b[1]
+	mov	8*1($in_ptr), @a[0]
+	mov	8*7($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# ... and ones before that ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	cmovz	@b[0], @b[1]
+	mov	8*0($in_ptr), @a[0]
+	mov	8*6($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0
+	bsr	$t0, %rcx
+	lea	1(%rcx), %rcx
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	$t0, %rcx
+	neg	%rcx
+	#and	\$63, %rcx		# debugging artefact
+
+	shldq	%cl, @a[1], @a[2]	# align second limb to the left
+	shldq	%cl, @b[1], @b[2]
+
+	jmp	__inner_loop_62
+
+	ret
+.size	__ab_approximation_62,.-__ab_approximation_62
+___
+}
+$code.=<<___;
+.type	__inner_loop_62,\@abi-omnipotent
+.align	8
+.long	0
+__inner_loop_62:
+	mov	\$1, $f0	# |f0|=1
+	xor	$g0, $g0	# |g0|=0
+	xor	$f1, $f1	# |f1|=0
+	mov	\$1, $g1	# |g1|=1
+	mov	$in_ptr, 8(%rsp)
+
+.Loop_62:
+	xor	$t0, $t0
+	xor	$t1, $t1
+	test	\$1, $a_lo	# if |a_| is odd, then we'll be subtracting |b_|
+	mov	$b_lo, $t2
+	mov	$b_hi, $t3
+	cmovnz	$b_lo, $t0
+	cmovnz	$b_hi, $t1
+	sub	$a_lo, $t2	# |b_|-|a_|
+	sbb	$a_hi, $t3
+	mov	$a_lo, $t4
+	mov	$a_hi, $t5
+	sub	$t0, $a_lo	# |a_|-|b_| (or |a_|-0 if |a_| was even)
+	sbb	$t1, $a_hi
+	cmovc	$t2, $a_lo	# borrow means |a_|<|b_|, replace with |b_|-|a_|
+	cmovc	$t3, $a_hi
+	cmovc	$t4, $b_lo	# |b_| = |a_|
+	cmovc	$t5, $b_hi
+	mov	$f0, $t0	# exchange |f0| and |f1|
+	cmovc	$f1, $f0
+	cmovc	$t0, $f1
+	mov	$g0, $t1	# exchange |g0| and |g1|
+	cmovc	$g1, $g0
+	cmovc	$t1, $g1
+	xor	$t0, $t0
+	xor	$t1, $t1
+	shrd	\$1, $a_hi, $a_lo
+	shr	\$1, $a_hi
+	test	\$1, $t4	# if |a_| was odd, then we'll be subtracting...
+	cmovnz	$f1, $t0
+	cmovnz	$g1, $t1
+	add	$f1, $f1	# |f1|<<=1
+	add	$g1, $g1	# |g1|<<=1
+	sub	$t0, $f0	# |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	$t1, $g0	# |g0|-=|g1| (or |g0-=0| ...)
+	sub	\$1, $cnt
+	jnz	.Loop_62
+
+	mov	8(%rsp), $in_ptr
+	ret
+.size	__inner_loop_62,.-__inner_loop_62
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/ctx_inverse_mod_384-x86_64.pl b/blst/asm/ctx_inverse_mod_384-x86_64.pl
new file mode 100755
index 0000000..d207e2f
--- /dev/null
+++ b/blst/asm/ctx_inverse_mod_384-x86_64.pl
@@ -0,0 +1,995 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. Performance is >4x better than
+# a modulus-specific FLT addition chain...
+#
+# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_383(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 31
+    mask = (1 << k) - 1
+
+    for i in range(0, 766 // k):
+        # __ab_approximation_31
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-k-2)) << k)
+            b_ = (b & mask) | ((b >> (n-k-2)) << k)
+
+        # __inner_loop_31
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smulx_383_n_shift_by_31
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smulx_767x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 766 % k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 766 % k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    if v < 0:
+        v += mod << (768 - mod.bit_length())    # left aligned
+
+    return v & (2**768 - 1) # to be reduced % mod
+___
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
+my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
+my $cnt = "%edi";
+
+$frame = 8*11+2*512;
+
+$code.=<<___;
+.text
+
+.globl	ctx_inverse_mod_383
+.type	ctx_inverse_mod_383,\@function,4,"unwind"
+.align	32
+ctx_inverse_mod_383:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	lea	8*11+511(%rsp), %rax	# find closest 512-byte-aligned spot
+	and	\$-512, %rax		# in the frame...
+	mov	$out_ptr, 8*4(%rsp)
+	mov	$nx_ptr, 8*5(%rsp)
+
+	mov	8*0($in_ptr), @acc[0]	# load input
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	8*0($n_ptr), @acc[6]	# load modulus
+	mov	8*1($n_ptr), @acc[7]
+	mov	8*2($n_ptr), @acc[8]
+	mov	8*3($n_ptr), @acc[9]
+	mov	8*4($n_ptr), @acc[10]
+	mov	8*5($n_ptr), @acc[11]
+
+	mov	@acc[0], 8*0(%rax)	# copy input to |a|
+	mov	@acc[1], 8*1(%rax)
+	mov	@acc[2], 8*2(%rax)
+	mov	@acc[3], 8*3(%rax)
+	mov	@acc[4], 8*4(%rax)
+	mov	@acc[5], 8*5(%rax)
+
+	mov	@acc[6], 8*6(%rax)	# copy modulus to |b|
+	mov	@acc[7], 8*7(%rax)
+	mov	@acc[8], 8*8(%rax)
+	mov	@acc[9], 8*9(%rax)
+	mov	@acc[10], 8*10(%rax)
+	mov	%rax, $in_ptr
+	mov	@acc[11], 8*11(%rax)
+
+	################################# first iteration
+	mov	\$31, $cnt
+	call	__ab_approximation_31
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulx_383_n_shift_by_31
+	#mov	$f0, 8*7(%rsp)		# corrected |f0|
+	#mov	$g0, 8*8(%rsp)		# corrected |g0|
+	mov	$f0, 8*12($out_ptr)	# initialize |u| with |f0|
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulx_383_n_shift_by_31
+	#mov	$f0, 8*9(%rsp)		# corrected |f1|
+	#mov	$g0, 8*10(%rsp)		# corrected |g1|
+	mov	$f0, 8*12($out_ptr)	# initialize |v| with |f1|
+
+	################################# second iteration
+	xor	\$256, $in_ptr		# flip-flop pointer to source |a|b|u|v|
+	mov	\$31, $cnt
+	call	__ab_approximation_31
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	__smulx_383_n_shift_by_31
+	mov	$f0, 8*7(%rsp)		# corrected |f0|
+	mov	$g0, 8*8(%rsp)		# corrected |g0|
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
+	call	__smulx_383_n_shift_by_31
+	#mov	$f0, 8*9(%rsp)		# corrected |f1|
+	#mov	$g0, 8*10(%rsp)		# corrected |g1|
+
+	mov	8*12($in_ptr), %rax	# |u|
+	mov	8*18($in_ptr), @acc[3]	# |v|
+	mov	$f0, %rbx
+	mov	%rax, @acc[2]
+	imulq	8*7(%rsp)		# |u|*|f0|
+	mov	%rax, @acc[0]
+	mov	@acc[3], %rax
+	mov	%rdx, @acc[1]
+	imulq	8*8(%rsp)		# |v|*|g0|
+	add	%rax, @acc[0]
+	adc	%rdx, @acc[1]
+	mov	@acc[0], 8*6($out_ptr)	# destination |u|
+	mov	@acc[1], 8*7($out_ptr)
+	sar	\$63, @acc[1]		# sign extension
+	mov	@acc[1], 8*8($out_ptr)
+	mov	@acc[1], 8*9($out_ptr)
+	mov	@acc[1], 8*10($out_ptr)
+	mov	@acc[1], 8*11($out_ptr)
+	lea	8*12($in_ptr), $in_ptr	# make in_ptr "rewindable" with xor
+
+	mov	@acc[2], %rax
+	imulq	%rbx			# |u|*|f1|
+	mov	%rax, @acc[0]
+	mov	@acc[3], %rax
+	mov	%rdx, @acc[1]
+	imulq	%rcx			# |v|*|g1|
+	add	%rax, @acc[0]
+	adc	%rdx, @acc[1]
+	mov	@acc[0], 8*12($out_ptr)	# destination |v|
+	mov	@acc[1], 8*13($out_ptr)
+	sar	\$63, @acc[1]		# sign extension
+	mov	@acc[1], 8*14($out_ptr)
+	mov	@acc[1], 8*15($out_ptr)
+	mov	@acc[1], 8*16($out_ptr)
+	mov	@acc[1], 8*17($out_ptr)
+___
+for($i=2; $i<23; $i++) {
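+# as the iterations progress |a|,|b| shrink (~31 bits per pass) while |u|,|v|
+# grow: the narrower __smulx_191_n_shift_by_31 takes over for $i>=19, and the
+# wider __smulx_767x63 for $i>11 (the $i==11 block sign-extends |v| first).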
+my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31"
+                         : "__smulx_191_n_shift_by_31";
+my $smul_767x63  = $i>11 ? "__smulx_767x63"
+                         : "__smulx_383x63";
+$code.=<<___;
+	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$31, $cnt
+	call	__ab_approximation_31
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	mov	$f1, 8*9(%rsp)
+	mov	$g1, 8*10(%rsp)
+
+	mov	\$256, $out_ptr
+	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
+	call	$smul_n_shift
+	mov	$f0, 8*7(%rsp)		# corrected |f0|
+	mov	$g0, 8*8(%rsp)		# corrected |g0|
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
+	call	$smul_n_shift
+	mov	$f0, 8*9(%rsp)		# corrected |f1|
+	mov	$g0, 8*10(%rsp)		# corrected |g1|
+
+	mov	8*7(%rsp), $f0		# |f0|
+	mov	8*8(%rsp), $g0		# |g0|
+	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
+	lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
+	call	__smulx_383x63
+
+	mov	8*9(%rsp), $f0		# |f1|
+	mov	8*10(%rsp), $g0		# |g1|
+	lea	8*6($out_ptr),$out_ptr	# pointer to destination |v|
+	call	$smul_767x63
+___
+$code.=<<___	if ($i==11);
+	sar	\$63, @acc[5]		# sign extension
+	mov	@acc[5], 8*6($out_ptr)
+	mov	@acc[5], 8*7($out_ptr)
+	mov	@acc[5], 8*8($out_ptr)
+	mov	@acc[5], 8*9($out_ptr)
+	mov	@acc[5], 8*10($out_ptr)
+	mov	@acc[5], 8*11($out_ptr)
+___
+}
+$code.=<<___;
+	################################# two[!] last iterations in one go
+	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
+	mov	\$53, $cnt		# 31 + 766 % 31
+	#call	__ab_approximation_31	# |a| and |b| are exact, just load
+	mov	8*0($in_ptr), @acc[0]	# |a_lo|
+	#xor	@acc[1],      @acc[1]	# |a_hi|
+	mov	8*6($in_ptr), @acc[2]	# |b_lo|
+	#xor	@acc[3],      @acc[3]	# |b_hi|
+	call	__inner_loop_62
+	#mov	$f0, 8*7(%rsp)
+	#mov	$g0, 8*8(%rsp)
+	#mov	$f1, 8*9(%rsp)
+	#mov	$g1, 8*10(%rsp)
+
+	#mov	8*7(%rsp), $f0		# |f0|
+	#mov	8*8(%rsp), $g0		# |g0|
+	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
+	#lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
+	#call	__smulx_383x63
+
+	#mov	8*9(%rsp), $f0		# |f1|
+	#mov	8*10(%rsp), $g0		# |g1|
+	mov	$f1, $f0
+	mov	$g1, $g0
+	mov	8*4(%rsp), $out_ptr	# original out_ptr
+	call	__smulx_767x63
+
+	mov	8*5(%rsp), $in_ptr	# original n_ptr
+	mov	%rax, %rdx		# top limb of the result
+	sar	\$63, %rax		# result's sign as mask
+
+	mov	%rax, @acc[0]		# mask |modulus|
+	mov	%rax, @acc[1]
+	mov	%rax, @acc[2]
+	and	8*0($in_ptr), @acc[0]
+	and	8*1($in_ptr), @acc[1]
+	mov	%rax, @acc[3]
+	and	8*2($in_ptr), @acc[2]
+	and	8*3($in_ptr), @acc[3]
+	mov	%rax, @acc[4]
+	and	8*4($in_ptr), @acc[4]
+	and	8*5($in_ptr), %rax
+
+	add	@acc[0], @acc[6]	# conditionally add |modulus|<<384
+	adc	@acc[1], @acc[7]
+	adc	@acc[2], @acc[8]
+	adc	@acc[3], @acc[9]
+	adc	@acc[4], %rcx
+	adc	%rax,    %rdx
+
+	mov	@acc[6], 8*6($out_ptr)	# store absolute value
+	mov	@acc[7], 8*7($out_ptr)
+	mov	@acc[8], 8*8($out_ptr)
+	mov	@acc[9], 8*9($out_ptr)
+	mov	%rcx,    8*10($out_ptr)
+	mov	%rdx,    8*11($out_ptr)
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	ctx_inverse_mod_383,.-ctx_inverse_mod_383
+___
+########################################################################
+# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
+# to the maximum bit-length of the *result*, and "63" - to the maximum
+# bit-length of the |f?| and |g?| single-limb multiplicands. However!
+# The latter should not be taken literally, as they are always chosen so
+# that "bad things" don't happen. For example, there comes a point when
+# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
+# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
+# because past that point |f0| is always 1 and |g0| is always 0. And,
+# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
+# perform full-width |u|*|f1| multiplication, half-width one with sign
+# extension is sufficient...
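+# A rough C sketch (illustration only, not used by the build) of the
+# |u|*|f0|+|v|*|g0| step described above.  As in the assembly, the signs of
+# the single-limb factors are handled by conditionally negating both the
+# factor and the multi-limb operand, so the core multiply-accumulate stays
+# unsigned; the result is taken mod 2^384, which the caller's choice of
+# factors keeps exact.  The limb_t/llimb_t types and LIMB_BITS are assumed
+# to match the library's C code (uint64_t/__uint128_t/64 on x86_64), and
+# smul_383x63_ref itself is a hypothetical name.
+$c_ref=<<'___';
+static void smul_383x63_ref(limb_t ret[6], const limb_t u[6], limb_t f0,
+                                           const limb_t v[6], limb_t g0)
+{
+    limb_t a[6], b[6], mask_f, mask_g;
+    llimb_t carry, acc;
+    size_t i;
+
+    mask_f = (limb_t)0 - (f0 >> (LIMB_BITS-1)); /* f0<0 ? all ones : 0 */
+    mask_g = (limb_t)0 - (g0 >> (LIMB_BITS-1)); /* g0<0 ? all ones : 0 */
+    f0 = (f0 ^ mask_f) - mask_f;                /* |f0| */
+    g0 = (g0 ^ mask_g) - mask_g;                /* |g0| */
+
+    for (carry = mask_f & 1, i = 0; i < 6; i++) {   /* a = f0<0 ? -u : u */
+        carry += u[i] ^ mask_f;
+        a[i] = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    for (carry = mask_g & 1, i = 0; i < 6; i++) {   /* b = g0<0 ? -v : v */
+        carry += v[i] ^ mask_g;
+        b[i] = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+
+    for (acc = 0, i = 0; i < 6; i++) {  /* unsigned multiply-accumulate,   */
+        acc += (llimb_t)a[i] * f0 + (llimb_t)b[i] * g0; /* mulx/adc style  */
+        ret[i] = (limb_t)acc;
+        acc >>= LIMB_BITS;
+    }
+}
+___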
+{
+my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
+my $fx = @acc[9];
+
+$code.=<<___;
+.type	__smulx_767x63,\@abi-omnipotent
+.align	32
+__smulx_767x63:
+	mov	8*0($in_ptr), @acc[0]	# load |u|
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+
+	mov	$f0, %rax
+	sar	\$63, %rax		# |f0|'s sign as mask
+	xor	$fx, $fx		# overrides in_ptr
+	sub	%rax, $fx		# |f0|'s sign as bit
+
+	mov	$out_ptr, 8*1(%rsp)
+	mov	$in_ptr,  8*2(%rsp)
+	lea	8*6($in_ptr), $in_ptr	# pointer to |v|
+
+	xor	%rax, $f0		# conditionally negate |f0|
+	add	$fx, $f0
+
+	xor	%rax, @acc[0]		# conditionally negate |u|
+	xor	%rax, @acc[1]
+	xor	%rax, @acc[2]
+	xor	%rax, @acc[3]
+	xor	%rax, @acc[4]
+	xor	@acc[5], %rax
+	add	$fx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, %rax
+
+	mulx	@acc[0], @acc[0], $fx	# |u|*|f0|
+	mulx	@acc[1], @acc[1], @acc[5]
+	add	$fx, @acc[1]
+___
+for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
+$code.=<<___;
+	mulx	@acc[$i], @acc[$i], $a
+	adc	$b, @acc[$i]
+___
+    ($a, $b) = ($b, $a);
+}
+$code.=<<___;
+	adc	\$0, $fx
+	imulq	%rdx
+	add	$fx, %rax
+	adc	\$0, %rdx
+
+	mov	@acc[0], 8*0($out_ptr)	# offload |u|*|f0|
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	%rax,    8*5($out_ptr)
+	mov	%rdx,    8*6($out_ptr)
+	sar	\$63, %rdx		# sign extension
+	mov	%rdx, 8*7($out_ptr)
+___
+{
+my $fx=$in_ptr;
+$code.=<<___;
+	mov	$g0, $f0		# load |g0|
+	mov	$g0, %rax
+
+	mov	8*0($in_ptr), @acc[0]	# load |v|
+	mov	8*1($in_ptr), @acc[1]
+	mov	8*2($in_ptr), @acc[2]
+	mov	8*3($in_ptr), @acc[3]
+	mov	8*4($in_ptr), @acc[4]
+	mov	8*5($in_ptr), @acc[5]
+	mov	8*6($in_ptr), @acc[6]
+	mov	8*7($in_ptr), @acc[7]
+	mov	8*8($in_ptr), @acc[8]
+	mov	8*9($in_ptr), @acc[9]
+	mov	8*10($in_ptr), @acc[10]
+	mov	8*11($in_ptr), @acc[11]
+
+	sar	\$63, %rax		# |g0|'s sign as mask
+	xor	$fx, $fx		# overrides in_ptr
+	sub	%rax, $fx		# |g0|'s sign as bit
+
+	xor	%rax, $f0		# conditionally negate |g0|
+	add	$fx, $f0
+
+	xor	%rax, @acc[0]		# conditionally negate |v|
+	xor	%rax, @acc[1]
+	xor	%rax, @acc[2]
+	xor	%rax, @acc[3]
+	xor	%rax, @acc[4]
+	xor	%rax, @acc[5]
+	xor	%rax, @acc[6]
+	xor	%rax, @acc[7]
+	xor	%rax, @acc[8]
+	xor	%rax, @acc[9]
+	xor	%rax, @acc[10]
+	xor	%rax, @acc[11]
+	add	$fx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+	adc	\$0, @acc[6]
+	adc	\$0, @acc[7]
+	adc	\$0, @acc[8]
+	adc	\$0, @acc[9]
+	adc	\$0, @acc[10]
+	adc	\$0, @acc[11]
+
+	mulx	@acc[0], @acc[0], %rax	# |v|*|g0|
+	mulx	@acc[1], @acc[1], $fx
+	add	%rax, @acc[1]
+___
+for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) {
+$code.=<<___;
+	mulx	@acc[$i], @acc[$i], $a
+	adc	$b, @acc[$i]
+___
+    ($a, $b) = ($b, $a);
+}
+$code.=<<___;
+	mulx	@acc[11], @acc[11], $fx
+	mov	8*1(%rsp), %rdx		# out_ptr
+	mov	8*2(%rsp), $in_ptr	# restore original in_ptr
+	adc	@acc[11], %rax
+
+	add	8*0(%rdx), @acc[0]	# accumulate |u|*|f0|
+	adc	8*1(%rdx), @acc[1]
+	adc	8*2(%rdx), @acc[2]
+	adc	8*3(%rdx), @acc[3]
+	adc	8*4(%rdx), @acc[4]
+	adc	8*5(%rdx), @acc[5]
+	adc	8*6(%rdx), @acc[6]
+	mov	8*7(%rdx), @acc[11]	# sign extension
+	adc	@acc[11], @acc[7]
+	adc	@acc[11], @acc[8]
+	adc	@acc[11], @acc[9]
+	adc	@acc[11], @acc[10]
+	adc	@acc[11], %rax
+
+	mov	%rdx, $out_ptr		# restore original out_ptr
+
+	mov	@acc[0], 8*0(%rdx)
+	mov	@acc[1], 8*1(%rdx)
+	mov	@acc[2], 8*2(%rdx)
+	mov	@acc[3], 8*3(%rdx)
+	mov	@acc[4], 8*4(%rdx)
+	mov	@acc[5], 8*5(%rdx)
+	mov	@acc[6], 8*6(%rdx)
+	mov	@acc[7], 8*7(%rdx)
+	mov	@acc[8], 8*8(%rdx)
+	mov	@acc[9], 8*9(%rdx)
+	mov	@acc[10], 8*10(%rdx)
+	mov	%rax,     8*11(%rdx)
+
+	ret
+.size	__smulx_767x63,.-__smulx_767x63
+___
+}
+$code.=<<___;
+.type	__smulx_383x63,\@abi-omnipotent
+.align	32
+__smulx_383x63:
+___
+for($j=0; $j<2; $j++) {
+my $k = 8*6*$j;
+$code.=<<___;
+	mov	$k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
+	mov	$k+8*1($in_ptr), @acc[1]
+	mov	$k+8*2($in_ptr), @acc[2]
+	mov	$k+8*3($in_ptr), @acc[3]
+	mov	$k+8*4($in_ptr), @acc[4]
+	mov	$k+8*5($in_ptr), @acc[5]
+
+	mov	$f0, $fx
+	sar	\$63, $fx		# |f0|'s sign as mask (or |g0|'s)
+	xor	%rax, %rax
+	sub	$fx, %rax		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	$fx, $f0		# conditionally negate |f0|
+	add	%rax, $f0
+
+	xor	$fx, @acc[0]		# conditionally negate |u| (or |v|)
+	xor	$fx, @acc[1]
+	xor	$fx, @acc[2]
+	xor	$fx, @acc[3]
+	xor	$fx, @acc[4]
+	xor	$fx, @acc[5]
+	add	%rax, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mulx	@acc[0], @acc[0], $fx	# |u|*|f0| (or |v|*|g0|)
+	mulx	@acc[1], @acc[1], %rax
+	add	$fx, @acc[1]
+___
+for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) {
+$code.=<<___;
+	mulx	@acc[$i], @acc[$i], $a
+	adc	$b, @acc[$i]
+___
+    ($a, $b) = ($b, $a);
+}
+$code.=<<___	if ($j==0);
+	mulx	@acc[$i], @acc[$i], %rax
+	mov	$g0, $f0
+	adc	$fx, @acc[$i]
+
+	mov	@acc[0], 8*0($out_ptr)	# offload |u|*|f0|
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+___
+}
+$code.=<<___;
+	mulx	@acc[$i], @acc[$i], %rax
+	adc	$fx, @acc[$i]
+
+	add	8*0($out_ptr), @acc[0]	# accumulate |u|*|f0|
+	adc	8*1($out_ptr), @acc[1]
+	adc	8*2($out_ptr), @acc[2]
+	adc	8*3($out_ptr), @acc[3]
+	adc	8*4($out_ptr), @acc[4]
+	adc	8*5($out_ptr), @acc[5]
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	@acc[5], 8*5($out_ptr)
+
+	ret
+.size	__smulx_383x63,.-__smulx_383x63
+___
+########################################################################
+# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
+# the names refers to maximum bit-lengths of |a| and |b|. As already
+# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
+# chosen so that "bad things" don't happen. For example, so that the
+# sum of the products doesn't overflow, and that the final result is
+# never wider than inputs...
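+# A rough single-limb C model (illustration only) of what the *_n_shift_by_31
+# subroutines compute: the absolute value of (a*f0 + b*g0)>>31, with |f0| and
+# |g0| negated in place whenever the intermediate value was negative, so that
+# the subsequent |u|/|v| update uses factors of the matching sign.  The real
+# code operates on 383/191-bit multi-limb |a| and |b|; uint64_t/__int128
+# (a GCC/Clang extension) merely stand in for them here, and the function
+# name is hypothetical.
+$c_ref.=<<'___';
+static uint64_t smul_n_shift_model(uint64_t a, int64_t *f0,
+                                   uint64_t b, int64_t *g0)
+{
+    /* the factors are always small enough for the sum to fit, as the
+     * commentary above explains */
+    __int128 t = (__int128)a * *f0 + (__int128)b * *g0;
+
+    t >>= 31;           /* arithmetic right shift, as in the assembly */
+    if (t < 0) {        /* take the absolute value ...                */
+        t = -t;
+        *f0 = -*f0;     /* ... and record the flip in |f0| and |g0|   */
+        *g0 = -*g0;
+    }
+    return (uint64_t)t; /* truncated stand-in for the wide result     */
+}
+___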
+{
+$code.=<<___;
+.type	__smulx_383_n_shift_by_31,\@abi-omnipotent
+.align	32
+__smulx_383_n_shift_by_31:
+	mov	$f0, @acc[8]
+	xor	@acc[6], @acc[6]
+___
+my $f0 = @acc[8];
+for($j=0; $j<2; $j++) {
+my $k = 8*6*$j;
+$code.=<<___;
+	mov	$k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
+	mov	$k+8*1($in_ptr), @acc[1]
+	mov	$k+8*2($in_ptr), @acc[2]
+	mov	$k+8*3($in_ptr), @acc[3]
+	mov	$k+8*4($in_ptr), @acc[4]
+	mov	$k+8*5($in_ptr), @acc[5]
+
+	mov	%rdx, %rax
+	sar	\$63, %rax		# |f0|'s sign as mask (or |g0|'s)
+	xor	$fx, $fx
+	sub	%rax, $fx		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	%rax, %rdx		# conditionally negate |f0| (or |g0|)
+	add	$fx, %rdx
+
+	xor	%rax, @acc[0]		# conditionally negate |a| (or |b|)
+	xor	%rax, @acc[1]
+	xor	%rax, @acc[2]
+	xor	%rax, @acc[3]
+	xor	%rax, @acc[4]
+	xor	@acc[5], %rax
+	add	$fx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, %rax
+
+	mulx	@acc[0], @acc[0], $fx	# |a|*|f0| (or |b|*|g0|)
+	mulx	@acc[1], @acc[1], @acc[5]
+	add	$fx, @acc[1]
+___
+for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
+$code.=<<___;
+	mulx	@acc[$i], @acc[$i], $a
+	adc	$b, @acc[$i]
+___
+    ($a, $b) = ($b, $a);
+}
+$code.=<<___	if ($j==0);
+	adc	\$0, $fx
+	imulq	%rdx
+	add	$fx, %rax
+	adc	%rdx, @acc[6]
+
+	mov	$g0, %rdx
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	%rax,    8*5($out_ptr)
+___
+}
+$code.=<<___;
+	adc	\$0, $fx
+	imulq	%rdx
+	add	$fx, %rax
+	adc	\$0, %rdx
+
+	add	8*0($out_ptr), @acc[0]
+	adc	8*1($out_ptr), @acc[1]
+	adc	8*2($out_ptr), @acc[2]
+	adc	8*3($out_ptr), @acc[3]
+	adc	8*4($out_ptr), @acc[4]
+	adc	8*5($out_ptr), %rax
+	adc	%rdx,          @acc[6]
+	mov	$f0, %rdx
+
+	shrd	\$31, @acc[1], @acc[0]
+	shrd	\$31, @acc[2], @acc[1]
+	shrd	\$31, @acc[3], @acc[2]
+	shrd	\$31, @acc[4], @acc[3]
+	shrd	\$31, %rax,    @acc[4]
+	shrd	\$31, @acc[6], %rax
+
+	sar	\$63, @acc[6]		# sign as mask
+	xor	$fx, $fx
+	sub	@acc[6], $fx		# sign as bit
+
+	xor	@acc[6], @acc[0]	# conditionally negate the result
+	xor	@acc[6], @acc[1]
+	xor	@acc[6], @acc[2]
+	xor	@acc[6], @acc[3]
+	xor	@acc[6], @acc[4]
+	xor	@acc[6], %rax
+	add	$fx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, @acc[2]
+	adc	\$0, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, %rax
+
+	mov	@acc[0], 8*0($out_ptr)
+	mov	@acc[1], 8*1($out_ptr)
+	mov	@acc[2], 8*2($out_ptr)
+	mov	@acc[3], 8*3($out_ptr)
+	mov	@acc[4], 8*4($out_ptr)
+	mov	%rax,    8*5($out_ptr)
+
+	xor	@acc[6], %rdx		# conditionally negate |f0|
+	xor	@acc[6], $g0		# conditionally negate |g0|
+	add	$fx, %rdx
+	add	$fx, $g0
+
+	ret
+.size	__smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31
+___
+} {
+$code.=<<___;
+.type	__smulx_191_n_shift_by_31,\@abi-omnipotent
+.align	32
+__smulx_191_n_shift_by_31:
+	mov	$f0, @acc[8]
+___
+my $f0 = @acc[8];
+for($j=0; $j<2; $j++) {
+my $k = 8*6*$j;
+my @acc=@acc;
+   @acc=@acc[3..5] if ($j);
+$code.=<<___;
+	mov	$k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
+	mov	$k+8*1($in_ptr), @acc[1]
+	mov	$k+8*2($in_ptr), @acc[2]
+
+	mov	%rdx, %rax
+	sar	\$63, %rax		# |f0|'s sign as mask (or |g0|'s)
+	xor	$fx, $fx
+	sub	%rax, $fx		# |f0|'s sign as bit (or |g0|'s)
+
+	xor	%rax, %rdx		# conditionally negate |f0| (or |g0|)
+	add	$fx, %rdx
+
+	xor	%rax, @acc[0]		# conditionally negate |a| (or |b|)
+	xor	%rax, @acc[1]
+	xor	@acc[2], %rax
+	add	$fx, @acc[0]
+	adc	\$0, @acc[1]
+	adc	\$0, %rax
+
+	mulx	@acc[0], @acc[0], $fx	# |a|*|f0| (or |b|*|g0|)
+	mulx	@acc[1], @acc[1], @acc[2]
+	add	$fx, @acc[1]
+	adc	\$0, @acc[2]
+	imulq	%rdx
+	add	%rax, @acc[2]
+	adc	\$0, %rdx
+___
+$code.=<<___	if ($j==0);
+	mov	%rdx, @acc[6]
+	mov	$g0, %rdx
+___
+}
+$code.=<<___;
+	add	@acc[0], @acc[3]
+	adc	@acc[1], @acc[4]
+	adc	@acc[2], @acc[5]
+	adc	%rdx,    @acc[6]
+	mov	$f0, %rdx
+
+	shrd	\$31, @acc[4], @acc[3]
+	shrd	\$31, @acc[5], @acc[4]
+	shrd	\$31, @acc[6], @acc[5]
+
+	sar	\$63, @acc[6]		# sign as mask
+	xor	$fx, $fx
+	sub	@acc[6], $fx		# sign as bit
+
+	xor	@acc[6], @acc[3]	# conditionally negate the result
+	xor	@acc[6], @acc[4]
+	xor	@acc[6], @acc[5]
+	add	$fx, @acc[3]
+	adc	\$0, @acc[4]
+	adc	\$0, @acc[5]
+
+	mov	@acc[3], 8*0($out_ptr)
+	mov	@acc[4], 8*1($out_ptr)
+	mov	@acc[5], 8*2($out_ptr)
+
+	xor	@acc[6], %rdx		# conditionally negate |f0|
+	xor	@acc[6], $g0		# conditionally negate |g0|
+	add	$fx, %rdx
+	add	$fx, $g0
+
+	ret
+.size	__smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31
+___
+} }
+
+{
+my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
+my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
+my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
+my ($a_, $b_) = ($a_lo, $b_lo);
+{
+my @a = ($a_lo, $t1, $a_hi);
+my @b = ($b_lo, $t2, $b_hi);
+
+$code.=<<___;
+.type	__ab_approximation_31,\@abi-omnipotent
+.align	32
+__ab_approximation_31:
+	mov	8*5($in_ptr), @a[2]	# load |a| in reverse order
+	mov	8*11($in_ptr), @b[2]	# load |b| in reverse order
+	mov	8*4($in_ptr), @a[1]
+	mov	8*10($in_ptr), @b[1]
+	mov	8*3($in_ptr), @a[0]
+	mov	8*9($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# check top-most limbs, ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	mov	8*2($in_ptr), @a[0]
+	cmovz	@b[0], @b[1]
+	mov	8*8($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# ... ones before top-most, ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	mov	8*1($in_ptr), @a[0]
+	cmovz	@b[0], @b[1]
+	mov	8*7($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# ... and ones before that ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	mov	8*0($in_ptr), @a[0]
+	cmovz	@b[0], @b[1]
+	mov	8*6($in_ptr), @b[0]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0		# ... and ones before that ...
+	cmovz	@a[1], @a[2]
+	cmovz	@b[1], @b[2]
+	cmovz	@a[0], @a[1]
+	cmovz	@b[0], @b[1]
+
+	mov	@a[2], $t0
+	or	@b[2], $t0
+	bsr	$t0, %rcx
+	lea	1(%rcx), %rcx
+	cmovz	@a[0], @a[2]
+	cmovz	@b[0], @b[2]
+	cmovz	$t0, %rcx
+	neg	%rcx
+	#and	\$63, %rcx		# debugging artefact
+
+	shldq	%cl, @a[1], @a[2]	# align second limb to the left
+	shldq	%cl, @b[1], @b[2]
+
+	mov	\$0x7FFFFFFF, %eax
+	and	%rax, @a[0]
+	and	%rax, @b[0]
+	andn	@a[2], %rax, @a[2]
+	andn	@b[2], %rax, @b[2]
+	or	@a[2], @a[0]
+	or	@b[2], @b[0]
+
+	jmp	__inner_loop_31
+
+	ret
+.size	__ab_approximation_31,.-__ab_approximation_31
+___
+}
+$code.=<<___;
+.type	__inner_loop_31,\@abi-omnipotent
+.align	32
+__inner_loop_31:		################# by Thomas Pornin
+	mov	\$0x7FFFFFFF80000000, $fg0	# |f0|=1, |g0|=0
+	mov	\$0x800000007FFFFFFF, $fg1	# |f1|=0, |g1|=1
+	mov	\$0x7FFFFFFF7FFFFFFF, $bias
+
+.Loop_31:
+	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
+	mov	$a_, $t0
+	mov	$b_, $t1
+	mov	$fg0, $t2
+	mov	$fg1, $t3
+	cmovb	$b_, $a_
+	cmovb	$t0, $b_
+	cmovb	$fg1, $fg0
+	cmovb	$t2, $fg1
+
+	sub	$b_, $a_		# |a_|-|b_|
+	sub	$fg1, $fg0		# |f0|-|f1|, |g0|-|g1|
+	add	$bias, $fg0
+
+	test	\$1, $t0		# if |a_| was even, roll back 
+	cmovz	$t0, $a_
+	cmovz	$t1, $b_
+	cmovz	$t2, $fg0
+	cmovz	$t3, $fg1
+
+	shr	\$1, $a_		# |a_|>>=1
+	add	$fg1, $fg1		# |f1|<<=1, |g1|<<=1
+	sub	$bias, $fg1
+	sub	\$1, $cnt
+	jnz	.Loop_31
+
+	shr	\$32, $bias
+	mov	%ecx, %edx		# $fg0, $f0
+	mov	${fg1}d, ${f1}d
+	shr	\$32, $g0
+	shr	\$32, $g1
+	sub	$bias, $f0		# remove the bias
+	sub	$bias, $g0
+	sub	$bias, $f1
+	sub	$bias, $g1
+
+	ret
+.size	__inner_loop_31,.-__inner_loop_31
+
+.type	__inner_loop_62,\@abi-omnipotent
+.align	32
+__inner_loop_62:
+	mov	\$1, $f0	# |f0|=1
+	xor	$g0, $g0	# |g0|=0
+	xor	$f1, $f1	# |f1|=0
+	mov	\$1, $g1	# |g1|=1
+
+.Loop_62:
+	xor	$t0, $t0
+	test	\$1, $a_lo	# if |a_| is odd, then we'll be subtracting |b_|
+	mov	$b_lo, $t1
+	cmovnz	$b_lo, $t0
+	sub	$a_lo, $t1	# |b_|-|a_|
+	mov	$a_lo, $t2
+	sub	$t0, $a_lo	# |a_|-|b_| (or |a_|-0 if |a_| was even)
+	cmovc	$t1, $a_lo	# borrow means |a_|<|b_|, replace with |b_|-|a_|
+	cmovc	$t2, $b_lo	# |b_| = |a_|
+	mov	$f0, $t0	# exchange |f0| and |f1|
+	cmovc	$f1, $f0
+	cmovc	$t0, $f1
+	mov	$g0, $t1	# exchange |g0| and |g1|
+	cmovc	$g1, $g0
+	cmovc	$t1, $g1
+	xor	$t0, $t0
+	xor	$t1, $t1
+	shr	\$1, $a_lo
+	test	\$1, $t2	# if |a_| was odd, then we'll be subtracting...
+	cmovnz	$f1, $t0
+	cmovnz	$g1, $t1
+	add	$f1, $f1	# |f1|<<=1
+	add	$g1, $g1	# |g1|<<=1
+	sub	$t0, $f0	# |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	$t1, $g0	# |g0|-=|g1| (or |g0-=0| ...)
+	sub	\$1, $cnt
+	jnz	.Loop_62
+
+	ret
+.size	__inner_loop_62,.-__inner_loop_62
+___
+}
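+# A rough C sketch (illustration only) of the __inner_loop_62 logic above:
+# |cnt| iterations of the binary-GCD-style step on the low limbs of |a| and
+# |b|, accumulating transition factors such that the full-width update is
+# a' = (a*f0 + b*g0)>>cnt and b' = (a*f1 + b*g1)>>cnt.  __inner_loop_31
+# produces the same factors, but keeps each (f,g) pair packed with a bias in
+# a single register.  Unlike the branchless cmov-based assembly, this sketch
+# uses plain branches and is not constant-time; the name is hypothetical.
+$c_ref.=<<'___';
+static void inner_loop_62_ref(uint64_t a_lo, uint64_t b_lo, size_t cnt,
+                              int64_t fg[4])    /* f0, g0, f1, g1 */
+{
+    int64_t f0 = 1, g0 = 0, f1 = 0, g1 = 1, t;
+    uint64_t u;
+
+    while (cnt--) {
+        if (a_lo & 1) {             /* |a| odd: subtract the smaller value */
+            if (a_lo < b_lo) {      /* swap (a,f0,g0) with (b,f1,g1)       */
+                u = a_lo; a_lo = b_lo; b_lo = u;
+                t = f0; f0 = f1; f1 = t;
+                t = g0; g0 = g1; g1 = t;
+            }
+            a_lo -= b_lo;
+            f0 -= f1;
+            g0 -= g1;
+        }
+        a_lo >>= 1;                 /* |a| is even now, halve it ...       */
+        f1 <<= 1;                   /* ... and compensate in |b|'s factors */
+        g1 <<= 1;
+    }
+
+    fg[0] = f0; fg[1] = g0; fg[2] = f1; fg[3] = g1;
+}
+___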
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/div3w-armv8.pl b/blst/asm/div3w-armv8.pl
new file mode 100755
index 0000000..bfa3245
--- /dev/null
+++ b/blst/asm/div3w-armv8.pl
@@ -0,0 +1,122 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+$code.=<<___;
+.text
+
+.globl	div_3_limbs
+.type	div_3_limbs,%function
+.align	5
+div_3_limbs:
+	ldp	x4,x5,[x0]	// load R
+	eor	x0,x0,x0	// Q = 0
+	mov	x3,#64		// loop counter
+	nop
+
+.Loop:
+	subs	x6,x4,x1	// R - D
+	add	x0,x0,x0	// Q <<= 1
+	sbcs	x7,x5,x2
+	add	x0,x0,#1	// Q + speculative bit
+	csel	x4,x4,x6,lo	// select between R and R - D
+	 extr	x1,x2,x1,#1	// D >>= 1
+	csel	x5,x5,x7,lo
+	 lsr	x2,x2,#1
+	sbc	x0,x0,xzr	// subtract speculative bit
+	sub	x3,x3,#1
+	cbnz	x3,.Loop
+
+	asr	x3,x0,#63	// top bit -> mask
+	add	x0,x0,x0	// Q <<= 1
+	subs	x6,x4,x1	// R - D
+	add	x0,x0,#1	// Q + speculative bit
+	sbcs	x7,x5,x2
+	sbc	x0,x0,xzr	// subtract speculative bit
+
+	orr	x0,x0,x3	// all ones if overflow
+
+	ret
+.size	div_3_limbs,.-div_3_limbs
+___
+{
+my ($div_rem, $divisor, $quot) = map("x$_",(0..2));
+my @div = map("x$_",(3..4));
+my @acc = map("x$_",(5..7));
+my @t = map("x$_",(8..11));
+
+$code.=<<___;
+.globl	quot_rem_128
+.type	quot_rem_128,%function
+.align	5
+quot_rem_128:
+	ldp	@div[0],@div[1],[$divisor]
+
+	mul	@acc[0],@div[0],$quot	// divisor[0:1] * quotient
+	umulh	@acc[1],@div[0],$quot
+	mul	@t[3],  @div[1],$quot
+	umulh	@acc[2],@div[1],$quot
+
+	ldp	@t[0],@t[1],[$div_rem]	// load 3 limbs of the dividend
+	ldr	@t[2],[$div_rem,#16]
+
+	adds	@acc[1],@acc[1],@t[3]
+	adc	@acc[2],@acc[2],xzr
+
+	subs	@t[0],@t[0],@acc[0]	// dividend - divisor * quotient
+	sbcs	@t[1],@t[1],@acc[1]
+	sbcs	@t[2],@t[2],@acc[2]
+	sbc	@acc[0],xzr,xzr		// borrow -> mask
+
+	add	$quot,$quot,@acc[0]	// if borrowed, adjust the quotient ...
+	and	@div[0],@div[0],@acc[0]
+	and	@div[1],@div[1],@acc[0]
+	adds	@t[0],@t[0],@div[0]	// ... and add divisor
+	adc	@t[1],@t[1],@div[1]
+
+	stp	@t[0],@t[1],[$div_rem]	// save 2 limbs of the remainder
+	str	$quot,[$div_rem,#16]	// and one limb of the quotient
+
+	mov	x0,$quot		// return adjusted quotient
+
+	ret
+.size	quot_rem_128,.-quot_rem_128
+
+.globl	quot_rem_64
+.type	quot_rem_64,%function
+.align	5
+quot_rem_64:
+	ldr	@div[0],[$divisor]
+	ldr	@t[0],[$div_rem]	// load 1 limb of the dividend
+
+	mul	@acc[0],@div[0],$quot	// divisor * quotient
+
+	sub	@t[0],@t[0],@acc[0]	// dividend - divisor * quotient
+
+	stp	@t[0],$quot,[$div_rem]	// save remainder and quotient
+
+	mov	x0,$quot		// return quotient
+
+	ret
+.size	quot_rem_64,.-quot_rem_64
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/div3w-x86_64.pl b/blst/asm/div3w-x86_64.pl
new file mode 100755
index 0000000..b8192db
--- /dev/null
+++ b/blst/asm/div3w-x86_64.pl
@@ -0,0 +1,184 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+$c_ref=<<'___';
+/*
+ * |div_top| points at two most significant limbs of the dividend, |d_hi|
+ * and |d_lo| are two most significant limbs of the divisor. If divisor
+ * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
+ * The divisor is required to be "bitwise left-aligned," and dividend's
+ * top limbs to be not larger than the divisor's. The latter limitation
+ * can be problematic in the first iteration of multi-precision division,
+ * where in most general case the condition would have to be "smaller."
+ * The subroutine considers four limbs, two of which are "overlapping,"
+ * hence the name... Another way to look at it is to think of the pair
+ * of the dividend's limbs being suffixed with a zero:
+ *   +-------+-------+-------+
+ * R |       |       |   0   |
+ *   +-------+-------+-------+
+ *           +-------+-------+
+ * D         |       |       |
+ *           +-------+-------+
+ */
+limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
+{
+    llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
+    llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
+    limb_t Q = 0, mask;
+    size_t i;
+
+    for (i = 0; i < LIMB_BITS; i++) {
+        Q <<= 1;
+        mask = (R >= D);
+        Q |= mask;
+        R -= (D & ((llimb_t)0 - mask));
+        D >>= 1;
+    }
+
+    mask = 0 - (Q >> (LIMB_BITS - 1));   /* does it overflow? */
+
+    Q <<= 1;
+    Q |= (R >= D);
+
+    return (Q | mask);
+}
+___
+
+$code.=<<___;
+.text
+
+.globl	div_3_limbs
+.hidden	div_3_limbs
+.type	div_3_limbs,\@function,3
+.align	32
+div_3_limbs:
+	mov	(%rdi),%r8		# load R.lo
+	mov	8(%rdi),%r9		# load R.hi
+	xor	%rax,%rax		# Q = 0
+	mov	\$64,%ecx		# loop counter
+
+.Loop:
+	 mov	%r8,%r10		# put aside R
+	sub	%rsi,%r8		# R -= D
+	 mov	%r9,%r11
+	sbb	%rdx,%r9
+	lea	1(%rax,%rax),%rax	# Q <<= 1 + speculative bit
+	 mov	%rdx,%rdi
+	cmovc	%r10,%r8		# restore R if R - D borrowed
+	cmovc	%r11,%r9
+	sbb	\$0,%rax		# subtract speculative bit
+	 shl	\$63,%rdi
+	 shr	\$1,%rsi
+	 shr	\$1,%rdx
+	 or	%rdi,%rsi		# D >>= 1
+	sub	\$1,%ecx
+	jnz	.Loop
+
+	lea	1(%rax,%rax),%rcx	# Q <<= 1 + speculative bit
+	sar	\$63,%rax		# top bit -> mask
+
+	sub	%rsi,%r8		# R -= D
+	sbb	%rdx,%r9
+	sbb	\$0,%rcx		# subtract speculative bit
+
+	or	%rcx,%rax		# all ones if overflow
+
+	ret
+.size	div_3_limbs,.-div_3_limbs
+___
+########################################################################
+# Calculate the remainder and adjust the quotient, which can be off by one.
+# Then save the quotient in the limb next to the top limb of the remainder.
+# There is room, because the remainder/next-iteration dividend gets shorter
+# by one limb.
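+#
+# A rough C sketch (illustration only, not used by the build) of quot_rem_128:
+# multiply the 2-limb divisor by the candidate quotient limb from div_3_limbs,
+# subtract from the 3-limb dividend chunk, and if that borrows, take one off
+# the quotient and add the divisor back once.  limb_t/llimb_t/LIMB_BITS are
+# assumed to match the library's C code; the _ref suffix marks the function
+# as hypothetical.
+$c_ref.=<<'___';
+limb_t quot_rem_128_ref(limb_t *div_rem, const limb_t *divisor, limb_t quot)
+{
+    llimb_t t = (llimb_t)divisor[0] * quot;
+    limb_t lo[3], d, borrow, mask;
+    size_t i;
+
+    lo[0] = (limb_t)t;
+    t = (t >> LIMB_BITS) + (llimb_t)divisor[1] * quot;
+    lo[1] = (limb_t)t;
+    lo[2] = (limb_t)(t >> LIMB_BITS);       /* divisor * quotient, 3 limbs */
+
+    for (borrow = 0, i = 0; i < 3; i++) {   /* dividend - divisor*quotient */
+        d = div_rem[i];
+        div_rem[i] = d - lo[i] - borrow;
+        borrow = (d < lo[i]) | ((d == lo[i]) & borrow);
+    }
+
+    mask = (limb_t)0 - borrow;              /* borrowed -> quotient was    */
+    quot += mask;                           /* one too large, take one off */
+
+    t = (llimb_t)div_rem[0] + (divisor[0] & mask);  /* ... and add the     */
+    div_rem[0] = (limb_t)t;                         /* divisor back once   */
+    div_rem[1] += (limb_t)(t >> LIMB_BITS) + (divisor[1] & mask);
+
+    div_rem[2] = quot;      /* quotient goes into the limb freed up by the */
+                            /* remainder getting one limb shorter          */
+    return quot;
+}
+___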
+{
+my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
+my @acc = ("%r8", "%r9", "%rdx");
+my @tmp = ("%r10", "%r11", "%rax");
+
+$code.=<<___;
+.globl	quot_rem_128
+.hidden	quot_rem_128
+.type	quot_rem_128,\@function,3
+.align	32
+quot_rem_128:
+	mov	%rdx, %rax
+	mov	%rdx, $quotient
+
+	mulq	0($divisor)		# divisor[0:1] * quotient
+	mov	%rax, @acc[0]
+	mov	$quotient, %rax
+	mov	%rdx, @acc[1]
+
+	mulq	8($divisor)
+	add	%rax, @acc[1]
+	adc	\$0, %rdx		# %rdx is @acc[2]
+
+	mov	0($div_rem), @tmp[0]	# load 3 limbs of the dividend
+	mov	8($div_rem), @tmp[1]
+	mov	16($div_rem), @tmp[2]
+
+	sub	@acc[0], @tmp[0]	# dividend - divisor * quotient
+	sbb	@acc[1], @tmp[1]
+	sbb	@acc[2], @tmp[2]
+	sbb	@acc[0], @acc[0]	# borrow -> mask
+
+	add	@acc[0], $quotient	# if borrowed, adjust the quotient ...
+	mov	@acc[0], @acc[1]
+	and	0($divisor), @acc[0]
+	and	8($divisor), @acc[1]
+	add	@acc[0], @tmp[0]	# ... and add divisor
+	adc	@acc[1], @tmp[1]
+
+	mov	@tmp[0], 0($div_rem)	# save 2 limbs of the remainder ...
+	mov	@tmp[1], 8($div_rem)
+	mov	$quotient, 16($div_rem)	# ... and 1 limb of the quotient
+
+	mov	$quotient, %rax		# return adjusted quotient
+
+	ret
+.size	quot_rem_128,.-quot_rem_128
+
+########################################################################
+# Unlike the 128-bit case above, the quotient is exact. As a result, just
+# one limb of the dividend is sufficient to calculate the remainder...
+
+.globl	quot_rem_64
+.hidden	quot_rem_64
+.type	quot_rem_64,\@function,3
+.align	32
+quot_rem_64:
+	mov	%rdx, %rax		# return quotient
+	imulq	0($divisor), %rdx	# divisor[0] * quotient
+
+	mov	0($div_rem), @tmp[0]	# load 1 limb of the dividend
+
+	sub	%rdx, @tmp[0]		# dividend - divisor * quotient
+
+	mov	@tmp[0], 0($div_rem)	# save 1 limb of the remainder ...
+	mov	%rax, 8($div_rem)	# ... and 1 limb of the quotient
+
+	ret
+.size	quot_rem_64,.-quot_rem_64
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/mul_mont_256-armv8.pl b/blst/asm/mul_mont_256-armv8.pl
new file mode 100755
index 0000000..ba6c2b8
--- /dev/null
+++ b/blst/asm/mul_mont_256-armv8.pl
@@ -0,0 +1,409 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# As for "sparse" in subroutine names, see commentary in the
+# asm/mulx_mont_256-x86_64.pl module.
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);
+
+@mod=map("x$_",(5..8));
+$bi="x9";
+@a=map("x$_",(10..13));
+@tmp=map("x$_",(14..17));
+@acc=map("x$_",(19..24));
+$m0=$n_ptr;
+
+$code.=<<___;
+.text
+
+.globl	mul_mont_sparse_256
+.hidden	mul_mont_sparse_256
+.type	mul_mont_sparse_256,%function
+.align	5
+mul_mont_sparse_256:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldr	$bi,        [$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+
+	mul	@acc[0],@a[0],$bi
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	mul	@acc[1],@a[1],$bi
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	mul	@acc[2],@a[2],$bi
+	mul	@acc[3],@a[3],$bi
+
+	 umulh	@tmp[0],@a[0],$bi
+	 umulh	@tmp[1],@a[1],$bi
+	mul	$m0,$n0,@acc[0]
+	 umulh	@tmp[2],@a[2],$bi
+	 umulh	@tmp[3],@a[3],$bi
+	 adds	@acc[1],@acc[1],@tmp[0]
+	//mul	@tmp[0],@mod[0],$m0
+	 adcs	@acc[2],@acc[2],@tmp[1]
+	mul	@tmp[1],@mod[1],$m0
+	 adcs	@acc[3],@acc[3],@tmp[2]
+	mul	@tmp[2],@mod[2],$m0
+	 adc	@acc[4],xzr,    @tmp[3]
+	mul	@tmp[3],@mod[3],$m0
+___
+for ($i=1;$i<4;$i++) {
+$code.=<<___;
+	ldr	$bi,[$b_ptr,8*$i]
+	subs	xzr,@acc[0],#1		//adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$m0
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$m0
+	adcs	@acc[2],@acc[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$m0
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$m0
+	adc	@acc[4],@acc[4],xzr
+
+	 adds	@acc[0],@acc[1],@tmp[0]
+	mul	@tmp[0],@a[0],$bi
+	 adcs	@acc[1],@acc[2],@tmp[1]
+	mul	@tmp[1],@a[1],$bi
+	 adcs	@acc[2],@acc[3],@tmp[2]
+	mul	@tmp[2],@a[2],$bi
+	 adcs	@acc[3],@acc[4],@tmp[3]
+	mul	@tmp[3],@a[3],$bi
+	 adc	@acc[4],xzr,xzr
+
+	adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@a[0],$bi
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@a[1],$bi
+	adcs	@acc[2],@acc[2],@tmp[2]
+	mul	$m0,$n0,@acc[0]
+	 umulh	@tmp[2],@a[2],$bi
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@a[3],$bi
+	adc	@acc[4],@acc[4],xzr
+
+	 adds	@acc[1],@acc[1],@tmp[0]
+	//mul	@tmp[0],@mod[0],$m0
+	 adcs	@acc[2],@acc[2],@tmp[1]
+	mul	@tmp[1],@mod[1],$m0
+	 adcs	@acc[3],@acc[3],@tmp[2]
+	mul	@tmp[2],@mod[2],$m0
+	 adc	@acc[4],@acc[4],@tmp[3]
+	mul	@tmp[3],@mod[3],$m0
+___
+}
+$code.=<<___;
+	subs	xzr,@acc[0],#1		//adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$m0
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$m0
+	adcs	@acc[2],@acc[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$m0
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$m0
+	adc	@acc[4],@acc[4],xzr
+
+	 adds	@acc[0],@acc[1],@tmp[0]
+	 adcs	@acc[1],@acc[2],@tmp[1]
+	 adcs	@acc[2],@acc[3],@tmp[2]
+	 adcs	@acc[3],@acc[4],@tmp[3]
+	 adc	@acc[4],xzr,xzr
+
+	subs	@tmp[0],@acc[0],@mod[0]
+	sbcs	@tmp[1],@acc[1],@mod[1]
+	sbcs	@tmp[2],@acc[2],@mod[2]
+	sbcs	@tmp[3],@acc[3],@mod[3]
+	sbcs	xzr,    @acc[4],xzr
+
+	csel	@acc[0],@acc[0],@tmp[0],lo
+	csel	@acc[1],@acc[1],@tmp[1],lo
+	csel	@acc[2],@acc[2],@tmp[2],lo
+	csel	@acc[3],@acc[3],@tmp[3],lo
+
+	stp	@acc[0],@acc[1],[$r_ptr]
+	stp	@acc[2],@acc[3],[$r_ptr,#16]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	mul_mont_sparse_256,.-mul_mont_sparse_256
+___
+{
+my @acc = (@a,@acc[0..3]);
+my @a = @mod;
+
+$code.=<<___;
+.globl	sqr_mont_sparse_256
+.hidden	sqr_mont_sparse_256
+.type	sqr_mont_sparse_256,%function
+.align	5
+sqr_mont_sparse_256:
+	paciasp
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	mov	$n0,$n_ptr
+
+	////////////////////////////////////////////////////////////////
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x]
+	//
+	//  "can't overflow" below mark carrying into high part of
+	//  multiplication result, which can't overflow, because it
+	//  can never be all ones.
+
+	mul	@acc[1],@a[1],@a[0]	// a[1]*a[0]
+	umulh	@tmp[1],@a[1],@a[0]
+	mul	@acc[2],@a[2],@a[0]	// a[2]*a[0]
+	umulh	@tmp[2],@a[2],@a[0]
+	mul	@acc[3],@a[3],@a[0]	// a[3]*a[0]
+	umulh	@acc[4],@a[3],@a[0]
+
+	adds	@acc[2],@acc[2],@tmp[1]	// accumulate high parts of multiplication
+	 mul	@tmp[0],@a[2],@a[1]	// a[2]*a[1]
+	 umulh	@tmp[1],@a[2],@a[1]
+	adcs	@acc[3],@acc[3],@tmp[2]
+	 mul	@tmp[2],@a[3],@a[1]	// a[3]*a[1]
+	 umulh	@tmp[3],@a[3],@a[1]
+	adc	@acc[4],@acc[4],xzr	// can't overflow
+
+	mul	@acc[5],@a[3],@a[2]	// a[3]*a[2]
+	umulh	@acc[6],@a[3],@a[2]
+
+	adds	@tmp[1],@tmp[1],@tmp[2]	// accumulate high parts of multiplication
+	 mul	@acc[0],@a[0],@a[0]	// a[0]*a[0]
+	adc	@tmp[2],@tmp[3],xzr	// can't overflow
+
+	adds	@acc[3],@acc[3],@tmp[0]	// accumulate low parts of multiplication
+	 umulh	@a[0],@a[0],@a[0]
+	adcs	@acc[4],@acc[4],@tmp[1]
+	 mul	@tmp[1],@a[1],@a[1]	// a[1]*a[1]
+	adcs	@acc[5],@acc[5],@tmp[2]
+	 umulh	@a[1],@a[1],@a[1]
+	adc	@acc[6],@acc[6],xzr	// can't overflow
+
+	adds	@acc[1],@acc[1],@acc[1]	// acc[1-6]*=2
+	 mul	@tmp[2],@a[2],@a[2]	// a[2]*a[2]
+	adcs	@acc[2],@acc[2],@acc[2]
+	 umulh	@a[2],@a[2],@a[2]
+	adcs	@acc[3],@acc[3],@acc[3]
+	 mul	@tmp[3],@a[3],@a[3]	// a[3]*a[3]
+	adcs	@acc[4],@acc[4],@acc[4]
+	 umulh	@a[3],@a[3],@a[3]
+	adcs	@acc[5],@acc[5],@acc[5]
+	adcs	@acc[6],@acc[6],@acc[6]
+	adc	@acc[7],xzr,xzr
+
+	adds	@acc[1],@acc[1],@a[0]	// +a[i]*a[i]
+	adcs	@acc[2],@acc[2],@tmp[1]
+	adcs	@acc[3],@acc[3],@a[1]
+	adcs	@acc[4],@acc[4],@tmp[2]
+	adcs	@acc[5],@acc[5],@a[2]
+	adcs	@acc[6],@acc[6],@tmp[3]
+	adc	@acc[7],@acc[7],@a[3]
+
+	bl	__mul_by_1_mont_256
+	ldr	x30,[x29,#8]
+
+	adds	@acc[0],@acc[0],@acc[4]	// accumulate upper half
+	adcs	@acc[1],@acc[1],@acc[5]
+	adcs	@acc[2],@acc[2],@acc[6]
+	adcs	@acc[3],@acc[3],@acc[7]
+	adc	@acc[4],xzr,xzr
+
+	subs	@tmp[0],@acc[0],@mod[0]
+	sbcs	@tmp[1],@acc[1],@mod[1]
+	sbcs	@tmp[2],@acc[2],@mod[2]
+	sbcs	@tmp[3],@acc[3],@mod[3]
+	sbcs	xzr,    @acc[4],xzr
+
+	csel	@acc[0],@acc[0],@tmp[0],lo
+	csel	@acc[1],@acc[1],@tmp[1],lo
+	csel	@acc[2],@acc[2],@tmp[2],lo
+	csel	@acc[3],@acc[3],@tmp[3],lo
+
+	stp	@acc[0],@acc[1],[$r_ptr]
+	stp	@acc[2],@acc[3],[$r_ptr,#16]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	autiasp
+	ret
+.size	sqr_mont_sparse_256,.-sqr_mont_sparse_256
+___
+}
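+# The comment diagram in sqr_mont_sparse_256 above describes the schoolbook
+# squaring layout: off-diagonal products a[i]*a[j] are computed once, the
+# whole partial sum is doubled, and the diagonal squares a[i]*a[i] are added
+# in.  Below is a rough C sketch of that layout (illustration only, not used
+# by the build); limb_t/llimb_t/LIMB_BITS are assumed to match the library's
+# C code and sqr_256_ref is a hypothetical name.
+$c_ref=<<'___';
+static void sqr_256_ref(limb_t ret[8], const limb_t a[4])
+{
+    llimb_t t;
+    limb_t hi, w;
+    size_t i, j;
+
+    for (i = 0; i < 8; i++)
+        ret[i] = 0;
+
+    for (i = 0; i < 4; i++) {           /* off-diagonal products, once each */
+        for (hi = 0, j = i + 1; j < 4; j++) {
+            t = (llimb_t)a[i] * a[j] + ret[i+j] + hi;
+            ret[i+j] = (limb_t)t;
+            hi = (limb_t)(t >> LIMB_BITS);
+        }
+        ret[i+4] = hi;
+    }
+
+    for (hi = 0, i = 0; i < 8; i++) {   /* double the partial sum */
+        w = ret[i];
+        ret[i] = (w << 1) | hi;
+        hi = w >> (LIMB_BITS-1);
+    }
+
+    for (hi = 0, i = 0; i < 4; i++) {   /* add the diagonal a[i]*a[i] */
+        t = (llimb_t)a[i] * a[i] + ret[2*i] + hi;
+        ret[2*i] = (limb_t)t;
+        t = (t >> LIMB_BITS) + ret[2*i+1];
+        ret[2*i+1] = (limb_t)t;
+        hi = (limb_t)(t >> LIMB_BITS);
+    }
+}
+___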
+{
+my @a = (@a, $bi);
+
+$code.=<<___;
+.globl	from_mont_256
+.hidden	from_mont_256
+.type	from_mont_256,%function
+.align	5
+from_mont_256:
+	paciasp
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	$n0,$n_ptr
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+
+	bl	__mul_by_1_mont_256
+	ldr	x30,[x29,#8]
+
+	subs	@tmp[0],@a[0],@mod[0]
+	sbcs	@tmp[1],@a[1],@mod[1]
+	sbcs	@tmp[2],@a[2],@mod[2]
+	sbcs	@tmp[3],@a[3],@mod[3]
+
+	csel	@a[0],@a[0],@tmp[0],lo
+	csel	@a[1],@a[1],@tmp[1],lo
+	csel	@a[2],@a[2],@tmp[2],lo
+	csel	@a[3],@a[3],@tmp[3],lo
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ldr	x29,[sp],#16
+	autiasp
+	ret
+.size	from_mont_256,.-from_mont_256
+
+.globl	redc_mont_256
+.hidden	redc_mont_256
+.type	redc_mont_256,%function
+.align	5
+redc_mont_256:
+	paciasp
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	$n0,$n_ptr
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+
+	bl	__mul_by_1_mont_256
+	ldr	x30,[x29,#8]
+
+	ldp	@tmp[0],@tmp[1],[$a_ptr,#32]
+	ldp	@tmp[2],@tmp[3],[$a_ptr,#48]
+
+	adds	@a[0],@a[0],@tmp[0]
+	adcs	@a[1],@a[1],@tmp[1]
+	adcs	@a[2],@a[2],@tmp[2]
+	adcs	@a[3],@a[3],@tmp[3]
+	adc	@a[4],xzr,xzr
+
+	subs	@tmp[0],@a[0],@mod[0]
+	sbcs	@tmp[1],@a[1],@mod[1]
+	sbcs	@tmp[2],@a[2],@mod[2]
+	sbcs	@tmp[3],@a[3],@mod[3]
+	sbcs	xzr,    @a[4],xzr
+
+	csel	@a[0],@a[0],@tmp[0],lo
+	csel	@a[1],@a[1],@tmp[1],lo
+	csel	@a[2],@a[2],@tmp[2],lo
+	csel	@a[3],@a[3],@tmp[3],lo
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+
+	ldr	x29,[sp],#16
+	autiasp
+	ret
+.size	redc_mont_256,.-redc_mont_256
+
+.type	__mul_by_1_mont_256,%function
+.align	5
+__mul_by_1_mont_256:
+	mul	$m0,$n0,@a[0]
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+___
+for ($i=1;$i<4;$i++) {
+$code.=<<___;
+	//mul	@tmp[0],@mod[0],$m0
+	mul	@tmp[1],@mod[1],$m0
+	mul	@tmp[2],@mod[2],$m0
+	mul	@tmp[3],@mod[3],$m0
+	subs	xzr,@a[0],#1		//adds	@a[0],@a[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$m0
+	adcs	@a[1],@a[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$m0
+	adcs	@a[2],@a[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$m0
+	adcs	@a[3],@a[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$m0
+	adc	@a[4],xzr,xzr
+
+	 adds	@a[0],@a[1],@tmp[0]
+	 adcs	@a[1],@a[2],@tmp[1]
+	 adcs	@a[2],@a[3],@tmp[2]
+	mul	$m0,$n0,@a[0]
+	 adc	@a[3],@a[4],@tmp[3]
+___
+}
+$code.=<<___;
+	//mul	@tmp[0],@mod[0],$m0
+	mul	@tmp[1],@mod[1],$m0
+	mul	@tmp[2],@mod[2],$m0
+	mul	@tmp[3],@mod[3],$m0
+	subs	xzr,@a[0],#1		//adds	@a[0],@a[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$m0
+	adcs	@a[1],@a[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$m0
+	adcs	@a[2],@a[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$m0
+	adcs	@a[3],@a[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$m0
+	adc	@a[4],xzr,xzr
+
+	 adds	@a[0],@a[1],@tmp[0]
+	 adcs	@a[1],@a[2],@tmp[1]
+	 adcs	@a[2],@a[3],@tmp[2]
+	 adc	@a[3],@a[4],@tmp[3]
+
+	ret
+.size	__mul_by_1_mont_256,.-__mul_by_1_mont_256
+___
+}
+
+print $code;
+
+close STDOUT;
diff --git a/blst/asm/mul_mont_384-armv8.pl b/blst/asm/mul_mont_384-armv8.pl
new file mode 100755
index 0000000..44e12a0
--- /dev/null
+++ b/blst/asm/mul_mont_384-armv8.pl
@@ -0,0 +1,2015 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);
+
+@mod = map("x$_",(5..10));
+@a   = map("x$_",(11..16));
+$bi  = "x17";
+@acc = map("x$_",(19..25));
+@tmp = map("x$_",(26..28,0,1,3));
+
+$code.=<<___;
+.text
+
+.globl	add_mod_384x384
+.type	add_mod_384x384,%function
+.align	5
+add_mod_384x384:
+	paciasp
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__add_mod_384x384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	autiasp
+	ret
+.size	add_mod_384x384,.-add_mod_384x384
+
+.type	__add_mod_384x384,%function
+.align	5
+__add_mod_384x384:
+	ldp	@a[0],  @a[1],  [$a_ptr]
+	ldp	@acc[0],@acc[1],[$b_ptr]
+	ldp	@a[2],  @a[3],  [$a_ptr,#16]
+	adds	@a[0],@a[0],@acc[0]
+	ldp	@acc[2],@acc[3],[$b_ptr,#16]
+	adcs	@a[1],@a[1],@acc[1]
+	ldp	@a[4],  @a[5],  [$a_ptr,#32]
+	adcs	@a[2],@a[2],@acc[2]
+	ldp	@acc[4],@acc[5],[$b_ptr,#32]
+	adcs	@a[3],@a[3],@acc[3]
+	 stp	@a[0],  @a[1],  [$r_ptr]
+	adcs	@a[4],@a[4],@acc[4]
+	 ldp	@a[0],  @a[1],  [$a_ptr,#48]
+	adcs	@a[5],@a[5],@acc[5]
+
+	 ldp	@acc[0],@acc[1],[$b_ptr,#48]
+	 stp	@a[2],  @a[3],  [$r_ptr,#16]
+	 ldp	@a[2],  @a[3],  [$a_ptr,#64]
+	 ldp	@acc[2],@acc[3],[$b_ptr,#64]
+
+	adcs	@a[0],@a[0],@acc[0]
+	 stp	@a[4],  @a[5],  [$r_ptr,#32]
+	adcs	@a[1],@a[1],@acc[1]
+	 ldp	@a[4],  @a[5],  [$a_ptr,#80]
+	adcs	@a[2],@a[2],@acc[2]
+	 ldp	@acc[4],@acc[5],[$b_ptr,#80]
+	adcs	@a[3],@a[3],@acc[3]
+	adcs	@a[4],@a[4],@acc[4]
+	adcs	@a[5],@a[5],@acc[5]
+	adc	$bi,xzr,xzr
+
+	subs	@acc[0],@a[0],@mod[0]
+	sbcs	@acc[1],@a[1],@mod[1]
+	sbcs	@acc[2],@a[2],@mod[2]
+	sbcs	@acc[3],@a[3],@mod[3]
+	sbcs	@acc[4],@a[4],@mod[4]
+	sbcs	@acc[5],@a[5],@mod[5]
+	sbcs	xzr,$bi,xzr
+
+	csel	@a[0],@a[0],@acc[0],lo
+	csel	@a[1],@a[1],@acc[1],lo
+	csel	@a[2],@a[2],@acc[2],lo
+	csel	@a[3],@a[3],@acc[3],lo
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	csel	@a[4],@a[4],@acc[4],lo
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	csel	@a[5],@a[5],@acc[5],lo
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ret
+.size	__add_mod_384x384,.-__add_mod_384x384
+
+.globl	sub_mod_384x384
+.type	sub_mod_384x384,%function
+.align	5
+sub_mod_384x384:
+	paciasp
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__sub_mod_384x384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	autiasp
+	ret
+.size	sub_mod_384x384,.-sub_mod_384x384
+
+.type	__sub_mod_384x384,%function
+.align	5
+__sub_mod_384x384:
+	ldp	@a[0],  @a[1],  [$a_ptr]
+	ldp	@acc[0],@acc[1],[$b_ptr]
+	ldp	@a[2],  @a[3],  [$a_ptr,#16]
+	subs	@a[0],@a[0],@acc[0]
+	ldp	@acc[2],@acc[3],[$b_ptr,#16]
+	sbcs	@a[1],@a[1],@acc[1]
+	ldp	@a[4],  @a[5],  [$a_ptr,#32]
+	sbcs	@a[2],@a[2],@acc[2]
+	ldp	@acc[4],@acc[5],[$b_ptr,#32]
+	sbcs	@a[3],@a[3],@acc[3]
+	 stp	@a[0],  @a[1],  [$r_ptr]
+	sbcs	@a[4],@a[4],@acc[4]
+	 ldp	@a[0],  @a[1],  [$a_ptr,#48]
+	sbcs	@a[5],@a[5],@acc[5]
+
+	 ldp	@acc[0],@acc[1],[$b_ptr,#48]
+	 stp	@a[2],  @a[3],  [$r_ptr,#16]
+	 ldp	@a[2],  @a[3],  [$a_ptr,#64]
+	 ldp	@acc[2],@acc[3],[$b_ptr,#64]
+
+	sbcs	@a[0],@a[0],@acc[0]
+	 stp	@a[4],  @a[5],  [$r_ptr,#32]
+	sbcs	@a[1],@a[1],@acc[1]
+	 ldp	@a[4],  @a[5],  [$a_ptr,#80]
+	sbcs	@a[2],@a[2],@acc[2]
+	 ldp	@acc[4],@acc[5],[$b_ptr,#80]
+	sbcs	@a[3],@a[3],@acc[3]
+	sbcs	@a[4],@a[4],@acc[4]
+	sbcs	@a[5],@a[5],@acc[5]
+	sbc	$bi,xzr,xzr
+
+	 and	@acc[0],@mod[0],$bi
+	 and	@acc[1],@mod[1],$bi
+	adds	@a[0],@a[0],@acc[0]
+	 and	@acc[2],@mod[2],$bi
+	adcs	@a[1],@a[1],@acc[1]
+	 and	@acc[3],@mod[3],$bi
+	adcs	@a[2],@a[2],@acc[2]
+	 and	@acc[4],@mod[4],$bi
+	adcs	@a[3],@a[3],@acc[3]
+	 and	@acc[5],@mod[5],$bi
+	adcs	@a[4],@a[4],@acc[4]
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	adc	@a[5],@a[5],@acc[5]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	ret
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.type	__add_mod_384,%function
+.align	5
+__add_mod_384:
+	ldp	@a[0],  @a[1],  [$a_ptr]
+	ldp	@acc[0],@acc[1],[$b_ptr]
+	ldp	@a[2],  @a[3],  [$a_ptr,#16]
+	adds	@a[0],@a[0],@acc[0]
+	ldp	@acc[2],@acc[3],[$b_ptr,#16]
+	adcs	@a[1],@a[1],@acc[1]
+	ldp	@a[4],  @a[5],  [$a_ptr,#32]
+	adcs	@a[2],@a[2],@acc[2]
+	ldp	@acc[4],@acc[5],[$b_ptr,#32]
+	adcs	@a[3],@a[3],@acc[3]
+	adcs	@a[4],@a[4],@acc[4]
+	adcs	@a[5],@a[5],@acc[5]
+	adc	$bi,xzr,xzr
+
+	subs	@acc[0],@a[0],@mod[0]
+	sbcs	@acc[1],@a[1],@mod[1]
+	sbcs	@acc[2],@a[2],@mod[2]
+	sbcs	@acc[3],@a[3],@mod[3]
+	sbcs	@acc[4],@a[4],@mod[4]
+	sbcs	@acc[5],@a[5],@mod[5]
+	sbcs	xzr,$bi,xzr
+
+	csel	@a[0],@a[0],@acc[0],lo
+	csel	@a[1],@a[1],@acc[1],lo
+	csel	@a[2],@a[2],@acc[2],lo
+	csel	@a[3],@a[3],@acc[3],lo
+	csel	@a[4],@a[4],@acc[4],lo
+	stp	@a[0],@a[1],[$r_ptr]
+	csel	@a[5],@a[5],@acc[5],lo
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.type	__sub_mod_384,%function
+.align	5
+__sub_mod_384:
+	ldp	@a[0],  @a[1],  [$a_ptr]
+	ldp	@acc[0],@acc[1],[$b_ptr]
+	ldp	@a[2],  @a[3],  [$a_ptr,#16]
+	subs	@a[0],@a[0],@acc[0]
+	ldp	@acc[2],@acc[3],[$b_ptr,#16]
+	sbcs	@a[1],@a[1],@acc[1]
+	ldp	@a[4],  @a[5],  [$a_ptr,#32]
+	sbcs	@a[2],@a[2],@acc[2]
+	ldp	@acc[4],@acc[5],[$b_ptr,#32]
+	sbcs	@a[3],@a[3],@acc[3]
+	sbcs	@a[4],@a[4],@acc[4]
+	sbcs	@a[5],@a[5],@acc[5]
+	sbc	$bi,xzr,xzr
+
+	 and	@acc[0],@mod[0],$bi
+	 and	@acc[1],@mod[1],$bi
+	adds	@a[0],@a[0],@acc[0]
+	 and	@acc[2],@mod[2],$bi
+	adcs	@a[1],@a[1],@acc[1]
+	 and	@acc[3],@mod[3],$bi
+	adcs	@a[2],@a[2],@acc[2]
+	 and	@acc[4],@mod[4],$bi
+	adcs	@a[3],@a[3],@acc[3]
+	 and	@acc[5],@mod[5],$bi
+	adcs	@a[4],@a[4],@acc[4]
+	stp	@a[0],@a[1],[$r_ptr]
+	adc	@a[5],@a[5],@acc[5]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+
+.globl	mul_mont_384x
+.hidden	mul_mont_384x
+.type	mul_mont_384x,%function
+.align	5
+mul_mont_384x:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#288		// space for 3 768-bit vectors
+
+	mov	@tmp[0],$r_ptr		// save r_ptr
+	mov	@tmp[1],$a_ptr		// save a_ptr
+	mov	@tmp[2],$b_ptr		// save b_ptr
+
+	sub	$r_ptr,sp,#0		// mul_384(t0, a->re, b->re)
+	bl	__mul_384
+
+	add	$a_ptr,$a_ptr,#48	// mul_384(t1, a->im, b->im)
+	add	$b_ptr,$b_ptr,#48
+	add	$r_ptr,sp,#96
+	bl	__mul_384
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	sub	$b_ptr,$a_ptr,#48
+	add	$r_ptr,sp,#240
+	bl	__add_mod_384
+
+	add	$a_ptr,@tmp[2],#0
+	add	$b_ptr,@tmp[2],#48
+	add	$r_ptr,sp,#192		// t2
+	bl	__add_mod_384
+
+	add	$a_ptr,$r_ptr,#0
+	add	$b_ptr,$r_ptr,#48
+	bl	__mul_384		// mul_384(t2, a->re+a->im, b->re+b->im)
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	mov	$a_ptr,$r_ptr
+	add	$b_ptr,sp,#0
+	bl	__sub_mod_384x384
+
+	add	$b_ptr,sp,#96
+	bl	__sub_mod_384x384	// t2 = t2-t0-t1
+
+	add	$a_ptr,sp,#0
+	add	$b_ptr,sp,#96
+	add	$r_ptr,sp,#0
+	bl	__sub_mod_384x384	// t0 = t0-t1
+
+	add	$a_ptr,sp,#0		// ret->re = redc(t0)
+	add	$r_ptr,@tmp[0],#0
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+
+	add	$a_ptr,sp,#192		// ret->im = redc(t2)
+	add	$r_ptr,$r_ptr,#48
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+	ldr	x30,[x29,#8]
+
+	add	sp,sp,#288
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	mul_mont_384x,.-mul_mont_384x
+
+.globl	sqr_mont_384x
+.hidden	sqr_mont_384x
+.type	sqr_mont_384x,%function
+.align	5
+sqr_mont_384x:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	$n_ptr,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
+	sub	sp,sp,#96		// space for 2 384-bit vectors
+	mov	$n0,$n_ptr		// adjust for missing b_ptr
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	add	$b_ptr,$a_ptr,#48
+	add	$r_ptr,sp,#0
+	bl	__add_mod_384		// t0 = a->re + a->im
+
+	add	$r_ptr,sp,#48
+	bl	__sub_mod_384		// t1 = a->re - a->im
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldr	$bi,        [$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	bl	__mul_mont_384		// mul_mont_384(ret->im, a->re, a->im)
+
+	adds	@a[0],@a[0],@a[0]	// add with itself
+	adcs	@a[1],@a[1],@a[1]
+	adcs	@a[2],@a[2],@a[2]
+	adcs	@a[3],@a[3],@a[3]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	@acc[6],xzr,xzr
+
+	subs	@acc[0],@a[0],@mod[0]
+	sbcs	@acc[1],@a[1],@mod[1]
+	sbcs	@acc[2],@a[2],@mod[2]
+	sbcs	@acc[3],@a[3],@mod[3]
+	sbcs	@acc[4],@a[4],@mod[4]
+	sbcs	@acc[5],@a[5],@mod[5]
+	sbcs	xzr,@acc[6],xzr
+
+	csel	@acc[0],@a[0],@acc[0],lo
+	csel	@acc[1],@a[1],@acc[1],lo
+	csel	@acc[2],@a[2],@acc[2],lo
+	 ldp	@a[0],@a[1],[sp]
+	csel	@acc[3],@a[3],@acc[3],lo
+	 ldr	$bi,        [sp,#48]
+	csel	@acc[4],@a[4],@acc[4],lo
+	 ldp	@a[2],@a[3],[sp,#16]
+	csel	@acc[5],@a[5],@acc[5],lo
+	 ldp	@a[4],@a[5],[sp,#32]
+
+	stp	@acc[0],@acc[1],[$b_ptr,#48]
+	stp	@acc[2],@acc[3],[$b_ptr,#64]
+	stp	@acc[4],@acc[5],[$b_ptr,#80]
+
+	add	$b_ptr,sp,#48
+	bl	__mul_mont_384		// mul_mont_384(ret->re, t0, t1)
+	ldr	x30,[x29,#8]
+
+	stp	@a[0],@a[1],[$b_ptr]
+	stp	@a[2],@a[3],[$b_ptr,#16]
+	stp	@a[4],@a[5],[$b_ptr,#32]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sqr_mont_384x,.-sqr_mont_384x
+
+.globl	mul_mont_384
+.hidden	mul_mont_384
+.type	mul_mont_384,%function
+.align	5
+mul_mont_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	$n0,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldr	$bi,        [$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	bl	__mul_mont_384
+	ldr	x30,[x29,#8]
+
+	stp	@a[0],@a[1],[$b_ptr]
+	stp	@a[2],@a[3],[$b_ptr,#16]
+	stp	@a[4],@a[5],[$b_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	mul_mont_384,.-mul_mont_384
+
+.type	__mul_mont_384,%function
+.align	5
+__mul_mont_384:
+	mul	@acc[0],@a[0],$bi
+	mul	@acc[1],@a[1],$bi
+	mul	@acc[2],@a[2],$bi
+	mul	@acc[3],@a[3],$bi
+	mul	@acc[4],@a[4],$bi
+	mul	@acc[5],@a[5],$bi
+	mul	$n0,$n0,@acc[0]
+
+	 umulh	@tmp[0],@a[0],$bi
+	 umulh	@tmp[1],@a[1],$bi
+	 umulh	@tmp[2],@a[2],$bi
+	 umulh	@tmp[3],@a[3],$bi
+	 umulh	@tmp[4],@a[4],$bi
+	 umulh	@tmp[5],@a[5],$bi
+
+	 adds	@acc[1],@acc[1],@tmp[0]
+	// mul	@tmp[0],@mod[0],$n0
+	 adcs	@acc[2],@acc[2],@tmp[1]
+	mul	@tmp[1],@mod[1],$n0
+	 adcs	@acc[3],@acc[3],@tmp[2]
+	mul	@tmp[2],@mod[2],$n0
+	 adcs	@acc[4],@acc[4],@tmp[3]
+	mul	@tmp[3],@mod[3],$n0
+	 adcs	@acc[5],@acc[5],@tmp[4]
+	mul	@tmp[4],@mod[4],$n0
+	 adc	@acc[6],xzr,    @tmp[5]
+	mul	@tmp[5],@mod[5],$n0
+	 mov	$bi,xzr
+___
+for ($i=1;$i<6;$i++) {
+$code.=<<___;
+	subs	xzr,@acc[0],#1		// adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$n0
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$n0
+	adcs	@acc[2],@acc[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$n0
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$n0
+	adcs	@acc[4],@acc[4],@tmp[4]
+	 umulh	@tmp[4],@mod[4],$n0
+	adcs	@acc[5],@acc[5],@tmp[5]
+	 umulh	@tmp[5],@mod[5],$n0
+	adcs	@acc[6],@acc[6],xzr
+	adc	$n0,$bi,xzr
+	ldr	$bi,[$b_ptr,8*$i]
+
+	 adds	@acc[0],@acc[1],@tmp[0]
+	mul	@tmp[0],@a[0],$bi
+	 adcs	@acc[1],@acc[2],@tmp[1]
+	mul	@tmp[1],@a[1],$bi
+	 adcs	@acc[2],@acc[3],@tmp[2]
+	mul	@tmp[2],@a[2],$bi
+	 adcs	@acc[3],@acc[4],@tmp[3]
+	mul	@tmp[3],@a[3],$bi
+	 adcs	@acc[4],@acc[5],@tmp[4]
+	mul	@tmp[4],@a[4],$bi
+	 adcs	@acc[5],@acc[6],@tmp[5]
+	mul	@tmp[5],@a[5],$bi
+	 adc	@acc[6],$n0,xzr
+	ldr	$n0,[x29,#96]
+
+	adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@a[0],$bi
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@a[1],$bi
+	adcs	@acc[2],@acc[2],@tmp[2]
+	mul	$n0,$n0,@acc[0]
+	 umulh	@tmp[2],@a[2],$bi
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@a[3],$bi
+	adcs	@acc[4],@acc[4],@tmp[4]
+	 umulh	@tmp[4],@a[4],$bi
+	adcs	@acc[5],@acc[5],@tmp[5]
+	 umulh	@tmp[5],@a[5],$bi
+	adcs	@acc[6],@acc[6],xzr
+	adc	$bi,xzr,xzr
+
+	 adds	@acc[1],@acc[1],@tmp[0]
+	// mul	@tmp[0],@mod[0],$n0
+	 adcs	@acc[2],@acc[2],@tmp[1]
+	mul	@tmp[1],@mod[1],$n0
+	 adcs	@acc[3],@acc[3],@tmp[2]
+	mul	@tmp[2],@mod[2],$n0
+	 adcs	@acc[4],@acc[4],@tmp[3]
+	mul	@tmp[3],@mod[3],$n0
+	 adcs	@acc[5],@acc[5],@tmp[4]
+	mul	@tmp[4],@mod[4],$n0
+	 adcs	@acc[6],@acc[6],@tmp[5]
+	mul	@tmp[5],@mod[5],$n0
+	 adc	$bi,$bi,xzr
+___
+}
+$code.=<<___;
+	subs	xzr,@acc[0],#1		// adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$n0
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$n0
+	adcs	@acc[2],@acc[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$n0
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$n0
+	adcs	@acc[4],@acc[4],@tmp[4]
+	 umulh	@tmp[4],@mod[4],$n0
+	adcs	@acc[5],@acc[5],@tmp[5]
+	 umulh	@tmp[5],@mod[5],$n0
+	adcs	@acc[6],@acc[6],xzr
+	 ldp	$n0,$b_ptr,[x29,#96]	// pull r_ptr
+	adc	$bi,$bi,xzr
+
+	 adds	@acc[0],@acc[1],@tmp[0]
+	 adcs	@acc[1],@acc[2],@tmp[1]
+	 adcs	@acc[2],@acc[3],@tmp[2]
+	 adcs	@acc[3],@acc[4],@tmp[3]
+	 adcs	@acc[4],@acc[5],@tmp[4]
+	 adcs	@acc[5],@acc[6],@tmp[5]
+	 adc	@acc[6],$bi,xzr
+
+	subs	@tmp[0],@acc[0],@mod[0]
+	sbcs	@tmp[1],@acc[1],@mod[1]
+	sbcs	@tmp[2],@acc[2],@mod[2]
+	sbcs	@tmp[3],@acc[3],@mod[3]
+	sbcs	@tmp[4],@acc[4],@mod[4]
+	sbcs	@tmp[5],@acc[5],@mod[5]
+	sbcs	xzr,    @acc[6],xzr
+
+	csel	@a[0],@acc[0],@tmp[0],lo
+	csel	@a[1],@acc[1],@tmp[1],lo
+	csel	@a[2],@acc[2],@tmp[2],lo
+	csel	@a[3],@acc[3],@tmp[3],lo
+	csel	@a[4],@acc[4],@tmp[4],lo
+	csel	@a[5],@acc[5],@tmp[5],lo
+	ret
+.size	__mul_mont_384,.-__mul_mont_384
+
+.globl	sqr_mont_384
+.hidden	sqr_mont_384
+.type	sqr_mont_384,%function
+.align	5
+sqr_mont_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#96		// space for 768-bit vector
+	mov	$n0,$n_ptr		// adjust for missing b_ptr
+
+	mov	$n_ptr,$r_ptr		// save r_ptr
+	mov	$r_ptr,sp
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	bl	__sqr_384
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	mov	$a_ptr,sp
+	mov	$r_ptr,$n_ptr		// restore r_ptr
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+	ldr	x30,[x29,#8]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sqr_mont_384,.-sqr_mont_384
+
+.globl	sqr_n_mul_mont_383
+.hidden	sqr_n_mul_mont_383
+.type	sqr_n_mul_mont_383,%function
+.align	5
+sqr_n_mul_mont_383:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	$n0,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
+	sub	sp,sp,#96		// space for 768-bit vector
+	mov	$bi,x5			// save b_ptr
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	mov	$r_ptr,sp
+.Loop_sqr_383:
+	bl	__sqr_384
+	sub	$b_ptr,$b_ptr,#1	// counter
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	mov	$a_ptr,sp
+	bl	__mul_by_1_mont_384
+
+	ldp	@acc[0],@acc[1],[$a_ptr,#48]
+	ldp	@acc[2],@acc[3],[$a_ptr,#64]
+	ldp	@acc[4],@acc[5],[$a_ptr,#80]
+
+	adds	@a[0],@a[0],@acc[0]	// just accumulate upper half
+	adcs	@a[1],@a[1],@acc[1]
+	adcs	@a[2],@a[2],@acc[2]
+	adcs	@a[3],@a[3],@acc[3]
+	adcs	@a[4],@a[4],@acc[4]
+	adc	@a[5],@a[5],@acc[5]
+
+	cbnz	$b_ptr,.Loop_sqr_383
+
+	mov	$b_ptr,$bi
+	ldr	$bi,[$bi]
+	bl	__mul_mont_384
+	ldr	x30,[x29,#8]
+
+	stp	@a[0],@a[1],[$b_ptr]
+	stp	@a[2],@a[3],[$b_ptr,#16]
+	stp	@a[4],@a[5],[$b_ptr,#32]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sqr_n_mul_mont_383,.-sqr_n_mul_mont_383
+___
+{
+my @acc=(@acc,@tmp[0..2]);
+
+$code.=<<___;
+.type	__sqr_384,%function
+.align	5
+__sqr_384:
+	mul	@acc[0],@a[1],@a[0]
+	mul	@acc[1],@a[2],@a[0]
+	mul	@acc[2],@a[3],@a[0]
+	mul	@acc[3],@a[4],@a[0]
+	mul	@acc[4],@a[5],@a[0]
+
+	 umulh	@mod[1],@a[1],@a[0]
+	 umulh	@mod[2],@a[2],@a[0]
+	 umulh	@mod[3],@a[3],@a[0]
+	 umulh	@mod[4],@a[4],@a[0]
+	 adds	@acc[1],@acc[1],@mod[1]
+	 umulh	@mod[5],@a[5],@a[0]
+	 adcs	@acc[2],@acc[2],@mod[2]
+	mul	@mod[2],@a[2],@a[1]
+	 adcs	@acc[3],@acc[3],@mod[3]
+	mul	@mod[3],@a[3],@a[1]
+	 adcs	@acc[4],@acc[4],@mod[4]
+	mul	@mod[4],@a[4],@a[1]
+	 adc	@acc[5],xzr,    @mod[5]
+	mul	@mod[5],@a[5],@a[1]
+
+	adds	@acc[2],@acc[2],@mod[2]
+	 umulh	@mod[2],@a[2],@a[1]
+	adcs	@acc[3],@acc[3],@mod[3]
+	 umulh	@mod[3],@a[3],@a[1]
+	adcs	@acc[4],@acc[4],@mod[4]
+	 umulh	@mod[4],@a[4],@a[1]
+	adcs	@acc[5],@acc[5],@mod[5]
+	 umulh	@mod[5],@a[5],@a[1]
+	adc	@acc[6],xzr,xzr
+
+	  mul	@mod[0],@a[0],@a[0]
+	 adds	@acc[3],@acc[3],@mod[2]
+	  umulh	@a[0],  @a[0],@a[0]
+	 adcs	@acc[4],@acc[4],@mod[3]
+	mul	@mod[3],@a[3],@a[2]
+	 adcs	@acc[5],@acc[5],@mod[4]
+	mul	@mod[4],@a[4],@a[2]
+	 adc	@acc[6],@acc[6],@mod[5]
+	mul	@mod[5],@a[5],@a[2]
+
+	adds	@acc[4],@acc[4],@mod[3]
+	 umulh	@mod[3],@a[3],@a[2]
+	adcs	@acc[5],@acc[5],@mod[4]
+	 umulh	@mod[4],@a[4],@a[2]
+	adcs	@acc[6],@acc[6],@mod[5]
+	 umulh	@mod[5],@a[5],@a[2]
+	adc	@acc[7],xzr,xzr
+
+	  mul	@mod[1],@a[1],@a[1]
+	 adds	@acc[5],@acc[5],@mod[3]
+	  umulh	@a[1],  @a[1],@a[1]
+	 adcs	@acc[6],@acc[6],@mod[4]
+	mul	@mod[4],@a[4],@a[3]
+	 adc	@acc[7],@acc[7],@mod[5]
+	mul	@mod[5],@a[5],@a[3]
+
+	adds	@acc[6],@acc[6],@mod[4]
+	 umulh	@mod[4],@a[4],@a[3]
+	adcs	@acc[7],@acc[7],@mod[5]
+	 umulh	@mod[5],@a[5],@a[3]
+	adc	@acc[8],xzr,xzr
+	  mul	@mod[2],@a[2],@a[2]
+	 adds	@acc[7],@acc[7],@mod[4]
+	  umulh	@a[2],  @a[2],@a[2]
+	 adc	@acc[8],@acc[8],@mod[5]
+	  mul	@mod[3],@a[3],@a[3]
+
+	mul	@mod[5],@a[5],@a[4]
+	  umulh	@a[3],  @a[3],@a[3]
+	adds	@acc[8],@acc[8],@mod[5]
+	 umulh	@mod[5],@a[5],@a[4]
+	  mul	@mod[4],@a[4],@a[4]
+	adc	@acc[9],@mod[5],xzr
+
+	adds	@acc[0],@acc[0],@acc[0]
+	adcs	@acc[1],@acc[1],@acc[1]
+	adcs	@acc[2],@acc[2],@acc[2]
+	adcs	@acc[3],@acc[3],@acc[3]
+	adcs	@acc[4],@acc[4],@acc[4]
+	adcs	@acc[5],@acc[5],@acc[5]
+	adcs	@acc[6],@acc[6],@acc[6]
+	adcs	@acc[7],@acc[7],@acc[7]
+	  umulh	@a[4],  @a[4],@a[4]
+	adcs	@acc[8],@acc[8],@acc[8]
+	  mul	@mod[5],@a[5],@a[5]
+	adcs	@acc[9],@acc[9],@acc[9]
+	  umulh	@a[5],  @a[5],@a[5]
+	adc	$a_ptr,xzr,xzr
+
+	adds	@acc[0],@acc[0],@a[0]
+	adcs	@acc[1],@acc[1],@mod[1]
+	adcs	@acc[2],@acc[2],@a[1]
+	adcs	@acc[3],@acc[3],@mod[2]
+	adcs	@acc[4],@acc[4],@a[2]
+	adcs	@acc[5],@acc[5],@mod[3]
+	adcs	@acc[6],@acc[6],@a[3]
+	stp	@mod[0],@acc[0],[$r_ptr]
+	adcs	@acc[7],@acc[7],@mod[4]
+	stp	@acc[1],@acc[2],[$r_ptr,#16]
+	adcs	@acc[8],@acc[8],@a[4]
+	stp	@acc[3],@acc[4],[$r_ptr,#32]
+	adcs	@acc[9],@acc[9],@mod[5]
+	stp	@acc[5],@acc[6],[$r_ptr,#48]
+	adc	@a[5],@a[5],$a_ptr
+	stp	@acc[7],@acc[8],[$r_ptr,#64]
+	stp	@acc[9],@a[5],[$r_ptr,#80]
+
+	ret
+.size	__sqr_384,.-__sqr_384
+___
+}
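The squaring helper above uses the classic shortcut: compute only the off-diagonal products a[i]*a[j] for i<j, double the accumulated partial result, then add the diagonal squares a[i]*a[i]. A two-limb toy version in C, assuming a gcc/clang-style __uint128_t and my own helper name, with lazy carry handling instead of the scheduled adds/adcs above:

#include <stdint.h>
#include <stdio.h>

/* Two-limb toy of the __sqr_384 structure: one off-diagonal product a0*a1,
 * doubled, plus the diagonal squares a0^2 and a1^2.  The 384-bit routine
 * does the same with six limbs and explicit adds/adcs carry chains. */
static void sqr_2limb(uint64_t r[4], uint64_t a0, uint64_t a1)
{
    __uint128_t cross = (__uint128_t)a0 * a1;       /* a[i]*a[j], i<j   */
    __uint128_t lo = (__uint128_t)a0 * a0;          /* diagonal squares */
    __uint128_t hi = (__uint128_t)a1 * a1;

    uint64_t c0 = (uint64_t)cross, c1 = (uint64_t)(cross >> 64);
    uint64_t top = c1 >> 63;                        /* bit shifted out by doubling */
    c1 = (c1 << 1) | (c0 >> 63);                    /* 2*cross                     */
    c0 <<= 1;

    r[0] = (uint64_t)lo;
    __uint128_t t = (lo >> 64) + c0;
    r[1] = (uint64_t)t;
    t = (t >> 64) + (__uint128_t)c1 + (uint64_t)hi;
    r[2] = (uint64_t)t;
    r[3] = (uint64_t)(t >> 64) + (uint64_t)(hi >> 64) + top;
}

int main(void)
{
    uint64_t r[4];
    sqr_2limb(r, 0x123456789abcdef0ULL, 0xfedcba9876543210ULL);
    printf("%016llx %016llx %016llx %016llx\n",
           (unsigned long long)r[3], (unsigned long long)r[2],
           (unsigned long long)r[1], (unsigned long long)r[0]);
    return 0;
}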
+$code.=<<___;
+.globl	sqr_384
+.hidden	sqr_384
+.type	sqr_384,%function
+.align	5
+sqr_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	bl	__sqr_384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sqr_384,.-sqr_384
+
+.globl	redc_mont_384
+.hidden	redc_mont_384
+.type	redc_mont_384,%function
+.align	5
+redc_mont_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	mov	$n0,$n_ptr		// adjust for missing b_ptr
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	redc_mont_384,.-redc_mont_384
+
+.globl	from_mont_384
+.hidden	from_mont_384
+.type	from_mont_384,%function
+.align	5
+from_mont_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	mov	$n0,$n_ptr		// adjust for missing b_ptr
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	bl	__mul_by_1_mont_384
+	ldr	x30,[x29,#8]
+
+	subs	@acc[0],@a[0],@mod[0]
+	sbcs	@acc[1],@a[1],@mod[1]
+	sbcs	@acc[2],@a[2],@mod[2]
+	sbcs	@acc[3],@a[3],@mod[3]
+	sbcs	@acc[4],@a[4],@mod[4]
+	sbcs	@acc[5],@a[5],@mod[5]
+
+	csel	@a[0],@a[0],@acc[0],lo
+	csel	@a[1],@a[1],@acc[1],lo
+	csel	@a[2],@a[2],@acc[2],lo
+	csel	@a[3],@a[3],@acc[3],lo
+	csel	@a[4],@a[4],@acc[4],lo
+	csel	@a[5],@a[5],@acc[5],lo
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	from_mont_384,.-from_mont_384
+
+.type	__mul_by_1_mont_384,%function
+.align	5
+__mul_by_1_mont_384:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	mul	@tmp[0],$n0,@a[0]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	// mul	@acc[0],@mod[0],@tmp[0]
+	mul	@acc[1],@mod[1],@tmp[0]
+	mul	@acc[2],@mod[2],@tmp[0]
+	mul	@acc[3],@mod[3],@tmp[0]
+	mul	@acc[4],@mod[4],@tmp[0]
+	mul	@acc[5],@mod[5],@tmp[0]
+	subs	xzr,@a[0],#1		// adds	@acc[0],@acc[0],@a[0]
+	 umulh	@a[0],@mod[0],@tmp[0]
+	adcs	@acc[1],@acc[1],@a[1]
+	 umulh	@a[1],@mod[1],@tmp[0]
+	adcs	@acc[2],@acc[2],@a[2]
+	 umulh	@a[2],@mod[2],@tmp[0]
+	adcs	@acc[3],@acc[3],@a[3]
+	 umulh	@a[3],@mod[3],@tmp[0]
+	adcs	@acc[4],@acc[4],@a[4]
+	 umulh	@a[4],@mod[4],@tmp[0]
+	adcs	@acc[5],@acc[5],@a[5]
+	 umulh	@a[5],@mod[5],@tmp[0]
+	adc	@acc[6],xzr,xzr
+___
+for ($i=1;$i<6;$i++) {
+$code.=<<___;
+	 adds	@a[0],@a[0],@acc[1]
+	 adcs	@a[1],@a[1],@acc[2]
+	 adcs	@a[2],@a[2],@acc[3]
+	mul	@tmp[0],$n0,@a[0]
+	 adcs	@a[3],@a[3],@acc[4]
+	 adcs	@a[4],@a[4],@acc[5]
+	 adc	@a[5],@a[5],@acc[6]
+
+	// mul	@acc[0],@mod[0],@tmp[0]
+	mul	@acc[1],@mod[1],@tmp[0]
+	mul	@acc[2],@mod[2],@tmp[0]
+	mul	@acc[3],@mod[3],@tmp[0]
+	mul	@acc[4],@mod[4],@tmp[0]
+	mul	@acc[5],@mod[5],@tmp[0]
+	subs	xzr,@a[0],#1		// adds	@acc[0],@acc[0],@a[0]
+	 umulh	@a[0],@mod[0],@tmp[0]
+	adcs	@acc[1],@acc[1],@a[1]
+	 umulh	@a[1],@mod[1],@tmp[0]
+	adcs	@acc[2],@acc[2],@a[2]
+	 umulh	@a[2],@mod[2],@tmp[0]
+	adcs	@acc[3],@acc[3],@a[3]
+	 umulh	@a[3],@mod[3],@tmp[0]
+	adcs	@acc[4],@acc[4],@a[4]
+	 umulh	@a[4],@mod[4],@tmp[0]
+	adcs	@acc[5],@acc[5],@a[5]
+	 umulh	@a[5],@mod[5],@tmp[0]
+	adc	@acc[6],xzr,xzr
+___
+}
+$code.=<<___;
+	adds	@a[0],@a[0],@acc[1]
+	adcs	@a[1],@a[1],@acc[2]
+	adcs	@a[2],@a[2],@acc[3]
+	adcs	@a[3],@a[3],@acc[4]
+	adcs	@a[4],@a[4],@acc[5]
+	adc	@a[5],@a[5],@acc[6]
+
+	ret
+.size	__mul_by_1_mont_384,.-__mul_by_1_mont_384
+
+.type	__redc_tail_mont_384,%function
+.align	5
+__redc_tail_mont_384:
+	ldp	@acc[0],@acc[1],[$a_ptr,#48]
+	ldp	@acc[2],@acc[3],[$a_ptr,#64]
+	ldp	@acc[4],@acc[5],[$a_ptr,#80]
+
+	adds	@a[0],@a[0],@acc[0]	// accumulate upper half
+	adcs	@a[1],@a[1],@acc[1]
+	adcs	@a[2],@a[2],@acc[2]
+	adcs	@a[3],@a[3],@acc[3]
+	adcs	@a[4],@a[4],@acc[4]
+	adcs	@a[5],@a[5],@acc[5]
+	adc	@acc[6],xzr,xzr
+
+	subs	@acc[0],@a[0],@mod[0]
+	sbcs	@acc[1],@a[1],@mod[1]
+	sbcs	@acc[2],@a[2],@mod[2]
+	sbcs	@acc[3],@a[3],@mod[3]
+	sbcs	@acc[4],@a[4],@mod[4]
+	sbcs	@acc[5],@a[5],@mod[5]
+	sbcs	xzr,@acc[6],xzr
+
+	csel	@a[0],@a[0],@acc[0],lo
+	csel	@a[1],@a[1],@acc[1],lo
+	csel	@a[2],@a[2],@acc[2],lo
+	csel	@a[3],@a[3],@acc[3],lo
+	csel	@a[4],@a[4],@acc[4],lo
+	csel	@a[5],@a[5],@acc[5],lo
+
+	stp	@a[0],@a[1],[$r_ptr]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+
+	ret
+.size	__redc_tail_mont_384,.-__redc_tail_mont_384
+
+.globl	mul_384
+.hidden	mul_384
+.type	mul_384,%function
+.align	5
+mul_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	bl	__mul_384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	mul_384,.-mul_384
+
+.type	__mul_384,%function
+.align	5
+__mul_384:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldr	$bi,        [$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	mul	@acc[0],@a[0],$bi
+	mul	@acc[1],@a[1],$bi
+	mul	@acc[2],@a[2],$bi
+	mul	@acc[3],@a[3],$bi
+	mul	@acc[4],@a[4],$bi
+	mul	@acc[5],@a[5],$bi
+
+	 umulh	@mod[0],@a[0],$bi
+	 umulh	@mod[1],@a[1],$bi
+	 umulh	@mod[2],@a[2],$bi
+	 umulh	@mod[3],@a[3],$bi
+	 umulh	@mod[4],@a[4],$bi
+	 umulh	@mod[5],@a[5],$bi
+	ldr	$bi,[$b_ptr,8*1]
+
+	str	@acc[0],[$r_ptr]
+	 adds	@acc[0],@acc[1],@mod[0]
+	mul	@mod[0],@a[0],$bi
+	 adcs	@acc[1],@acc[2],@mod[1]
+	mul	@mod[1],@a[1],$bi
+	 adcs	@acc[2],@acc[3],@mod[2]
+	mul	@mod[2],@a[2],$bi
+	 adcs	@acc[3],@acc[4],@mod[3]
+	mul	@mod[3],@a[3],$bi
+	 adcs	@acc[4],@acc[5],@mod[4]
+	mul	@mod[4],@a[4],$bi
+	 adc	@acc[5],xzr,    @mod[5]
+	mul	@mod[5],@a[5],$bi
+___
+for ($i=1;$i<5;$i++) {
+$code.=<<___;
+	adds	@acc[0],@acc[0],@mod[0]
+	 umulh	@mod[0],@a[0],$bi
+	adcs	@acc[1],@acc[1],@mod[1]
+	 umulh	@mod[1],@a[1],$bi
+	adcs	@acc[2],@acc[2],@mod[2]
+	 umulh	@mod[2],@a[2],$bi
+	adcs	@acc[3],@acc[3],@mod[3]
+	 umulh	@mod[3],@a[3],$bi
+	adcs	@acc[4],@acc[4],@mod[4]
+	 umulh	@mod[4],@a[4],$bi
+	adcs	@acc[5],@acc[5],@mod[5]
+	 umulh	@mod[5],@a[5],$bi
+	ldr	$bi,[$b_ptr,#8*($i+1)]
+	adc	@acc[6],xzr,xzr
+
+	str	@acc[0],[$r_ptr,8*$i]
+	 adds	@acc[0],@acc[1],@mod[0]
+	mul	@mod[0],@a[0],$bi
+	 adcs	@acc[1],@acc[2],@mod[1]
+	mul	@mod[1],@a[1],$bi
+	 adcs	@acc[2],@acc[3],@mod[2]
+	mul	@mod[2],@a[2],$bi
+	 adcs	@acc[3],@acc[4],@mod[3]
+	mul	@mod[3],@a[3],$bi
+	 adcs	@acc[4],@acc[5],@mod[4]
+	mul	@mod[4],@a[4],$bi
+	 adc	@acc[5],@acc[6],@mod[5]
+	mul	@mod[5],@a[5],$bi
+___
+}
+$code.=<<___;
+	adds	@acc[0],@acc[0],@mod[0]
+	 umulh	@mod[0],@a[0],$bi
+	adcs	@acc[1],@acc[1],@mod[1]
+	 umulh	@mod[1],@a[1],$bi
+	adcs	@acc[2],@acc[2],@mod[2]
+	 umulh	@mod[2],@a[2],$bi
+	adcs	@acc[3],@acc[3],@mod[3]
+	 umulh	@mod[3],@a[3],$bi
+	adcs	@acc[4],@acc[4],@mod[4]
+	 umulh	@mod[4],@a[4],$bi
+	adcs	@acc[5],@acc[5],@mod[5]
+	 umulh	@mod[5],@a[5],$bi
+	adc	@acc[6],xzr,xzr
+
+	str	@acc[0],[$r_ptr,8*$i]
+	 adds	@acc[0],@acc[1],@mod[0]
+	 adcs	@acc[1],@acc[2],@mod[1]
+	 adcs	@acc[2],@acc[3],@mod[2]
+	 adcs	@acc[3],@acc[4],@mod[3]
+	 adcs	@acc[4],@acc[5],@mod[4]
+	 adc	@acc[5],@acc[6],@mod[5]
+
+	stp	@acc[0],@acc[1],[$r_ptr,#48]
+	stp	@acc[2],@acc[3],[$r_ptr,#64]
+	stp	@acc[4],@acc[5],[$r_ptr,#80]
+
+	ret
+.size	__mul_384,.-__mul_384
+
+.globl	mul_382x
+.hidden	mul_382x
+.type	mul_382x,%function
+.align	5
+mul_382x:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#96		// space for two 384-bit vectors
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	mov	@tmp[0],$r_ptr		// save r_ptr
+	ldp	@acc[0],@acc[1],[$a_ptr,#48]
+	mov	@tmp[1],$a_ptr		// save a_ptr
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	mov	@tmp[2],$b_ptr		// save b_ptr
+	ldp	@acc[2],@acc[3],[$a_ptr,#64]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	adds	@mod[0],$a[0],@acc[0]	// t0 = a->re + a->im
+	ldp	@acc[4],@acc[5],[$a_ptr,#80]
+	adcs	@mod[1],$a[1],@acc[1]
+	 ldp	@a[0],@a[1],[$b_ptr]
+	adcs	@mod[2],$a[2],@acc[2]
+	 ldp	@acc[0],@acc[1],[$b_ptr,#48]
+	adcs	@mod[3],$a[3],@acc[3]
+	 ldp	@a[2],@a[3],[$b_ptr,#16]
+	adcs	@mod[4],$a[4],@acc[4]
+	 ldp	@acc[2],@acc[3],[$b_ptr,#64]
+	adc	@mod[5],$a[5],@acc[5]
+	 ldp	@a[4],@a[5],[$b_ptr,#32]
+
+	stp	@mod[0],@mod[1],[sp]
+	 adds	@mod[0],$a[0],@acc[0]	// t1 = b->re + b->im
+	 ldp	@acc[4],@acc[5],[$b_ptr,#80]
+	 adcs	@mod[1],$a[1],@acc[1]
+	stp	@mod[2],@mod[3],[sp,#16]
+	 adcs	@mod[2],$a[2],@acc[2]
+	 adcs	@mod[3],$a[3],@acc[3]
+	 stp	@mod[4],@mod[5],[sp,#32]
+	 adcs	@mod[4],$a[4],@acc[4]
+	 stp	@mod[0],@mod[1],[sp,#48]
+	 adc	@mod[5],$a[5],@acc[5]
+	 stp	@mod[2],@mod[3],[sp,#64]
+	 stp	@mod[4],@mod[5],[sp,#80]
+
+	bl	__mul_384		// mul_384(ret->re, a->re, b->re)
+
+	add	$a_ptr,sp,#0		// mul_384(ret->im, t0, t1)
+	add	$b_ptr,sp,#48
+	add	$r_ptr,@tmp[0],#96
+	bl	__mul_384
+
+	add	$a_ptr,@tmp[1],#48	// mul_384(tx, a->im, b->im)
+	add	$b_ptr,@tmp[2],#48
+	add	$r_ptr,sp,#0
+	bl	__mul_384
+
+	ldp	@mod[0],@mod[1],[$n_ptr]
+	ldp	@mod[2],@mod[3],[$n_ptr,#16]
+	ldp	@mod[4],@mod[5],[$n_ptr,#32]
+
+	add	$a_ptr,@tmp[0],#96	// ret->im -= tx
+	add	$b_ptr,sp,#0
+	add	$r_ptr,@tmp[0],#96
+	bl	__sub_mod_384x384
+
+	add	$b_ptr,@tmp[0],#0	// ret->im -= ret->re
+	bl	__sub_mod_384x384
+
+	add	$a_ptr,@tmp[0],#0	// ret->re -= tx
+	add	$b_ptr,sp,#0
+	add	$r_ptr,@tmp[0],#0
+	bl	__sub_mod_384x384
+	ldr	x30,[x29,#8]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	mul_382x,.-mul_382x
+
+.globl	sqr_382x
+.hidden	sqr_382x
+.type	sqr_382x,%function
+.align	5
+sqr_382x:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@acc[0],@acc[1],[$a_ptr,#48]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	adds	@mod[0],$a[0],@acc[0]	// t0 = a->re + a->im
+	ldp	@acc[2],@acc[3],[$a_ptr,#64]
+	adcs	@mod[1],$a[1],@acc[1]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	adcs	@mod[2],$a[2],@acc[2]
+	ldp	@acc[4],@acc[5],[$a_ptr,#80]
+	adcs	@mod[3],$a[3],@acc[3]
+	stp	@mod[0],@mod[1],[$r_ptr]
+	adcs	@mod[4],$a[4],@acc[4]
+	 ldp	@mod[0],@mod[1],[$b_ptr]
+	adc	@mod[5],$a[5],@acc[5]
+	stp	@mod[2],@mod[3],[$r_ptr,#16]
+
+	subs	@a[0],$a[0],@acc[0]	// t1 = a->re - a->im
+	 ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	sbcs	@a[1],$a[1],@acc[1]
+	stp	@mod[4],@mod[5],[$r_ptr,#32]
+	sbcs	@a[2],$a[2],@acc[2]
+	 ldp	@mod[4],@mod[5],[$b_ptr,#32]
+	sbcs	@a[3],$a[3],@acc[3]
+	sbcs	@a[4],$a[4],@acc[4]
+	sbcs	@a[5],$a[5],@acc[5]
+	sbc	@acc[6],xzr,xzr
+
+	 and	@acc[0],@mod[0],@acc[6]
+	 and	@acc[1],@mod[1],@acc[6]
+	adds	@a[0],@a[0],@acc[0]
+	 and	@acc[2],@mod[2],@acc[6]
+	adcs	@a[1],@a[1],@acc[1]
+	 and	@acc[3],@mod[3],@acc[6]
+	adcs	@a[2],@a[2],@acc[2]
+	 and	@acc[4],@mod[4],@acc[6]
+	adcs	@a[3],@a[3],@acc[3]
+	 and	@acc[5],@mod[5],@acc[6]
+	adcs	@a[4],@a[4],@acc[4]
+	stp	@a[0],@a[1],[$r_ptr,#48]
+	adc	@a[5],@a[5],@acc[5]
+	stp	@a[2],@a[3],[$r_ptr,#64]
+	stp	@a[4],@a[5],[$r_ptr,#80]
+
+	mov	$n0,$a_ptr		// save a_ptr
+	add	$a_ptr,$r_ptr,#0	// mul_384(ret->re, t0, t1)
+	add	$b_ptr,$r_ptr,#48
+	bl	__mul_384
+
+	add	$a_ptr,$n0,#0		// mul_384(ret->im, a->re, a->im)
+	add	$b_ptr,$n0,#48
+	add	$r_ptr,$r_ptr,#96
+	bl	__mul_384
+	ldr	x30,[x29,#8]
+
+	ldp	@a[0],@a[1],[$r_ptr]
+	ldp	@a[2],@a[3],[$r_ptr,#16]
+	adds	@a[0],@a[0],@a[0]	// add with itself
+	ldp	@a[4],@a[5],[$r_ptr,#32]
+	adcs	@a[1],@a[1],@a[1]
+	adcs	@a[2],@a[2],@a[2]
+	adcs	@a[3],@a[3],@a[3]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adcs	@acc[0],@acc[0],@acc[0]
+	adcs	@acc[1],@acc[1],@acc[1]
+	stp	@a[0],@a[1],[$r_ptr]
+	adcs	@acc[2],@acc[2],@acc[2]
+	stp	@a[2],@a[3],[$r_ptr,#16]
+	adcs	@acc[3],@acc[3],@acc[3]
+	stp	@a[4],@a[5],[$r_ptr,#32]
+	adcs	@acc[4],@acc[4],@acc[4]
+	stp	@acc[0],@acc[1],[$r_ptr,#48]
+	adc	@acc[5],@acc[5],@acc[5]
+	stp	@acc[2],@acc[3],[$r_ptr,#64]
+	stp	@acc[4],@acc[5],[$r_ptr,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sqr_382x,.-sqr_382x
+
+.globl	sqr_mont_382x
+.hidden	sqr_mont_382x
+.type	sqr_mont_382x,%function
+.align	5
+sqr_mont_382x:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	$n_ptr,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
+	sub	sp,sp,#112		// space for two 384-bit vectors + word
+	mov	$n0,$n_ptr		// adjust for missing b_ptr
+
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+
+	ldp	$bi,@acc[1],[$a_ptr,#48]
+	ldp	@acc[2],@acc[3],[$a_ptr,#64]
+	ldp	@acc[4],@acc[5],[$a_ptr,#80]
+
+	adds	@mod[0],$a[0],$bi	// t0 = a->re + a->im
+	adcs	@mod[1],$a[1],@acc[1]
+	adcs	@mod[2],$a[2],@acc[2]
+	adcs	@mod[3],$a[3],@acc[3]
+	adcs	@mod[4],$a[4],@acc[4]
+	adc	@mod[5],$a[5],@acc[5]
+
+	subs	@acc[0],$a[0],$bi	// t1 = a->re - a->im
+	sbcs	@acc[1],$a[1],@acc[1]
+	sbcs	@acc[2],$a[2],@acc[2]
+	sbcs	@acc[3],$a[3],@acc[3]
+	sbcs	@acc[4],$a[4],@acc[4]
+	sbcs	@acc[5],$a[5],@acc[5]
+	sbc	@acc[6],xzr,xzr		// borrow flag as mask
+
+	stp	@mod[0],@mod[1],[sp]
+	stp	@mod[2],@mod[3],[sp,#16]
+	stp	@mod[4],@mod[5],[sp,#32]
+	stp	@acc[0],@acc[1],[sp,#48]
+	stp	@acc[2],@acc[3],[sp,#64]
+	stp	@acc[4],@acc[5],[sp,#80]
+	str	@acc[6],[sp,#96]
+
+	ldp	@mod[0],@mod[1],[$b_ptr]
+	ldp	@mod[2],@mod[3],[$b_ptr,#16]
+	ldp	@mod[4],@mod[5],[$b_ptr,#32]
+
+	add	$b_ptr,$a_ptr,#48
+	bl	__mul_mont_383_nonred	// mul_mont_384(ret->im, a->re, a->im)
+
+	adds	@acc[0],@a[0],@a[0]	// add with itself
+	adcs	@acc[1],@a[1],@a[1]
+	adcs	@acc[2],@a[2],@a[2]
+	adcs	@acc[3],@a[3],@a[3]
+	adcs	@acc[4],@a[4],@a[4]
+	adc	@acc[5],@a[5],@a[5]
+
+	stp	@acc[0],@acc[1],[$b_ptr,#48]
+	stp	@acc[2],@acc[3],[$b_ptr,#64]
+	stp	@acc[4],@acc[5],[$b_ptr,#80]
+
+	ldp	@a[0],@a[1],[sp]
+	ldr	$bi,[sp,#48]
+	ldp	@a[2],@a[3],[sp,#16]
+	ldp	@a[4],@a[5],[sp,#32]
+
+	add	$b_ptr,sp,#48
+	bl	__mul_mont_383_nonred	// mul_mont_384(ret->re, t0, t1)
+	ldr	x30,[x29,#8]
+
+	ldr	@acc[6],[sp,#96]	// account for sign from a->re - a->im
+	ldp	@acc[0],@acc[1],[sp]
+	ldp	@acc[2],@acc[3],[sp,#16]
+	ldp	@acc[4],@acc[5],[sp,#32]
+
+	and	@acc[0],@acc[0],@acc[6]
+	and	@acc[1],@acc[1],@acc[6]
+	and	@acc[2],@acc[2],@acc[6]
+	and	@acc[3],@acc[3],@acc[6]
+	and	@acc[4],@acc[4],@acc[6]
+	and	@acc[5],@acc[5],@acc[6]
+
+	subs	@a[0],@a[0],@acc[0]
+	sbcs	@a[1],@a[1],@acc[1]
+	sbcs	@a[2],@a[2],@acc[2]
+	sbcs	@a[3],@a[3],@acc[3]
+	sbcs	@a[4],@a[4],@acc[4]
+	sbcs	@a[5],@a[5],@acc[5]
+	sbc	@acc[6],xzr,xzr
+
+	and	@acc[0],@mod[0],@acc[6]
+	and	@acc[1],@mod[1],@acc[6]
+	and	@acc[2],@mod[2],@acc[6]
+	and	@acc[3],@mod[3],@acc[6]
+	and	@acc[4],@mod[4],@acc[6]
+	and	@acc[5],@mod[5],@acc[6]
+
+	adds	@a[0],@a[0],@acc[0]
+	adcs	@a[1],@a[1],@acc[1]
+	adcs	@a[2],@a[2],@acc[2]
+	adcs	@a[3],@a[3],@acc[3]
+	adcs	@a[4],@a[4],@acc[4]
+	adc	@a[5],@a[5],@acc[5]
+
+	stp	@a[0],@a[1],[$b_ptr]
+	stp	@a[2],@a[3],[$b_ptr,#16]
+	stp	@a[4],@a[5],[$b_ptr,#32]
+
+	add	sp,sp,#112
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sqr_mont_382x,.-sqr_mont_382x
+
+.type	__mul_mont_383_nonred,%function
+.align	5
+__mul_mont_383_nonred:
+	mul	@acc[0],@a[0],$bi
+	mul	@acc[1],@a[1],$bi
+	mul	@acc[2],@a[2],$bi
+	mul	@acc[3],@a[3],$bi
+	mul	@acc[4],@a[4],$bi
+	mul	@acc[5],@a[5],$bi
+	mul	$n0,$n0,@acc[0]
+
+	 umulh	@tmp[0],@a[0],$bi
+	 umulh	@tmp[1],@a[1],$bi
+	 umulh	@tmp[2],@a[2],$bi
+	 umulh	@tmp[3],@a[3],$bi
+	 umulh	@tmp[4],@a[4],$bi
+	 umulh	@tmp[5],@a[5],$bi
+
+	 adds	@acc[1],@acc[1],@tmp[0]
+	mul	@tmp[0],@mod[0],$n0
+	 adcs	@acc[2],@acc[2],@tmp[1]
+	mul	@tmp[1],@mod[1],$n0
+	 adcs	@acc[3],@acc[3],@tmp[2]
+	mul	@tmp[2],@mod[2],$n0
+	 adcs	@acc[4],@acc[4],@tmp[3]
+	mul	@tmp[3],@mod[3],$n0
+	 adcs	@acc[5],@acc[5],@tmp[4]
+	mul	@tmp[4],@mod[4],$n0
+	 adc	@acc[6],xzr,    @tmp[5]
+	mul	@tmp[5],@mod[5],$n0
+___
+for ($i=1;$i<6;$i++) {
+$code.=<<___;
+	ldr	$bi,[$b_ptr,8*$i]
+	adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$n0
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$n0
+	adcs	@acc[2],@acc[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$n0
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$n0
+	adcs	@acc[4],@acc[4],@tmp[4]
+	 umulh	@tmp[4],@mod[4],$n0
+	adcs	@acc[5],@acc[5],@tmp[5]
+	 umulh	@tmp[5],@mod[5],$n0
+	adc	@acc[6],@acc[6],xzr
+
+	ldr	$n0,[x29,#96]
+	 adds	@acc[0],@acc[1],@tmp[0]
+	mul	@tmp[0],@a[0],$bi
+	 adcs	@acc[1],@acc[2],@tmp[1]
+	mul	@tmp[1],@a[1],$bi
+	 adcs	@acc[2],@acc[3],@tmp[2]
+	mul	@tmp[2],@a[2],$bi
+	 adcs	@acc[3],@acc[4],@tmp[3]
+	mul	@tmp[3],@a[3],$bi
+	 adcs	@acc[4],@acc[5],@tmp[4]
+	mul	@tmp[4],@a[4],$bi
+	 adcs	@acc[5],@acc[6],@tmp[5]
+	mul	@tmp[5],@a[5],$bi
+	 adc	@acc[6],xzr,xzr
+
+	adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@a[0],$bi
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@a[1],$bi
+	adcs	@acc[2],@acc[2],@tmp[2]
+	mul	$n0,$n0,@acc[0]
+	 umulh	@tmp[2],@a[2],$bi
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@a[3],$bi
+	adcs	@acc[4],@acc[4],@tmp[4]
+	 umulh	@tmp[4],@a[4],$bi
+	adcs	@acc[5],@acc[5],@tmp[5]
+	 umulh	@tmp[5],@a[5],$bi
+	adc	@acc[6],@acc[6],xzr
+
+	 adds	@acc[1],@acc[1],@tmp[0]
+	mul	@tmp[0],@mod[0],$n0
+	 adcs	@acc[2],@acc[2],@tmp[1]
+	mul	@tmp[1],@mod[1],$n0
+	 adcs	@acc[3],@acc[3],@tmp[2]
+	mul	@tmp[2],@mod[2],$n0
+	 adcs	@acc[4],@acc[4],@tmp[3]
+	mul	@tmp[3],@mod[3],$n0
+	 adcs	@acc[5],@acc[5],@tmp[4]
+	mul	@tmp[4],@mod[4],$n0
+	 adc	@acc[6],@acc[6],@tmp[5]
+	mul	@tmp[5],@mod[5],$n0
+___
+}
+$code.=<<___;
+	adds	@acc[0],@acc[0],@tmp[0]
+	 umulh	@tmp[0],@mod[0],$n0
+	adcs	@acc[1],@acc[1],@tmp[1]
+	 umulh	@tmp[1],@mod[1],$n0
+	adcs	@acc[2],@acc[2],@tmp[2]
+	 umulh	@tmp[2],@mod[2],$n0
+	adcs	@acc[3],@acc[3],@tmp[3]
+	 umulh	@tmp[3],@mod[3],$n0
+	adcs	@acc[4],@acc[4],@tmp[4]
+	 umulh	@tmp[4],@mod[4],$n0
+	adcs	@acc[5],@acc[5],@tmp[5]
+	 umulh	@tmp[5],@mod[5],$n0
+	adc	@acc[6],@acc[6],xzr
+	 ldp	$n0,$b_ptr,[x29,#96]		// pull r_ptr
+
+	 adds	@a[0],@acc[1],@tmp[0]
+	 adcs	@a[1],@acc[2],@tmp[1]
+	 adcs	@a[2],@acc[3],@tmp[2]
+	 adcs	@a[3],@acc[4],@tmp[3]
+	 adcs	@a[4],@acc[5],@tmp[4]
+	 adcs	@a[5],@acc[6],@tmp[5]
+
+	ret
+.size	__mul_mont_383_nonred,.-__mul_mont_383_nonred
+
+.globl	sgn0_pty_mont_384
+.hidden	sgn0_pty_mont_384
+.type	sgn0_pty_mont_384,%function
+.align	5
+sgn0_pty_mont_384:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	mov	$n0,$b_ptr
+	ldp	@mod[0],@mod[1],[$a_ptr]
+	ldp	@mod[2],@mod[3],[$a_ptr,#16]
+	ldp	@mod[4],@mod[5],[$a_ptr,#32]
+	mov	$a_ptr,$r_ptr
+
+	bl	__mul_by_1_mont_384
+	ldr	x30,[x29,#8]
+
+	and	$r_ptr,@a[0],#1
+	adds	@a[0],@a[0],@a[0]
+	adcs	@a[1],@a[1],@a[1]
+	adcs	@a[2],@a[2],@a[2]
+	adcs	@a[3],@a[3],@a[3]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	$bi,xzr,xzr
+
+	subs	@a[0],@a[0],@mod[0]
+	sbcs	@a[1],@a[1],@mod[1]
+	sbcs	@a[2],@a[2],@mod[2]
+	sbcs	@a[3],@a[3],@mod[3]
+	sbcs	@a[4],@a[4],@mod[4]
+	sbcs	@a[5],@a[5],@mod[5]
+	sbc	$bi,$bi,xzr
+
+	mvn	$bi,$bi
+	and	$bi,$bi,#2
+	orr	$r_ptr,$r_ptr,$bi
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sgn0_pty_mont_384,.-sgn0_pty_mont_384
+
+.globl	sgn0_pty_mont_384x
+.hidden	sgn0_pty_mont_384x
+.type	sgn0_pty_mont_384x,%function
+.align	5
+sgn0_pty_mont_384x:
+	paciasp
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	mov	$n0,$b_ptr
+	ldp	@mod[0],@mod[1],[$a_ptr]
+	ldp	@mod[2],@mod[3],[$a_ptr,#16]
+	ldp	@mod[4],@mod[5],[$a_ptr,#32]
+	mov	$a_ptr,$r_ptr
+
+	bl	__mul_by_1_mont_384
+	add	$a_ptr,$a_ptr,#48
+
+	and	$b_ptr,@a[0],#1
+	 orr	$n_ptr,@a[0],@a[1]
+	adds	@a[0],@a[0],@a[0]
+	 orr	$n_ptr,$n_ptr,@a[2]
+	adcs	@a[1],@a[1],@a[1]
+	 orr	$n_ptr,$n_ptr,@a[3]
+	adcs	@a[2],@a[2],@a[2]
+	 orr	$n_ptr,$n_ptr,@a[4]
+	adcs	@a[3],@a[3],@a[3]
+	 orr	$n_ptr,$n_ptr,@a[5]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	$bi,xzr,xzr
+
+	subs	@a[0],@a[0],@mod[0]
+	sbcs	@a[1],@a[1],@mod[1]
+	sbcs	@a[2],@a[2],@mod[2]
+	sbcs	@a[3],@a[3],@mod[3]
+	sbcs	@a[4],@a[4],@mod[4]
+	sbcs	@a[5],@a[5],@mod[5]
+	sbc	$bi,$bi,xzr
+
+	mvn	$bi,$bi
+	and	$bi,$bi,#2
+	orr	$b_ptr,$b_ptr,$bi
+
+	bl	__mul_by_1_mont_384
+	ldr	x30,[x29,#8]
+
+	and	$r_ptr,@a[0],#1
+	 orr	$a_ptr,@a[0],@a[1]
+	adds	@a[0],@a[0],@a[0]
+	 orr	$a_ptr,$a_ptr,@a[2]
+	adcs	@a[1],@a[1],@a[1]
+	 orr	$a_ptr,$a_ptr,@a[3]
+	adcs	@a[2],@a[2],@a[2]
+	 orr	$a_ptr,$a_ptr,@a[4]
+	adcs	@a[3],@a[3],@a[3]
+	 orr	$a_ptr,$a_ptr,@a[5]
+	adcs	@a[4],@a[4],@a[4]
+	adcs	@a[5],@a[5],@a[5]
+	adc	$bi,xzr,xzr
+
+	subs	@a[0],@a[0],@mod[0]
+	sbcs	@a[1],@a[1],@mod[1]
+	sbcs	@a[2],@a[2],@mod[2]
+	sbcs	@a[3],@a[3],@mod[3]
+	sbcs	@a[4],@a[4],@mod[4]
+	sbcs	@a[5],@a[5],@mod[5]
+	sbc	$bi,$bi,xzr
+
+	mvn	$bi,$bi
+	and	$bi,$bi,#2
+	orr	$r_ptr,$r_ptr,$bi
+
+	cmp	$n_ptr,#0
+	csel	$n_ptr,$r_ptr,$b_ptr,eq	// a->re==0? prty(a->im) : prty(a->re)
+
+	cmp	$a_ptr,#0
+	csel	$a_ptr,$r_ptr,$b_ptr,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	$n_ptr,$n_ptr,#1
+	and	$a_ptr,$a_ptr,#2
+	orr	$r_ptr,$a_ptr,$n_ptr		// pack sign and parity
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	autiasp
+	ret
+.size	sgn0_pty_mont_384x,.-sgn0_pty_mont_384x
+___
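As a reading aid for the csel selections above, here is a rough, single-limb C model of sgn0_pty_mont_384x. It assumes the values have already been taken out of the Montgomery domain (that is what the __mul_by_1_mont_384 calls do), that p is odd, and, as I read the doubling/subtraction sequence, that bit 1 means "greater than (p-1)/2"; the helper names are mine:

#include <stdint.h>
#include <stdio.h>

/* bit 0: parity of x; bit 1: set when x > (p-1)/2, i.e. 2*x >= p for odd p. */
static unsigned sgn0_pty(uint64_t x, uint64_t p)
{
    unsigned prty = (unsigned)(x & 1);
    unsigned sgn0 = (x > (p - 1) / 2) ? 2u : 0u;
    return sgn0 | prty;
}

/* The csel selections above, spelled out in C. */
static unsigned sgn0_pty_fp2(uint64_t re, uint64_t im, uint64_t p)
{
    unsigned pr = sgn0_pty(re, p), pi = sgn0_pty(im, p);
    unsigned prty = (re == 0) ? (pi & 1) : (pr & 1);  /* a->re==0? prty(a->im) : prty(a->re) */
    unsigned sgn  = (im != 0) ? (pi & 2) : (pr & 2);  /* a->im!=0? sgn0(a->im) : sgn0(a->re) */
    return sgn | prty;                                /* pack sign and parity                */
}

int main(void)
{
    uint64_t p = 2147483647;                /* toy odd modulus */
    printf("%u\n", sgn0_pty_fp2(0, 3, p));  /* re==0, so parity comes from im -> 1 */
    return 0;
}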
+
+if (0) {
+my @b = ($bi, @mod[0..4]);
+my @comba = @acc[4..6];
+
+$code.=<<___;
+.type	__mul_384_comba,%function
+.align	5
+__mul_384_comba:
+	ldp	@a[0],@a[1],[$a_ptr]
+	ldp	@b[0],@b[1],[$b_ptr]
+	ldp	@a[2],@a[3],[$a_ptr,#16]
+	ldp	@a[4],@a[5],[$a_ptr,#32]
+	ldp	@b[2],@b[3],[$b_ptr,#16]
+	ldp	@b[4],@b[5],[$b_ptr,#32]
+
+	mul	@comba[0],@a[0],@b[0]
+	umulh	@comba[1],@a[0],@b[0]
+	 mul	@acc[0],@a[1],@b[0]
+	 umulh	@acc[1],@a[1],@b[0]
+	str	@comba[0],[$r_ptr]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[2],@a[0],@b[1]
+	umulh	@acc[3],@a[0],@b[1]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],xzr,      @acc[1]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[0],@a[2],@b[0]
+	umulh	@acc[1],@a[2],@b[0]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#8]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[2],@a[1],@b[1]
+	umulh	@acc[3],@a[1],@b[1]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[0],@a[0],@b[2]
+	umulh	@acc[1],@a[0],@b[2]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[2],@a[3],@b[0]
+	 umulh	@acc[3],@a[3],@b[0]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#16]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[0],@a[2],@b[1]
+	umulh	@acc[1],@a[2],@b[1]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[2],@a[1],@b[2]
+	umulh	@acc[3],@a[1],@b[2]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[0],@a[0],@b[3]
+	umulh	@acc[1],@a[0],@b[3]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[2],@a[4],@b[0]
+	 umulh	@acc[3],@a[4],@b[0]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#24]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[0],@a[3],@b[1]
+	umulh	@acc[1],@a[3],@b[1]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[2],@a[2],@b[2]
+	umulh	@acc[3],@a[2],@b[2]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[0],@a[1],@b[3]
+	umulh	@acc[1],@a[1],@b[3]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[2],@a[0],@b[4]
+	umulh	@acc[3],@a[0],@b[4]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[0],@a[5],@b[0]
+	 umulh	@acc[1],@a[5],@b[0]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#32]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[2],@a[4],@b[1]
+	umulh	@acc[3],@a[4],@b[1]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[0],@a[3],@b[2]
+	umulh	@acc[1],@a[3],@b[2]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[2],@a[2],@b[3]
+	umulh	@acc[3],@a[2],@b[3]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[0],@a[1],@b[4]
+	umulh	@acc[1],@a[1],@b[4]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[2],@a[0],@b[5]
+	umulh	@acc[3],@a[0],@b[5]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[0],@a[5],@b[1]
+	 umulh	@acc[1],@a[5],@b[1]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#40]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[2],@a[4],@b[2]
+	umulh	@acc[3],@a[4],@b[2]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[0],@a[3],@b[3]
+	umulh	@acc[1],@a[3],@b[3]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[2],@a[2],@b[4]
+	umulh	@acc[3],@a[2],@b[4]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[0],@a[1],@b[5]
+	umulh	@acc[1],@a[1],@b[5]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[2],@a[5],@b[2]
+	 umulh	@acc[3],@a[5],@b[2]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#48]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[0],@a[4],@b[3]
+	umulh	@acc[1],@a[4],@b[3]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[2],@a[3],@b[4]
+	umulh	@acc[3],@a[3],@b[4]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	mul	@acc[0],@a[2],@b[5]
+	umulh	@acc[1],@a[2],@b[5]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[2],@a[5],@b[3]
+	 umulh	@acc[3],@a[5],@b[3]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#56]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[0],@a[4],@b[4]
+	umulh	@acc[1],@a[4],@b[4]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],xzr,xzr
+	mul	@acc[2],@a[3],@b[5]
+	umulh	@acc[3],@a[3],@b[5]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],@comba[2],xzr
+	 mul	@acc[0],@a[5],@b[4]
+	 umulh	@acc[1],@a[5],@b[4]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#64]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	mul	@acc[2],@a[4],@b[5]
+	umulh	@acc[3],@a[4],@b[5]
+	adds	@comba[0],@comba[0],@acc[0]
+	adcs	@comba[1],@comba[1],@acc[1]
+	adc	@comba[2],xzr,xzr
+	 mul	@acc[0],@a[5],@b[5]
+	 umulh	@acc[1],@a[5],@b[5]
+	adds	@comba[0],@comba[0],@acc[2]
+	adcs	@comba[1],@comba[1],@acc[3]
+	adc	@comba[2],@comba[2],xzr
+	str	@comba[0],[$r_ptr,#72]
+___
+	push(@comba,shift(@comba));
+$code.=<<___;
+	adds	@comba[0],@comba[0],@acc[0]
+	adc	@comba[1],@comba[1],@acc[1]
+	stp	@comba[0],@comba[1],[$r_ptr,#80]
+
+	ret
+.size	__mul_384_comba,.-__mul_384_comba
+___
+}
+print $code;
+
+close STDOUT;
diff --git a/blst/asm/mulq_mont_256-x86_64.pl b/blst/asm/mulq_mont_256-x86_64.pl
new file mode 100755
index 0000000..12e58bb
--- /dev/null
+++ b/blst/asm/mulq_mont_256-x86_64.pl
@@ -0,0 +1,513 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# As for "sparse" in subroutine names, see commentary in the
+# asm/mulx_mont_256-x86_64.pl module.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$b_ptr = "%rbx";
+
+{ ############################################################## 256 bits
+my @acc=map("%r$_",(9..15));
+
+{ ############################################################## mulq
+my ($hi, $a0) = ("%rbp", $r_ptr);
+
+$code.=<<___;
+.text
+
+.globl	mul_mont_sparse_256
+.hidden	mul_mont_sparse_256
+.type	mul_mont_sparse_256,\@function,5,"unwind"
+.align	32
+mul_mont_sparse_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$r_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($b_org), %rax
+	mov	8*0($a_ptr), @acc[4]
+	mov	8*1($a_ptr), @acc[5]
+	mov	8*2($a_ptr), @acc[3]
+	mov	8*3($a_ptr), $hi
+	mov	$b_org, $b_ptr		# evacuate from %rdx
+
+	mov	%rax, @acc[6]
+	mulq	@acc[4]			# a[0]*b[0]
+	mov	%rax, @acc[0]
+	mov	@acc[6], %rax
+	mov	%rdx, @acc[1]
+	call	__mulq_mont_sparse_256
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_mont_sparse_256,.-mul_mont_sparse_256
+
+.globl	sqr_mont_sparse_256
+.hidden	sqr_mont_sparse_256
+.type	sqr_mont_sparse_256,\@function,4,"unwind"
+.align	32
+sqr_mont_sparse_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$r_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	8*0($a_ptr), %rax
+	mov	$n_ptr, $n0
+	mov	8*1($a_ptr), @acc[5]
+	mov	$b_org, $n_ptr
+	mov	8*2($a_ptr), @acc[3]
+	lea	($a_ptr), $b_ptr
+	mov	8*3($a_ptr), $hi
+
+	mov	%rax, @acc[6]
+	mulq	%rax			# a[0]*a[0]
+	mov	%rax, @acc[0]
+	mov	@acc[6], %rax
+	mov	%rdx, @acc[1]
+	call	__mulq_mont_sparse_256
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_mont_sparse_256,.-sqr_mont_sparse_256
+___
+{
+my @acc=@acc;
+$code.=<<___;
+.type	__mulq_mont_sparse_256,\@abi-omnipotent
+.align	32
+__mulq_mont_sparse_256:
+	mulq	@acc[5]			# a[1]*b[0]
+	add	%rax, @acc[1]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[2]
+
+	mulq	@acc[3]			# a[2]*b[0]
+	add	%rax, @acc[2]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[3]
+
+	mulq	$hi			# a[3]*b[0]
+	add	%rax, @acc[3]
+	 mov	8($b_ptr), %rax
+	adc	\$0, %rdx
+	xor	@acc[5], @acc[5]
+	mov	%rdx, @acc[4]
+
+___
+for (my $i=1; $i<4; $i++) {
+my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1];
+$code.=<<___;
+	mov	@acc[0], $a0
+	imulq	$n0, @acc[0]
+
+	################################# Multiply by b[$i]
+	mov	%rax, @acc[6]
+	mulq	8*0($a_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*1($a_ptr)
+	add	%rax, @acc[2]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($a_ptr)
+	add	%rax, @acc[3]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($a_ptr)
+	add	%rax, @acc[4]
+	 mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[4]
+	adc	%rdx, @acc[5]		# can't overflow
+	xor	@acc[6], @acc[6]
+
+	################################# reduction
+	mulq	8*0($n_ptr)
+	add	%rax, $a0		# guaranteed to be zero
+	mov	@acc[0], %rax
+	adc	%rdx, $a0
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$a0, @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($n_ptr)
+	add	%rax, @acc[2]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($n_ptr)
+	add	%rax, @acc[3]
+	 mov	$b_next, %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	add	%rdx, @acc[4]
+	adc	\$0, @acc[5]
+	adc	\$0, @acc[6]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	imulq	$n0, %rax
+	mov	8(%rsp), $a_ptr		# restore $r_ptr
+
+	################################# last reduction
+	mov	%rax, @acc[6]
+	mulq	8*0($n_ptr)
+	add	%rax, @acc[0]		# guaranteed to be zero
+	mov	@acc[6], %rax
+	adc	%rdx, @acc[0]
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	add	@acc[0], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($n_ptr)
+	add	%rax, @acc[2]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($n_ptr)
+	 mov	@acc[2], $b_ptr
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	add	%rax, @acc[3]
+	 mov	@acc[1], %rax
+	adc	\$0, %rdx
+	add	%rdx, @acc[4]
+	adc	\$0, @acc[5]
+
+	#################################
+	# Branch-less conditional subtraction of modulus
+
+	 mov	@acc[3], @acc[0]
+	sub	8*0($n_ptr), @acc[1]
+	sbb	8*1($n_ptr), @acc[2]
+	sbb	8*2($n_ptr), @acc[3]
+	 mov	@acc[4], $hi
+	sbb	8*3($n_ptr), @acc[4]
+	sbb	\$0, @acc[5]
+
+	cmovc	%rax, @acc[1]
+	cmovc	$b_ptr, @acc[2]
+	cmovc	@acc[0], @acc[3]
+	mov	@acc[1], 8*0($a_ptr)
+	cmovc	$hi, @acc[4]
+	mov	@acc[2], 8*1($a_ptr)
+	mov	@acc[3], 8*2($a_ptr)
+	mov	@acc[4], 8*3($a_ptr)
+
+	ret
+.cfi_endproc
+.size	__mulq_mont_sparse_256,.-__mulq_mont_sparse_256
+___
+} }
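The "branch-less conditional subtraction of modulus" above is the usual constant-time final correction: subtract p speculatively and keep whichever value did not underflow, via cmovc instead of a branch. A one-limb C sketch with an illustrative name (the real code chains the borrow across four limbs plus the carry word):

#include <stdint.h>
#include <stdio.h>

/* Return x mod p given 0 <= x < 2*p, with a cmov-style select instead of a branch. */
static uint64_t cond_sub(uint64_t x, uint64_t p)
{
    uint64_t d = x - p;                      /* speculative subtraction         */
    uint64_t keep = 0 - (uint64_t)(x < p);   /* all-ones mask if it underflowed */
    return (x & keep) | (d & ~keep);         /* like the cmovc selects above    */
}

int main(void)
{
    printf("%llu %llu\n",
           (unsigned long long)cond_sub(5, 7),   /* 5 */
           (unsigned long long)cond_sub(9, 7));  /* 2 */
    return 0;
}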
+{ my ($n_ptr, $n0)=($b_ptr, $n_ptr);	# arguments are "shifted"
+
+$code.=<<___;
+.globl	from_mont_256
+.hidden	from_mont_256
+.type	from_mont_256,\@function,4,"unwind"
+.align	32
+from_mont_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulq_by_1_mont_256
+
+	#################################
+	# Branch-less conditional acc[0:3] - modulus
+
+	#mov	@acc[4], %rax		# __mulq_by_1_mont_256 does it
+	mov	@acc[5], @acc[1]
+	mov	@acc[6], @acc[2]
+	mov	@acc[0], @acc[3]
+
+	sub	8*0($n_ptr), @acc[4]
+	sbb	8*1($n_ptr), @acc[5]
+	sbb	8*2($n_ptr), @acc[6]
+	sbb	8*3($n_ptr), @acc[0]
+
+	cmovnc	@acc[4], %rax
+	cmovnc	@acc[5], @acc[1]
+	cmovnc	@acc[6], @acc[2]
+	mov	%rax,    8*0($r_ptr)
+	cmovnc	@acc[0], @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	from_mont_256,.-from_mont_256
+
+.globl	redc_mont_256
+.hidden	redc_mont_256
+.type	redc_mont_256,\@function,4,"unwind"
+.align	32
+redc_mont_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulq_by_1_mont_256
+
+	add	8*4($a_ptr), @acc[4]	# accumulate upper half
+	adc	8*5($a_ptr), @acc[5]
+	mov	@acc[4], %rax
+	adc	8*6($a_ptr), @acc[6]
+	mov	@acc[5], @acc[1]
+	adc	8*7($a_ptr), @acc[0]
+	sbb	$a_ptr, $a_ptr
+
+	#################################
+	# Branch-less conditional acc[0:4] - modulus
+
+	mov	@acc[6], @acc[2]
+	sub	8*0($n_ptr), @acc[4]
+	sbb	8*1($n_ptr), @acc[5]
+	sbb	8*2($n_ptr), @acc[6]
+	mov	@acc[0], @acc[3]
+	sbb	8*3($n_ptr), @acc[0]
+	sbb	\$0, $a_ptr
+
+	cmovnc	@acc[4], %rax 
+	cmovnc	@acc[5], @acc[1]
+	cmovnc	@acc[6], @acc[2]
+	mov	%rax,    8*0($r_ptr)
+	cmovnc	@acc[0], @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	redc_mont_256,.-redc_mont_256
+___
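redc_mont_256 above splits Montgomery reduction of a 512-bit input into two parts: __mulq_by_1_mont_256 folds the low 256 bits using n0 = -p^-1 mod 2^64, then the tail accumulates the upper half and performs one conditional subtraction. A single-limb C model of that split; neg_inv64 and redc64 are illustrative names, and the real routine runs four fold iterations rather than one:

#include <stdint.h>
#include <stdio.h>

static uint64_t neg_inv64(uint64_t p)        /* n0 = -p^-1 mod 2^64, p odd */
{
    uint64_t x = p;                          /* Hensel lifting: 3 -> 6 -> ... -> 96 bits */
    for (int i = 0; i < 5; i++)
        x *= 2 - p * x;
    return (uint64_t)0 - x;
}

/* (hi*2^64 + lo) * 2^-64 mod p, assuming hi*2^64 + lo < p*2^64 */
static uint64_t redc64(uint64_t lo, uint64_t hi, uint64_t p, uint64_t n0)
{
    uint64_t m = lo * n0;                    /* the __mulq_by_1 part:       */
    __uint128_t t = (__uint128_t)m * p + lo; /* low limb cancels to zero    */
    __uint128_t s = (t >> 64) + hi;          /* accumulate upper half       */
    uint64_t r = (uint64_t)s;
    if ((s >> 64) || r >= p)                 /* final conditional subtract  */
        r -= p;
    return r;
}

int main(void)
{
    uint64_t p = 0xffffffff00000001ULL;      /* any odd modulus works       */
    uint64_t n0 = neg_inv64(p);
    printf("%llu\n", (unsigned long long)redc64(0, 5, p, n0));  /* 5*2^64 * 2^-64 mod p = 5 */
    return 0;
}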
+{
+my @acc=@acc;
+
+$code.=<<___;
+.type	__mulq_by_1_mont_256,\@abi-omnipotent
+.align	32
+__mulq_by_1_mont_256:
+	mov	8*0($a_ptr), %rax
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+	mov	%rax, @acc[4]
+	imulq	$n0, %rax
+	mov	%rax, @acc[0]
+___
+for (my $i=0; $i<4; $i++) {
+my $hi = @acc[4];
+$code.=<<___;
+	################################# reduction $i
+	mulq	8*0($n_ptr)
+	add	%rax, @acc[4]		# guaranteed to be zero
+	mov	@acc[0], %rax
+	adc	%rdx, @acc[4]
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	@acc[4], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($n_ptr)
+___
+$code.=<<___	if ($i<3);
+	 mov	@acc[1], @acc[5]
+	 imulq	$n0, @acc[1]
+___
+$code.=<<___;
+	add	%rax, @acc[2]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($n_ptr)
+	add	%rax, @acc[3]
+	mov	@acc[1], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	ret
+.size	__mulq_by_1_mont_256,.-__mulq_by_1_mont_256
+___
+} } }
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/mulq_mont_384-x86_64.pl b/blst/asm/mulq_mont_384-x86_64.pl
new file mode 100755
index 0000000..3812319
--- /dev/null
+++ b/blst/asm/mulq_mont_384-x86_64.pl
@@ -0,0 +1,2675 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$b_ptr = "%rbx";
+
+# common accumulator layout
+@acc=map("%r$_",(8..15));
+
+########################################################################
+{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr);	# all registers are affected
+						# except for $n_ptr and $r_ptr
+$code.=<<___;
+.text
+
+########################################################################
+# Double-width subtraction modulo n<<384, as opposed to the naively
+# expected modulo n*n. It works because n<<384 is the actual
+# input boundary condition for Montgomery reduction, not n*n.
+# Just in case, this is duplicated, but only one module is
+# supposed to be linked...
+.type	__sub_mod_384x384,\@abi-omnipotent
+.align	32
+__sub_mod_384x384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	8*6($a_ptr), @acc[6]
+
+	sub	8*0($b_org), @acc[0]
+	mov	8*7($a_ptr), @acc[7]
+	sbb	8*1($b_org), @acc[1]
+	mov	8*8($a_ptr), @acc[8]
+	sbb	8*2($b_org), @acc[2]
+	mov	8*9($a_ptr), @acc[9]
+	sbb	8*3($b_org), @acc[3]
+	mov	8*10($a_ptr), @acc[10]
+	sbb	8*4($b_org), @acc[4]
+	mov	8*11($a_ptr), @acc[11]
+	sbb	8*5($b_org), @acc[5]
+	 mov	@acc[0], 8*0($r_ptr)
+	sbb	8*6($b_org), @acc[6]
+	 mov	8*0($n_ptr), @acc[0]
+	 mov	@acc[1], 8*1($r_ptr)
+	sbb	8*7($b_org), @acc[7]
+	 mov	8*1($n_ptr), @acc[1]
+	 mov	@acc[2], 8*2($r_ptr)
+	sbb	8*8($b_org), @acc[8]
+	 mov	8*2($n_ptr), @acc[2]
+	 mov	@acc[3], 8*3($r_ptr)
+	sbb	8*9($b_org), @acc[9]
+	 mov	8*3($n_ptr), @acc[3]
+	 mov	@acc[4], 8*4($r_ptr)
+	sbb	8*10($b_org), @acc[10]
+	 mov	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], 8*5($r_ptr)
+	sbb	8*11($b_org), @acc[11]
+	 mov	8*5($n_ptr), @acc[5]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[0]
+	and	$b_org, @acc[1]
+	and	$b_org, @acc[2]
+	and	$b_org, @acc[3]
+	and	$b_org, @acc[4]
+	and	$b_org, @acc[5]
+
+	add	@acc[0], @acc[6]
+	adc	@acc[1], @acc[7]
+	mov	@acc[6], 8*6($r_ptr)
+	adc	@acc[2], @acc[8]
+	mov	@acc[7], 8*7($r_ptr)
+	adc	@acc[3], @acc[9]
+	mov	@acc[8], 8*8($r_ptr)
+	adc	@acc[4], @acc[10]
+	mov	@acc[9], 8*9($r_ptr)
+	adc	@acc[5], @acc[11]
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	ret
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.type	__add_mod_384,\@abi-omnipotent
+.align	32
+__add_mod_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	add	8*0($b_org), @acc[0]
+	adc	8*1($b_org), @acc[1]
+	adc	8*2($b_org), @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	8*3($b_org), @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	8*4($b_org), @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	8*5($b_org), @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$b_org, $b_org
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	@acc[9],  @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[10], @acc[4]
+	mov	@acc[2], 8*2($r_ptr)
+	cmovc	@acc[11], @acc[5]
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.type	__sub_mod_384,\@abi-omnipotent
+.align	32
+__sub_mod_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+__sub_mod_384_a_is_loaded:
+	sub	8*0($b_org), @acc[0]
+	 mov	8*0($n_ptr), @acc[6]
+	sbb	8*1($b_org), @acc[1]
+	 mov	8*1($n_ptr), @acc[7]
+	sbb	8*2($b_org), @acc[2]
+	 mov	8*2($n_ptr), @acc[8]
+	sbb	8*3($b_org), @acc[3]
+	 mov	8*3($n_ptr), @acc[9]
+	sbb	8*4($b_org), @acc[4]
+	 mov	8*4($n_ptr), @acc[10]
+	sbb	8*5($b_org), @acc[5]
+	 mov	8*5($n_ptr), @acc[11]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[6]
+	and	$b_org, @acc[7]
+	and	$b_org, @acc[8]
+	and	$b_org, @acc[9]
+	and	$b_org, @acc[10]
+	and	$b_org, @acc[11]
+
+	add	@acc[6], @acc[0]
+	adc	@acc[7], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[8], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[9], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	adc	@acc[10], @acc[4]
+	mov	@acc[3], 8*3($r_ptr)
+	adc	@acc[11], @acc[5]
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+___
+}
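A compact C illustration of the comment at the top of this block: __sub_mod_384x384's operands are double-width values below p<<384 (the input bound for Montgomery reduction), so a negative difference is corrected by adding p to the upper half only, never anything involving p*p. One "limb" per half here, and the helper name is mine:

#include <stdint.h>
#include <stdio.h>

/* a, b are two-limb values < p*2^64; result is (a - b) mod (p<<64). */
static void sub_mod_double(uint64_t r[2], const uint64_t a[2],
                           const uint64_t b[2], uint64_t p)
{
    uint64_t lo = a[0] - b[0];
    uint64_t borrow = (a[0] < b[0]);
    uint64_t hi = a[1] - b[1] - borrow;
    uint64_t underflow = (a[1] < b[1]) || (a[1] - b[1] < borrow);

    r[0] = lo;
    r[1] = hi + (underflow ? p : 0);   /* add p<<64, i.e. p to the top half only */
}

int main(void)
{
    uint64_t p = 1000003;
    uint64_t a[2] = {42, 1}, b[2] = {7, 2}, r[2];  /* a < b, so the fixup kicks in */
    sub_mod_double(r, a, b, p);
    printf("hi=%llu lo=%llu\n", (unsigned long long)r[1], (unsigned long long)r[0]);
    return 0;
}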
+
+########################################################################
+# "Complex" multiplication and squaring. Use vanilla multiplication when
+# possible to fold reductions. I.e. instead of mul_mont, mul_mont
+# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod
+# followed by *common* reduction...
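As a concrete, if toy, illustration of that strategy for one Fp2 multiplication (a0 + a1*i)*(b0 + b1*i) with i^2 = -1: three plain multiplications, double-width additions and subtractions, and a single reduction per output half. Plain % stands in for the Montgomery reduction, the modulus is assumed to stay below 2^63 so the limb sums cannot overflow, and mul_fp2_toy is an illustrative name:

#include <stdint.h>
#include <stdio.h>

/* (a0 + a1*i)*(b0 + b1*i) over a toy field with p < 2^63 and i^2 = -1:
 * three multiplications, double-width add/sub, two reductions total. */
static void mul_fp2_toy(uint64_t r[2], const uint64_t a[2],
                        const uint64_t b[2], uint64_t p)
{
    __uint128_t t0 = (__uint128_t)a[0] * b[0];                    /* re*re                  */
    __uint128_t t1 = (__uint128_t)a[1] * b[1];                    /* im*im                  */
    __uint128_t t2 = (__uint128_t)(a[0] + a[1]) * (b[0] + b[1]);  /* (a.re+a.im)(b.re+b.im) */

    __uint128_t im = t2 - t0 - t1;                       /* a0*b1 + a1*b0, never negative */
    __uint128_t re = t0 + ((__uint128_t)p << 64) - t1;   /* a0*b0 - a1*b1, kept positive  */

    r[0] = (uint64_t)(re % p);                           /* one reduction per half        */
    r[1] = (uint64_t)(im % p);
}

int main(void)
{
    uint64_t p = 2147483647;       /* 2^31 - 1, comfortably below 2^63 */
    uint64_t a[2] = {123456789, 987654321}, b[2] = {192837465, 918273645}, r[2];
    mul_fp2_toy(r, a, b, p);
    printf("re=%llu im=%llu\n", (unsigned long long)r[0], (unsigned long long)r[1]);
    return 0;
}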
+{ my $frame = 5*8 +	# place for argument off-load +
+	      3*768/8;	# place for 3 768-bit temporary vectors
+$code.=<<___;
+.globl	mul_mont_384x
+.hidden	mul_mont_384x
+.type	mul_mont_384x,\@function,5,"unwind"
+.align	32
+mul_mont_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	mov	$b_org, $b_ptr
+	mov	$r_ptr, 8*4(%rsp)	# offload arguments
+	mov	$a_ptr, 8*3(%rsp)
+	mov	$b_org, 8*2(%rsp)
+	mov	$n_ptr, 8*1(%rsp)
+	mov	$n0,    8*0(%rsp)
+
+	################################# mul_384(t0, a->re, b->re);
+	#lea	0($b_ptr), $b_ptr	# b->re
+	#lea	0($a_ptr), $a_ptr	# a->re
+	lea	40(%rsp), $r_ptr	# t0
+	call	__mulq_384
+
+	################################# mul_384(t1, a->im, b->im);
+	lea	48($b_ptr), $b_ptr	# b->im
+	lea	48($a_ptr), $a_ptr	# a->im
+	lea	40+96(%rsp), $r_ptr	# t1
+	call	__mulq_384
+
+	################################# mul_384(t2, a->re+a->im, b->re+b->im);
+	mov	8*1(%rsp), $n_ptr
+	lea	-48($a_ptr), $b_org
+	lea	40+192+48(%rsp), $r_ptr
+	call	__add_mod_384
+
+	mov	8*2(%rsp), $a_ptr
+	lea	48($a_ptr), $b_org
+	lea	-48($r_ptr), $r_ptr
+	call	__add_mod_384
+
+	lea	($r_ptr),$b_ptr
+	lea	48($r_ptr),$a_ptr
+	call	__mulq_384
+
+	################################# t2=t2-t0-t1
+	lea	($r_ptr), $a_ptr	# t2
+	lea	40(%rsp), $b_org	# t0
+	mov	8*1(%rsp), $n_ptr
+	call	__sub_mod_384x384	# t2=t2-t0
+
+	lea	($r_ptr), $a_ptr	# t2
+	lea	-96($r_ptr), $b_org	# t1
+	call	__sub_mod_384x384	# t2=t2-t1
+
+	################################# t0=t0-t1
+	lea	40(%rsp), $a_ptr
+	lea	40+96(%rsp), $b_org
+	lea	40(%rsp), $r_ptr
+	call	__sub_mod_384x384	# t0-t1
+
+	mov	$n_ptr, $b_ptr		# n_ptr for redc_mont_384
+
+	################################# redc_mont_384(ret->re, t0, mod, n0);
+	lea	40(%rsp), $a_ptr	# t0
+	mov	8*0(%rsp), %rcx		# n0 for redc_mont_384
+	mov	8*4(%rsp), $r_ptr	# ret->re
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	################################# redc_mont_384(ret->im, t2, mod, n0);
+	lea	40+192(%rsp), $a_ptr	# t2
+	mov	8*0(%rsp), %rcx		# n0 for redc_mont_384
+	lea	48($r_ptr), $r_ptr	# ret->im
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_mont_384x,.-mul_mont_384x
+___
+}
+{ my $frame = 4*8 +	# place for argument off-load +
+	      2*384/8 +	# place for 2 384-bit temporary vectors
+	      8;	# align
+$code.=<<___;
+.globl	sqr_mont_384x
+.hidden	sqr_mont_384x
+.type	sqr_mont_384x,\@function,4,"unwind"
+.align	32
+sqr_mont_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	mov	$n_ptr, 8*0(%rsp)	# n0
+	mov	$b_org, $n_ptr		# n_ptr
+	mov	$r_ptr, 8*1(%rsp)	# to __mulq_mont_384
+	mov	$a_ptr, 8*2(%rsp)
+
+	################################# add_mod_384(t0, a->re, a->im);
+	lea	48($a_ptr), $b_org	# a->im
+	lea	32(%rsp), $r_ptr	# t0
+	call	__add_mod_384
+
+	################################# sub_mod_384(t1, a->re, a->im);
+	mov	8*2(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_org	# a->im
+	lea	32+48(%rsp), $r_ptr	# t1
+	call	__sub_mod_384
+
+	################################# mul_mont_384(ret->im, a->re, a->im, mod, n0);
+	mov	8*2(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_ptr	# a->im
+
+	mov	48($a_ptr), %rax	# a->im
+	mov	8*0($a_ptr), @acc[6]	# a->re
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[4]
+	mov	8*3($a_ptr), @acc[5]
+
+	call	__mulq_mont_384
+___
+{
+my @acc = map("%r$_",14,15,8..11,	# output from __mulq_mont_384
+                     12,13,"ax","bx","bp","si");
+$code.=<<___;
+	add	@acc[0], @acc[0]	# add with itself
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	@acc[3], @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	@acc[4], @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	@acc[5], @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$b_org, $b_org
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	mov	@acc[0],  8*6($r_ptr)	# ret->im
+	cmovc	@acc[9],  @acc[3]
+	mov	@acc[1],  8*7($r_ptr)
+	cmovc	@acc[10], @acc[4]
+	mov	@acc[2],  8*8($r_ptr)
+	cmovc	@acc[11], @acc[5]
+	mov	@acc[3],  8*9($r_ptr)
+	mov	@acc[4],  8*10($r_ptr)
+	mov	@acc[5],  8*11($r_ptr)
+___
+}
+$code.=<<___;
+	################################# mul_mont_384(ret->re, t0, t1, mod, n0);
+	lea	32(%rsp), $a_ptr	# t0
+	lea	32+48(%rsp), $b_ptr	# t1
+
+	mov	32+48(%rsp), %rax	# t1[0]
+	mov	32+8*0(%rsp), @acc[6]	# t0[0..3]
+	mov	32+8*1(%rsp), @acc[7]
+	mov	32+8*2(%rsp), @acc[4]
+	mov	32+8*3(%rsp), @acc[5]
+
+	call	__mulq_mont_384
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_mont_384x,.-sqr_mont_384x
+
+.globl	mul_382x
+.hidden	mul_382x
+.type	mul_382x,\@function,4,"unwind"
+.align	32
+mul_382x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	lea	96($r_ptr), $r_ptr	# ret->im
+	mov	$a_ptr, 8*0(%rsp)
+	mov	$b_org, 8*1(%rsp)
+	mov	$r_ptr, 8*2(%rsp)	# offload ret->im
+	mov	$n_ptr, 8*3(%rsp)
+
+	################################# t0 = a->re + a->im
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	add	8*6($a_ptr), @acc[0]
+	adc	8*7($a_ptr), @acc[1]
+	adc	8*8($a_ptr), @acc[2]
+	adc	8*9($a_ptr), @acc[3]
+	adc	8*10($a_ptr), @acc[4]
+	adc	8*11($a_ptr), @acc[5]
+
+	mov	@acc[0], 32+8*0(%rsp)
+	mov	@acc[1], 32+8*1(%rsp)
+	mov	@acc[2], 32+8*2(%rsp)
+	mov	@acc[3], 32+8*3(%rsp)
+	mov	@acc[4], 32+8*4(%rsp)
+	mov	@acc[5], 32+8*5(%rsp)
+
+	################################# t1 = b->re + b->im
+	mov	8*0($b_org), @acc[0]
+	mov	8*1($b_org), @acc[1]
+	mov	8*2($b_org), @acc[2]
+	mov	8*3($b_org), @acc[3]
+	mov	8*4($b_org), @acc[4]
+	mov	8*5($b_org), @acc[5]
+
+	add	8*6($b_org), @acc[0]
+	adc	8*7($b_org), @acc[1]
+	adc	8*8($b_org), @acc[2]
+	adc	8*9($b_org), @acc[3]
+	adc	8*10($b_org), @acc[4]
+	adc	8*11($b_org), @acc[5]
+
+	mov	@acc[0], 32+8*6(%rsp)
+	mov	@acc[1], 32+8*7(%rsp)
+	mov	@acc[2], 32+8*8(%rsp)
+	mov	@acc[3], 32+8*9(%rsp)
+	mov	@acc[4], 32+8*10(%rsp)
+	mov	@acc[5], 32+8*11(%rsp)
+
+	################################# mul_384(ret->im, t0, t1);
+	lea	32+8*0(%rsp), $a_ptr	# t0
+	lea	32+8*6(%rsp), $b_ptr	# t1
+	call	__mulq_384
+
+	################################# mul_384(ret->re, a->re, b->re);
+	mov	8*0(%rsp), $a_ptr
+	mov	8*1(%rsp), $b_ptr
+	lea	-96($r_ptr), $r_ptr	# ret->re
+	call	__mulq_384
+
+	################################# mul_384(tx, a->im, b->im);
+	lea	48($a_ptr), $a_ptr
+	lea	48($b_ptr), $b_ptr
+	lea	32(%rsp), $r_ptr
+	call	__mulq_384
+
+	################################# ret->im -= tx
+	mov	8*2(%rsp), $a_ptr	# restore ret->im
+	lea	32(%rsp), $b_org
+	mov	8*3(%rsp), $n_ptr
+	mov	$a_ptr, $r_ptr
+	call	__sub_mod_384x384
+
+	################################# ret->im -= ret->re
+	lea	0($r_ptr), $a_ptr
+	lea	-96($r_ptr), $b_org
+	call	__sub_mod_384x384
+
+	################################# ret->re -= tx
+	lea	-96($r_ptr), $a_ptr
+	lea	32(%rsp), $b_org
+	lea	-96($r_ptr), $r_ptr
+	call	__sub_mod_384x384
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_382x,.-mul_382x
+___
+}
+{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org);	# all registers are affected
+						# except for $n_ptr and $r_ptr
+$code.=<<___;
+.globl	sqr_382x
+.hidden	sqr_382x
+.type	sqr_382x,\@function,3,"unwind"
+.align	32
+sqr_382x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$a_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+
+	################################# t0 = a->re + a->im
+	mov	8*0($a_ptr), @acc[6]
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[9]
+	mov	8*4($a_ptr), @acc[10]
+	mov	8*5($a_ptr), @acc[11]
+
+	mov	@acc[6], @acc[0]
+	add	8*6($a_ptr), @acc[6]
+	mov	@acc[7], @acc[1]
+	adc	8*7($a_ptr), @acc[7]
+	mov	@acc[8], @acc[2]
+	adc	8*8($a_ptr), @acc[8]
+	mov	@acc[9], @acc[3]
+	adc	8*9($a_ptr), @acc[9]
+	mov	@acc[10], @acc[4]
+	adc	8*10($a_ptr), @acc[10]
+	mov	@acc[11], @acc[5]
+	adc	8*11($a_ptr), @acc[11]
+
+	mov	@acc[6], 8*0($r_ptr)
+	mov	@acc[7], 8*1($r_ptr)
+	mov	@acc[8], 8*2($r_ptr)
+	mov	@acc[9], 8*3($r_ptr)
+	mov	@acc[10], 8*4($r_ptr)
+	mov	@acc[11], 8*5($r_ptr)
+
+	################################# t1 = a->re - a->im
+	lea	48($a_ptr), $b_org
+	lea	48($r_ptr), $r_ptr
+	call	__sub_mod_384_a_is_loaded
+
+	################################# mul_384(ret->re, t0, t1);
+	lea	($r_ptr), $a_ptr
+	lea	-48($r_ptr), $b_ptr
+	lea	-48($r_ptr), $r_ptr
+	call	__mulq_384
+
+	################################# mul_384(ret->im, a->re, a->im);
+	mov	(%rsp), $a_ptr
+	lea	48($a_ptr), $b_ptr
+	lea	96($r_ptr), $r_ptr
+	call	__mulq_384
+
+	mov	8*0($r_ptr), @acc[0]	# double ret->im
+	mov	8*1($r_ptr), @acc[1]
+	mov	8*2($r_ptr), @acc[2]
+	mov	8*3($r_ptr), @acc[3]
+	mov	8*4($r_ptr), @acc[4]
+	mov	8*5($r_ptr), @acc[5]
+	mov	8*6($r_ptr), @acc[6]
+	mov	8*7($r_ptr), @acc[7]
+	mov	8*8($r_ptr), @acc[8]
+	mov	8*9($r_ptr), @acc[9]
+	mov	8*10($r_ptr), @acc[10]
+	add	@acc[0], @acc[0]
+	mov	8*11($r_ptr), @acc[11]
+	adc	@acc[1], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[2], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[3], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	adc	@acc[4], @acc[4]
+	mov	@acc[3], 8*3($r_ptr)
+	adc	@acc[5], @acc[5]
+	mov	@acc[4], 8*4($r_ptr)
+	adc	@acc[6], @acc[6]
+	mov	@acc[5], 8*5($r_ptr)
+	adc	@acc[7], @acc[7]
+	mov	@acc[6], 8*6($r_ptr)
+	adc	@acc[8], @acc[8]
+	mov	@acc[7], 8*7($r_ptr)
+	adc	@acc[9], @acc[9]
+	mov	@acc[8], 8*8($r_ptr)
+	adc	@acc[10], @acc[10]
+	mov	@acc[9], 8*9($r_ptr)
+	adc	@acc[11], @acc[11]
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	mov	8*1(%rsp),%r15
+.cfi_restore	%r15
+	mov	8*2(%rsp),%r14
+.cfi_restore	%r14
+	mov	8*3(%rsp),%r13
+.cfi_restore	%r13
+	mov	8*4(%rsp),%r12
+.cfi_restore	%r12
+	mov	8*5(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	8*6(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	8*7(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*7
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_382x,.-sqr_382x
+___
+}
+{ ########################################################## 384-bit mul
+my @acc=map("%r$_",("cx",8..12));
+my $bi = "%rbp";
+
+$code.=<<___;
+.globl	mul_384
+.hidden	mul_384
+.type	mul_384,\@function,3,"unwind"
+.align	32
+mul_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+.cfi_end_prologue
+
+	mov	$b_org, $b_ptr
+	call	__mulq_384
+
+	mov	0(%rsp),%r12
+.cfi_restore	%r12
+	mov	8(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	16(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_384,.-mul_384
+
+.type	__mulq_384,\@abi-omnipotent
+.align	32
+__mulq_384:
+	mov	8*0($b_ptr), %rax
+
+	mov	%rax, $bi
+	mulq	8*0($a_ptr)
+	mov	%rax, 8*0($r_ptr)
+	mov	$bi, %rax
+	mov	%rdx, @acc[0]
+
+	mulq	8*1($a_ptr)
+	add	%rax, @acc[0]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[1]
+
+	mulq	8*2($a_ptr)
+	add	%rax, @acc[1]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[2]
+
+	mulq	8*3($a_ptr)
+	add	%rax, @acc[2]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[3]
+
+	mulq	8*4($a_ptr)
+	add	%rax, @acc[3]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+
+	mulq	8*5($a_ptr)
+	add	%rax, @acc[4]
+	mov	8*1($b_ptr), %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[5]
+___
+for(my $i=1; $i<6; $i++) {
+my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax";
+$code.=<<___;
+	mov	%rax, $bi
+	mulq	8*0($a_ptr)
+	add	%rax, @acc[0]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	@acc[0], 8*$i($r_ptr)
+	mov	%rdx, @acc[0]
+
+	mulq	8*1($a_ptr)
+	add	%rax, @acc[1]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[1], @acc[0]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[1]
+
+	mulq	8*2($a_ptr)
+	add	%rax, @acc[2]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[2], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[2]
+
+	mulq	8*3($a_ptr)
+	add	%rax, @acc[3]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[3], @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[3]
+
+	mulq	8*4($a_ptr)
+	add	%rax, @acc[4]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[4], @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+
+	mulq	8*5($a_ptr)
+	add	%rax, @acc[5]
+	mov	$b_next, %rax
+	adc	\$0, %rdx
+	add	@acc[5], @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[5]
+___
+}
+$code.=<<___;
+	mov	@acc[0], 8*6($r_ptr)
+	mov	@acc[1], 8*7($r_ptr)
+	mov	@acc[2], 8*8($r_ptr)
+	mov	@acc[3], 8*9($r_ptr)
+	mov	@acc[4], 8*10($r_ptr)
+	mov	@acc[5], 8*11($r_ptr)
+
+	ret
+.size	__mulq_384,.-__mulq_384
+___
+}
+if (0) { ##############################################################
+my @b=map("%r$_",(10..15));
+my @a=reverse(@b);
+   @b[5]=$b_ptr;
+my $bi = "%rbp";
+my @comba=map("%r$_",("cx",8,9));
+#                                                   a[0]*b[0]
+#                                              a[1]*b[0]
+#                                              a[0]*b[1]
+#                                         a[2]*b[0]
+#                                         a[1]*b[1]
+#                                         a[0]*b[2]
+#                                    a[3]*b[0]
+#                                    a[2]*b[1]
+#                                    a[1]*b[2]
+#                                    a[0]*b[3]
+#                               a[4]*b[0]
+#                               a[3]*b[1]
+#                               a[2]*b[2]
+#                               a[1]*b[3]
+#                               a[0]*b[4]
+#                          a[5]*b[0]
+#                          a[4]*b[1]
+#                          a[3]*b[2]
+#                          a[2]*b[3]
+#                          a[1]*b[4]
+#                          a[0]*b[5]
+#                     a[5]*b[1]
+#                     a[4]*b[2]
+#                     a[3]*b[3]
+#                     a[2]*b[4]
+#                     a[1]*b[5]
+#                a[5]*b[2]
+#                a[4]*b[3]
+#                a[3]*b[4]
+#                a[2]*b[5]
+#           a[5]*b[3]
+#           a[4]*b[4]
+#           a[3]*b[5]
+#      a[5]*b[4]
+#      a[4]*b[5]
+# a[5]*b[5]
+#
+# 13% fewer instructions give +15% on Core2, +10% on Goldmont,
+# -0% on Sandy Bridge, but -16% on Haswell :-(
+# [for reference +5% on Skylake, +11% on Ryzen]
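+#
+# The triangle above is the Comba (product-scanning) schedule: partial
+# products are accumulated column by column into a rotating three-limb
+# window @comba (low limb, high limb, carry), one result limb being
+# stored per column.  A rough scalar sketch of the same schedule:
+#
+#	(c0, c1, c2) = (0, 0, 0)
+#	for k in 0 .. 10:
+#		for every (i, j) with i + j == k and 0 <= i, j <= 5:
+#			(c2, c1, c0) += a[i]*b[j]	# full 128-bit product
+#		r[k] = c0; (c0, c1, c2) = (c1, c2, 0)
+#	r[11] = c0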
+
+$code.=<<___;
+.type	__mulq_comba_384,\@abi-omnipotent
+.align	32
+__mulq_comba_384:
+	mov	8*0($b_ptr), %rax
+	mov	8*0($a_ptr), @a[0]
+	mov	8*1($a_ptr), @a[1]
+	mov	8*1($b_ptr), @b[1]
+
+	mov	%rax, @b[0]
+	mulq	@a[0]			# a[0]*b[0]
+	mov	%rax, 8*0($r_ptr)
+	mov	@b[0], %rax
+	mov	%rdx, @comba[0]
+
+	#################################
+	mov	8*2($a_ptr), @a[2]
+	xor	@comba[2], @comba[2]
+	mulq	@a[1]			# a[1]*b[0]
+	add	%rax, @comba[0]
+	mov	@b[1], %rax
+	adc	\$0, %rdx
+	mov	8*2($b_ptr), @b[2]
+	mov	%rdx, @comba[1]
+
+	mulq	@a[0]			# a[0]*b[1]
+	add	%rax, @comba[0]
+	mov	@b[0], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*1($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	@a[2]			# a[2]*b[0]
+	add	%rax, @comba[0]
+	mov	@b[1], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[1]			# a[1]*b[1]
+	add	%rax, @comba[0]
+	mov	@b[2], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[0]			# a[0]*b[2]
+	add	%rax, @comba[0]
+	mov	@b[0], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*2($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	8*3($a_ptr)		# a[3]*b[0]
+	add	%rax, @comba[0]
+	mov	@b[1], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[2]			# a[2]*b[1]
+	add	%rax, @comba[0]
+	mov	@b[2], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[1]			# a[1]*b[2]
+	add	%rax, @comba[0]
+	mov	8*3($b_ptr), %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mov	%rax, @b[3]
+	mulq	@a[0]			# a[0]*b[3]
+	add	%rax, @comba[0]
+	mov	@b[0], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*3($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	8*4($a_ptr)		# a[4]*b[0]
+	add	%rax, @comba[0]
+	mov	@b[1], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*3($a_ptr)		# a[3]*b[1]
+	add	%rax, @comba[0]
+	mov	@b[2], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*2($a_ptr)		# a[2]*b[2]
+	add	%rax, @comba[0]
+	mov	@b[3], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[1]			# a[1]*b[3]
+	add	%rax, @comba[0]
+	mov	8*4($b_ptr), %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mov	%rax, @b[4]
+	mulq	@a[0]			# a[0]*b[4]
+	add	%rax, @comba[0]
+	mov	@b[0], %rax
+	adc	%rdx, @comba[1]
+	mov	8*5($a_ptr), @a[5]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*4($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	@a[5]			# a[5]*b[0]
+	add	%rax, @comba[0]
+	mov	@b[1], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*4($a_ptr)		# a[4]*b[1]
+	add	%rax, @comba[0]
+	mov	@b[2], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*3($a_ptr)		# a[3]*b[2]
+	add	%rax, @comba[0]
+	mov	@b[3], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*2($a_ptr)		# a[2]*b[3]
+	add	%rax, @comba[0]
+	mov	@b[4], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*1($a_ptr)		# a[1]*b[4]
+	add	%rax, @comba[0]
+	mov	8*5($b_ptr), %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mov	%rax, @b[5]
+	mulq	@a[0]			# a[0]*b[5]
+	add	%rax, @comba[0]
+	mov	@b[1], %rax
+	adc	%rdx, @comba[1]
+	mov	8*4($a_ptr), @a[4]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*5($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	@a[5]			# a[5]*b[1]
+	add	%rax, @comba[0]
+	mov	@b[2], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[4]			# a[4]*b[2]
+	add	%rax, @comba[0]
+	mov	@b[3], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*3($a_ptr)		# a[3]*b[3]
+	add	%rax, @comba[0]
+	mov	@b[4], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*2($a_ptr)		# a[2]*b[4]
+	add	%rax, @comba[0]
+	mov	@b[5], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*1($a_ptr)		# a[1]*b[5]
+	add	%rax, @comba[0]
+	mov	@b[2], %rax
+	adc	%rdx, @comba[1]
+	mov	8*3($a_ptr), @a[3]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*6($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	@a[5]			# a[5]*b[2]
+	add	%rax, @comba[0]
+	mov	@b[3], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[4]			# a[4]*b[3]
+	add	%rax, @comba[0]
+	mov	@b[4], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[3]			# a[3]*b[4]
+	add	%rax, @comba[0]
+	mov	@b[5], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	8*2($a_ptr)		# a[2]*b[5]
+	add	%rax, @comba[0]
+	mov	@b[3], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*7($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	@a[5]			# a[5]*b[3]
+	add	%rax, @comba[0]
+	mov	@b[4], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[4]			# a[4]*b[4]
+	add	%rax, @comba[0]
+	mov	@b[5], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[3]			# a[3]*b[5]
+	add	%rax, @comba[0]
+	mov	@b[4], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*8($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	xor	@comba[2], @comba[2]
+	mulq	@a[5]			# a[5]*b[4]
+	add	%rax, @comba[0]
+	mov	@b[5], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+
+	mulq	@a[4]			# a[4]*b[5]
+	add	%rax, @comba[0]
+	mov	@b[5], %rax
+	adc	%rdx, @comba[1]
+	adc	\$0, @comba[2]
+	mov	@comba[0], 8*9($r_ptr)
+___
+    push(@comba,shift(@comba));
+$code.=<<___;
+	mulq	@a[5]			# a[5]*b[5]
+	add	%rax, @comba[0]
+	adc	%rdx, @comba[1]
+
+	mov	@comba[0], 8*10($r_ptr)
+	mov	@comba[1], 8*11($r_ptr)
+
+	ret
+.size	__mulq_comba_384,.-__mulq_comba_384
+___
+}
+{ ########################################################## 384-bit sqr
+my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr);
+my $hi;
+
+$code.=<<___;
+.globl	sqr_384
+.hidden	sqr_384
+.type	sqr_384,\@function,2,"unwind"
+.align	32
+sqr_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	call	__sqrq_384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_384,.-sqr_384
+
+.type	__sqrq_384,\@abi-omnipotent
+.align	32
+__sqrq_384:
+	mov	8*0($a_ptr), %rax
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[9]
+
+	#########################################
+	mov	%rax, @acc[6]
+	mulq	@acc[7]				# a[1]*a[0]
+	mov	%rax, @acc[1]
+	mov	@acc[6], %rax
+	 mov	8*4($a_ptr), @acc[10]
+	mov	%rdx, @acc[2]
+
+	mulq	@acc[8]				# a[2]*a[0]
+	add	%rax, @acc[2]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	 mov	8*5($a_ptr), @acc[11]
+	mov	%rdx, @acc[3]
+
+	mulq	@acc[9]				# a[3]*a[0]
+	add	%rax, @acc[3]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+
+	mulq	@acc[10]			# a[4]*a[0]
+	add	%rax, @acc[4]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[5]
+
+	mulq	@acc[11]			# a[5]*a[0]
+	add	%rax, @acc[5]
+	mov	@acc[6], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+
+	mulq	%rax				# a[0]*a[0]
+	xor	@acc[0], @acc[0]
+	mov	%rax, 8*0($r_ptr)
+	 mov	@acc[7], %rax
+	add	@acc[1], @acc[1]		# double acc[1]
+	adc	\$0, @acc[0]
+	add	%rdx, @acc[1]			# accumulate a[0]*a[0]
+	adc	\$0, @acc[0]			# carries to a[1]*a[1]
+	mov	@acc[1], 8*1($r_ptr)
+___
+$hi=@acc[1];
+$code.=<<___;
+	#########################################
+	mulq	@acc[8]				# a[2]*a[1]
+	add	%rax, @acc[3]
+	mov	@acc[7], %rax
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	@acc[9]				# a[3]*a[1]
+	add	%rax, @acc[4]
+	mov	@acc[7], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	@acc[10]			# a[4]*a[1]
+	add	%rax, @acc[5]
+	mov	@acc[7], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[5]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	@acc[11]			# a[5]*a[1]
+	add	%rax, @acc[6]
+	mov	@acc[7], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[6]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	%rax				# a[1]*a[1]
+	xor	@acc[1], @acc[1]
+	add	%rax, @acc[0]			# can't carry
+	 mov	@acc[8], %rax
+	add	@acc[2], @acc[2]		# double acc[2:3]
+	adc	@acc[3], @acc[3]
+	adc	\$0, @acc[1]
+	add	@acc[0], @acc[2]		# accumulate a[1]*a[1]
+	adc	%rdx, @acc[3]
+	adc	\$0, @acc[1]			# carries to a[2]*a[2]
+	mov	@acc[2], 8*2($r_ptr)
+___
+$hi=@acc[0];
+$code.=<<___;
+	#########################################
+	mulq	@acc[9]				# a[3]*a[2]
+	add	%rax, @acc[5]
+	mov	@acc[8], %rax
+	adc	\$0, %rdx
+	 mov	@acc[3], 8*3($r_ptr)
+	mov	%rdx, $hi
+
+	mulq	@acc[10]			# a[4]*a[2]
+	add	%rax, @acc[6]
+	mov	@acc[8], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[6]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	@acc[11]			# a[5]*a[2]
+	add	%rax, @acc[7]
+	mov	@acc[8], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[7]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[8]
+
+	mulq	%rax				# a[2]*a[2]
+	xor	@acc[3], @acc[3]
+	add	%rax, @acc[1]			# can't carry
+	 mov	@acc[9], %rax
+	add	@acc[4], @acc[4]		# double acc[4:5]
+	adc	@acc[5], @acc[5]
+	adc	\$0, @acc[3]
+	add	@acc[1], @acc[4]		# accumulate a[2]*a[2]
+	adc	%rdx, @acc[5]
+	adc	\$0, @acc[3]			# carries to a[3]*a[3]
+	mov	@acc[4], 8*4($r_ptr)
+
+	#########################################
+	mulq	@acc[10]			# a[4]*a[3]
+	add	%rax, @acc[7]
+	mov	@acc[9], %rax
+	adc	\$0, %rdx
+	 mov	@acc[5], 8*5($r_ptr)
+	mov	%rdx, $hi
+
+	mulq	@acc[11]			# a[5]*a[3]
+	add	%rax, @acc[8]
+	mov	@acc[9], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[8]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[9]
+
+	mulq	%rax				# a[3]*a[3]
+	xor	@acc[4], @acc[4]
+	add	%rax, @acc[3]			# can't carry
+	 mov	@acc[10], %rax
+	add	@acc[6], @acc[6]		# double acc[6:7]
+	adc	@acc[7], @acc[7]
+	adc	\$0, @acc[4]
+	add	@acc[3], @acc[6]		# accumulate a[3]*a[3]
+	adc	%rdx, @acc[7]
+	mov	@acc[6], 8*6($r_ptr)
+	adc	\$0, @acc[4]			# carries to a[4]*a[4]
+	mov	@acc[7], 8*7($r_ptr)
+
+	#########################################
+	mulq	@acc[11]			# a[5]*a[4]
+	add	%rax, @acc[9]
+	mov	@acc[10], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[10]
+
+	mulq	%rax				# a[4]*a[4]
+	xor	@acc[5], @acc[5]
+	add	%rax, @acc[4]			# can't carry
+	 mov	@acc[11], %rax
+	add	@acc[8], @acc[8]		# double acc[8:9]
+	adc	@acc[9], @acc[9]
+	adc	\$0, @acc[5]
+	add	@acc[4], @acc[8]		# accumulate a[4]*a[4]
+	adc	%rdx, @acc[9]
+	mov	@acc[8], 8*8($r_ptr)
+	adc	\$0, @acc[5]			# carries to a[5]*a[5]
+	mov	@acc[9], 8*9($r_ptr)
+
+	#########################################
+	mulq	%rax				# a[5]*a[5]
+	add	@acc[5], %rax			# can't carry
+	add	@acc[10], @acc[10]		# double acc[10]
+	adc	\$0, %rdx
+	add	@acc[10], %rax			# accumulate a[5]*a[5]
+	adc	\$0, %rdx
+	mov	%rax, 8*10($r_ptr)
+	mov	%rdx, 8*11($r_ptr)
+
+	ret
+.size	__sqrq_384,.-__sqrq_384
+
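+########################################################################
+# Montgomery squaring, i.e. ret = a*a/2^384 mod m: the full 768-bit
+# square is computed schoolbook-style into a stack temporary by
+# __sqrq_384 and then reduced with the same __mulq_by_1_mont_384 and
+# __redc_tail_mont_384 pair that implements redc_mont_384 further
+# below.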
+.globl	sqr_mont_384
+.hidden	sqr_mont_384
+.type	sqr_mont_384,\@function,4,"unwind"
+.align	32
+sqr_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8*15, %rsp
+.cfi_adjust_cfa_offset	8*15
+.cfi_end_prologue
+
+	mov	$n_ptr, 8*12(%rsp)	# n0
+	mov	$b_org, 8*13(%rsp)	# n_ptr
+	mov	$r_ptr, 8*14(%rsp)
+
+	mov	%rsp, $r_ptr
+	call	__sqrq_384
+
+	lea	0(%rsp), $a_ptr
+	mov	8*12(%rsp), %rcx	# n0 for mul_by_1
+	mov	8*13(%rsp), $b_ptr	# n_ptr for mul_by_1
+	mov	8*14(%rsp), $r_ptr
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	lea	8*15(%rsp), %r8		# size optimization
+	mov	8*15(%rsp), %r15
+.cfi_restore	%r15
+	mov	8*1(%r8), %r14
+.cfi_restore	%r14
+	mov	8*2(%r8), %r13
+.cfi_restore	%r13
+	mov	8*3(%r8), %r12
+.cfi_restore	%r12
+	mov	8*4(%r8), %rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8), %rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8), %rsp
+.cfi_adjust_cfa_offset	-8*21
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_mont_384,.-sqr_mont_384
+___
+}
+{ ########################################################## 384-bit redc_mont
+my ($n_ptr, $n0)=($b_ptr, $n_ptr);	# arguments are "shifted"
+
+$code.=<<___;
+########################################################################
+# void redc_mont_384(uint64_t ret[6], const uint64_t a[12],
+#                    uint64_t m[6], uint64_t n0);
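+#
+# Montgomery reduction of a double-width (768-bit) value: computes
+# ret = a/2^384 mod m, assuming n0 satisfies n0*m[0] == -1 mod 2^64.
+# Each of the six word-level steps in __mulq_by_1_mont_384 adds the
+# multiple of m that clears the lowest limb and shifts right by one
+# limb; __redc_tail_mont_384 then folds in the upper half of a and
+# conditionally subtracts m.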
+.globl	redc_mont_384
+.hidden	redc_mont_384
+.type	redc_mont_384,\@function,4,"unwind"
+.align	32
+redc_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	redc_mont_384,.-redc_mont_384
+
+########################################################################
+# void from_mont_384(uint64_t ret[6], const uint64_t a[6],
+#                    uint64_t m[6], uint64_t n0);
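+#
+# Same word-level reduction as redc_mont_384 above, but for a
+# single-width input, i.e. it converts a out of Montgomery form:
+# ret = a/2^384 mod m. There is no upper half to fold in, only the
+# final branch-less conditional subtraction of the modulus.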
+.globl	from_mont_384
+.hidden	from_mont_384
+.type	from_mont_384,\@function,4,"unwind"
+.align	32
+from_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulq_by_1_mont_384
+
+	#################################
+	# Branch-less conditional acc[0:6] - modulus
+
+	#mov	@acc[6], %rax		# __mulq_by_1_mont_384 does it
+	mov	@acc[7], %rcx
+	mov	@acc[0], %rdx
+	mov	@acc[1], %rbp
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[7]
+	mov	@acc[2], @acc[5]
+	sbb	8*2($n_ptr), @acc[0]
+	sbb	8*3($n_ptr), @acc[1]
+	sbb	8*4($n_ptr), @acc[2]
+	mov	@acc[3], $a_ptr
+	sbb	8*5($n_ptr), @acc[3]
+
+	cmovc	%rax, @acc[6]
+	cmovc	%rcx, @acc[7]
+	cmovc	%rdx, @acc[0]
+	mov	@acc[6], 8*0($r_ptr)
+	cmovc	%rbp, @acc[1]
+	mov	@acc[7], 8*1($r_ptr)
+	cmovc	@acc[5], @acc[2]
+	mov	@acc[0], 8*2($r_ptr)
+	cmovc	$a_ptr,  @acc[3]
+	mov	@acc[1], 8*3($r_ptr)
+	mov	@acc[2], 8*4($r_ptr)
+	mov	@acc[3], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	from_mont_384,.-from_mont_384
+___
+{ my @acc=@acc;				# will be rotated locally
+
+$code.=<<___;
+.type	__mulq_by_1_mont_384,\@abi-omnipotent
+.align	32
+__mulq_by_1_mont_384:
+	mov	8*0($a_ptr), %rax
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	mov	%rax, @acc[6]
+	imulq	$n0, %rax
+	mov	%rax, @acc[0]
+___
+for (my $i=0; $i<6; $i++) {
+my $hi = @acc[6];
+$code.=<<___;
+	################################# reduction $i
+	mulq	8*0($n_ptr)
+	add	%rax, @acc[6]		# guaranteed to be zero
+	mov	@acc[0], %rax
+	adc	%rdx, @acc[6]
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	@acc[6], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($n_ptr)
+	add	%rax, @acc[2]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($n_ptr)
+	add	%rax, @acc[3]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+___
+$code.=<<___	if ($i<5);
+	 mov	@acc[1], @acc[7]
+	 imulq	$n0, @acc[1]
+___
+$code.=<<___;
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*4($n_ptr)
+	add	%rax, @acc[4]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*5($n_ptr)
+	add	%rax, @acc[5]
+	mov	@acc[1], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[5]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	ret
+.size	__mulq_by_1_mont_384,.-__mulq_by_1_mont_384
+
+.type	__redc_tail_mont_384,\@abi-omnipotent
+.align	32
+__redc_tail_mont_384:
+	add	8*6($a_ptr), @acc[0]	# accumulate upper half
+	mov	@acc[0], %rax
+	adc	8*7($a_ptr), @acc[1]
+	adc	8*8($a_ptr), @acc[2]
+	adc	8*9($a_ptr), @acc[3]
+	mov	@acc[1], %rcx
+	adc	8*10($a_ptr), @acc[4]
+	adc	8*11($a_ptr), @acc[5]
+	sbb	@acc[6], @acc[6]
+
+	#################################
+	# Branch-less conditional acc[0:6] - modulus
+
+	mov	@acc[2], %rdx
+	mov	@acc[3], %rbp
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	mov	@acc[4], @acc[7]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	mov	@acc[5], $a_ptr
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, @acc[6]
+
+	cmovc	%rax, @acc[0]
+	cmovc	%rcx, @acc[1]
+	cmovc	%rdx, @acc[2]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	%rbp, @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[7], @acc[4]
+	mov	@acc[2], 8*2($r_ptr)
+	cmovc	$a_ptr,  @acc[5]
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__redc_tail_mont_384,.-__redc_tail_mont_384
+
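+########################################################################
+# Converts the first argument out of Montgomery form (second argument
+# is the modulus, third is n0) and returns its parity in bit 0 of the
+# result and its "sign" in bit 1, the sign bit being set when doubling
+# the canonical value overflows the modulus (the "2*x > p" test below).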
+.globl	sgn0_pty_mont_384
+.hidden	sgn0_pty_mont_384
+.type	sgn0_pty_mont_384,\@function,3,"unwind"
+.align	32
+sgn0_pty_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$a_ptr, $n_ptr
+	lea	0($r_ptr), $a_ptr
+	mov	$b_org, $n0
+	call	__mulq_by_1_mont_384
+
+	xor	%rax, %rax
+	mov	@acc[0], @acc[7]
+	add	@acc[0], @acc[0]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, %rax
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, %rax
+
+	not	%rax			# 2*x > p, which means "negative"
+	and	\$1, @acc[7]
+	and	\$2, %rax
+	or	@acc[7], %rax		# pack sign and parity
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sgn0_pty_mont_384,.-sgn0_pty_mont_384
+
+.globl	sgn0_pty_mont_384x
+.hidden	sgn0_pty_mont_384x
+.type	sgn0_pty_mont_384x,\@function,3,"unwind"
+.align	32
+sgn0_pty_mont_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$a_ptr, $n_ptr
+	lea	48($r_ptr), $a_ptr	# sgn0(a->im)
+	mov	$b_org, $n0
+	call	__mulq_by_1_mont_384
+
+	mov	@acc[0], @acc[6]
+	or	@acc[1], @acc[0]
+	or	@acc[2], @acc[0]
+	or	@acc[3], @acc[0]
+	or	@acc[4], @acc[0]
+	or	@acc[5], @acc[0]
+
+	lea	0($r_ptr), $a_ptr	# sgn0(a->re)
+	xor	$r_ptr, $r_ptr
+	mov	@acc[6], @acc[7]
+	add	@acc[6], @acc[6]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, $r_ptr
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $r_ptr
+
+	mov	@acc[0], 0(%rsp)	# a->im is zero or not
+	not	$r_ptr			# 2*x > p, which means "negative"
+	and	\$1, @acc[7]
+	and	\$2, $r_ptr
+	or	@acc[7], $r_ptr		# pack sign and parity
+
+	call	__mulq_by_1_mont_384
+
+	mov	@acc[0], @acc[6]
+	or	@acc[1], @acc[0]
+	or	@acc[2], @acc[0]
+	or	@acc[3], @acc[0]
+	or	@acc[4], @acc[0]
+	or	@acc[5], @acc[0]
+
+	xor	%rax, %rax
+	mov	@acc[6], @acc[7]
+	add	@acc[6], @acc[6]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, %rax
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, %rax
+
+	mov	0(%rsp), @acc[6]
+
+	not	%rax			# 2*x > p, which means "negative"
+
+	test	@acc[0], @acc[0]
+	cmovz	$r_ptr, @acc[7]		# a->re==0? prty(a->im) : prty(a->re)
+
+	test	@acc[6], @acc[6]
+	cmovnz	$r_ptr, %rax		# a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	\$1, @acc[7]
+	and	\$2, %rax
+	or	@acc[7], %rax		# pack sign and parity
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sgn0_pty_mont_384x,.-sgn0_pty_mont_384x
+___
+} }
+
+{ ########################################################## mulq_mont
+my ($bi, $hi) = ("%rdi", "%rbp");
+
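+# mul_mont_384 is a word-by-word interleaved Montgomery multiplication
+# (CIOS-style): each round of __mulq_mont_384 accumulates a*b[i] and
+# immediately performs one reduction step with q = acc[0]*n0 mod 2^64,
+# keeping the intermediate value narrow; a final branch-less
+# conditional subtraction brings the result below the modulus.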
+$code.=<<___;
+.globl	mul_mont_384
+.hidden	mul_mont_384
+.type	mul_mont_384,\@function,5,"unwind"
+.align	32
+mul_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8*3, %rsp
+.cfi_adjust_cfa_offset	8*3
+.cfi_end_prologue
+
+	mov	8*0($b_org), %rax
+	mov	8*0($a_ptr), @acc[6]
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[4]
+	mov	8*3($a_ptr), @acc[5]
+	mov	$b_org, $b_ptr		# evacuate from %rdx
+	mov	$n0,    8*0(%rsp)
+	mov	$r_ptr, 8*1(%rsp)	# to __mulq_mont_384
+
+	call	__mulq_mont_384
+
+	mov	24(%rsp),%r15
+.cfi_restore	%r15
+	mov	32(%rsp),%r14
+.cfi_restore	%r14
+	mov	40(%rsp),%r13
+.cfi_restore	%r13
+	mov	48(%rsp),%r12
+.cfi_restore	%r12
+	mov	56(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	64(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	72(%rsp),%rsp
+.cfi_adjust_cfa_offset	-72
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mul_mont_384,.-mul_mont_384
+___
+{ my @acc=@acc;				# will be rotated locally
+
+$code.=<<___;
+.type	__mulq_mont_384,\@abi-omnipotent
+.align	32
+__mulq_mont_384:
+	mov	%rax, $bi
+	mulq	@acc[6]			# a[0]*b[0]
+	mov	%rax, @acc[0]
+	mov	$bi, %rax
+	mov	%rdx, @acc[1]
+
+	mulq	@acc[7]			# a[1]*b[0]
+	add	%rax, @acc[1]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[2]
+
+	mulq	@acc[4]			# a[2]*b[0]
+	add	%rax, @acc[2]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[3]
+
+	 mov	@acc[0], $hi
+	 imulq	8(%rsp), @acc[0]
+
+	mulq	@acc[5]			# a[3]*b[0]
+	add	%rax, @acc[3]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+
+	mulq	8*4($a_ptr)
+	add	%rax, @acc[4]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[5]
+
+	mulq	8*5($a_ptr)
+	add	%rax, @acc[5]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	xor	@acc[7], @acc[7]
+	mov	%rdx, @acc[6]
+___
+for (my $i=0; $i<6;) {
+my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1];
+$code.=<<___;
+	################################# reduction $i
+	mulq	8*0($n_ptr)
+	add	%rax, $hi		# guaranteed to be zero
+	mov	@acc[0], %rax
+	adc	%rdx, $hi
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($n_ptr)
+	add	%rax, @acc[2]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($n_ptr)
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	add	%rax, @acc[3]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*4($n_ptr)
+	add	%rax, @acc[4]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*5($n_ptr)
+	add	%rax, @acc[5]
+	mov	$b_next, %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[5]
+	adc	%rdx, @acc[6]
+	adc	\$0, @acc[7]
+___
+    push(@acc,shift(@acc));
+$code.=<<___	if ($i++<5);
+	################################# Multiply by b[$i]
+	mov	%rax, $bi
+	mulq	8*0($a_ptr)
+	add	%rax, @acc[0]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*1($a_ptr)
+	add	%rax, @acc[1]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*2($a_ptr)
+	add	%rax, @acc[2]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	 mov	@acc[0], $hi
+	 imulq	8(%rsp), @acc[0]
+
+	mulq	8*3($a_ptr)
+	add	%rax, @acc[3]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*4($a_ptr)
+	add	%rax, @acc[4]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*5($a_ptr)
+	add	@acc[7], @acc[5]
+	adc	\$0, %rdx
+	xor	@acc[7], @acc[7]
+	add	%rax, @acc[5]
+	mov	@acc[0], %rax
+	adc	%rdx, @acc[6]
+	adc	\$0, @acc[7]
+___
+}
+$code.=<<___;
+	#################################
+	# Branch-less conditional acc[0:6] - modulus
+
+	#mov	@acc[0], %rax
+	mov	8*2(%rsp), $r_ptr	# restore $r_ptr
+	sub	8*0($n_ptr), @acc[0]
+	mov	@acc[1], %rdx
+	sbb	8*1($n_ptr), @acc[1]
+	mov	@acc[2], $b_ptr
+	sbb	8*2($n_ptr), @acc[2]
+	mov	@acc[3], $a_ptr
+	sbb	8*3($n_ptr), @acc[3]
+	mov	@acc[4], $hi
+	sbb	8*4($n_ptr), @acc[4]
+	mov	@acc[5], @acc[7]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, @acc[6]
+
+	cmovc	%rax,    @acc[0]
+	cmovc	%rdx,    @acc[1]
+	cmovc	$b_ptr,  @acc[2]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	$a_ptr,  @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	$hi,     @acc[4]
+	mov	@acc[2], 8*2($r_ptr)
+	cmovc	@acc[7], @acc[5]
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__mulq_mont_384,.-__mulq_mont_384
+___
+} }
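+# sqr_n_mul_mont_384 squares its input as many times as the third
+# (count) argument says and then Montgomery-multiplies the result by
+# the sixth argument, staying in Montgomery form throughout.  The _383
+# variant below additionally skips the final conditional subtraction
+# inside the squaring loop (see the "omitting full reduction" comment).
+# Both exist to serve addition-chain exponentiation, e.g. modular
+# inversion.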
+$code.=<<___;
+.globl	sqr_n_mul_mont_384
+.hidden	sqr_n_mul_mont_384
+.type	sqr_n_mul_mont_384,\@function,6,"unwind"
+.align	32
+sqr_n_mul_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8*17, %rsp
+.cfi_adjust_cfa_offset	8*17
+.cfi_end_prologue
+
+	mov	$n0,    8*0(%rsp)
+	mov	$r_ptr, 8*1(%rsp)	# to __mulq_mont_384
+	mov	$n_ptr, 8*2(%rsp)
+	lea	8*4(%rsp), $r_ptr
+	mov	%r9, 8*3(%rsp)		# 6th, multiplicand argument
+	movq	(%r9), %xmm2		# prefetch b[0]
+
+.Loop_sqr_384:
+	movd	%edx, %xmm1		# loop counter
+
+	call	__sqrq_384
+
+	lea	0($r_ptr), $a_ptr
+	mov	8*0(%rsp), %rcx		# n0 for mul_by_1
+	mov	8*2(%rsp), $b_ptr	# n_ptr for mul_by_1
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	movd	%xmm1, %edx
+	lea	0($r_ptr), $a_ptr
+	dec	%edx
+	jnz	.Loop_sqr_384
+
+	movq	%xmm2, %rax		# b[0]
+	mov	$b_ptr, $n_ptr
+	mov	8*3(%rsp), $b_ptr	# 6th, multiplicand argument
+
+	#mov	8*0($b_ptr), %rax
+	#mov	8*0($a_ptr), @acc[6]
+	#mov	8*1($a_ptr), @acc[7]
+	#mov	8*2($a_ptr), @acc[4]
+	#mov	8*3($a_ptr), @acc[5]
+	mov	@acc[0], @acc[4]
+	mov	@acc[1], @acc[5]
+
+	call	__mulq_mont_384
+
+	lea	8*17(%rsp), %r8		# size optimization
+	mov	8*17(%rsp), %r15
+.cfi_restore	%r15
+	mov	8*1(%r8), %r14
+.cfi_restore	%r14
+	mov	8*2(%r8), %r13
+.cfi_restore	%r13
+	mov	8*3(%r8), %r12
+.cfi_restore	%r12
+	mov	8*4(%r8), %rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8), %rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8), %rsp
+.cfi_adjust_cfa_offset	-8*23
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_n_mul_mont_384,.-sqr_n_mul_mont_384
+
+.globl	sqr_n_mul_mont_383
+.hidden	sqr_n_mul_mont_383
+.type	sqr_n_mul_mont_383,\@function,6,"unwind"
+.align	32
+sqr_n_mul_mont_383:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8*17, %rsp
+.cfi_adjust_cfa_offset	8*17
+.cfi_end_prologue
+
+	mov	$n0, 8*0(%rsp)
+	mov	$r_ptr, 8*1(%rsp)	# to __mulq_mont_384
+	mov	$n_ptr, 8*2(%rsp)
+	lea	8*4(%rsp), $r_ptr
+	mov	%r9, 8*3(%rsp)		# 6th, multiplicand argument
+	movq	(%r9), %xmm2		# prefetch b[0]
+
+.Loop_sqr_383:
+	movd	%edx, %xmm1		# loop counter
+
+	call	__sqrq_384
+
+	lea	0($r_ptr), $a_ptr
+	mov	8*0(%rsp), %rcx		# n0 for mul_by_1
+	mov	8*2(%rsp), $b_ptr	# n_ptr for mul_by_1
+	call	__mulq_by_1_mont_384
+
+	movd	%xmm1, %edx		# loop counter
+	add	8*6($a_ptr), @acc[6]	# just accumulate upper half
+	adc	8*7($a_ptr), @acc[7]
+	adc	8*8($a_ptr), @acc[0]
+	adc	8*9($a_ptr), @acc[1]
+	adc	8*10($a_ptr), @acc[2]
+	adc	8*11($a_ptr), @acc[3]
+	lea	0($r_ptr), $a_ptr
+
+	mov	@acc[6], 8*0($r_ptr)	# omitting full reduction gives ~5%
+	mov	@acc[7], 8*1($r_ptr)	# in addition-chains
+	mov	@acc[0], 8*2($r_ptr)
+	mov	@acc[1], 8*3($r_ptr)
+	mov	@acc[2], 8*4($r_ptr)
+	mov	@acc[3], 8*5($r_ptr)
+
+	dec	%edx
+	jnz	.Loop_sqr_383
+
+	movq	%xmm2, %rax		# b[0]
+	mov	$b_ptr, $n_ptr
+	mov	8*3(%rsp), $b_ptr	# 6th, multiplicand argument
+
+	#movq	8*0($b_ptr), %rax
+	#mov	8*0($a_ptr), @acc[6]
+	#mov	8*1($a_ptr), @acc[7]
+	#mov	8*2($a_ptr), @acc[4]
+	#mov	8*3($a_ptr), @acc[5]
+	mov	@acc[0], @acc[4]
+	mov	@acc[1], @acc[5]
+
+	call	__mulq_mont_384		# formally one can omit full reduction
+					# even after multiplication...
+	lea	8*17(%rsp), %r8		# size optimization
+	mov	8*17(%rsp), %r15
+.cfi_restore	%r15
+	mov	8*1(%r8), %r14
+.cfi_restore	%r14
+	mov	8*2(%r8), %r13
+.cfi_restore	%r13
+	mov	8*3(%r8), %r12
+.cfi_restore	%r12
+	mov	8*4(%r8), %rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8), %rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8), %rsp
+.cfi_adjust_cfa_offset	-8*23
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_n_mul_mont_383,.-sqr_n_mul_mont_383
+___
+{ my @acc=@acc;				# will be rotated locally
+  my $bi = "%rbp";
+
+$code.=<<___;
+.type	__mulq_mont_383_nonred,\@abi-omnipotent
+.align	32
+__mulq_mont_383_nonred:
+	mov	%rax, $bi
+	mulq	@acc[6]			# a[0]*b[0]
+	mov	%rax, @acc[0]
+	mov	$bi, %rax
+	mov	%rdx, @acc[1]
+
+	mulq	@acc[7]			# a[1]*b[0]
+	add	%rax, @acc[1]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[2]
+
+	mulq	@acc[4]			# a[2]*b[0]
+	add	%rax, @acc[2]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[3]
+
+	 mov	@acc[0], @acc[7]
+	 imulq	8(%rsp), @acc[0]
+
+	mulq	@acc[5]			# a[3]*b[0]
+	add	%rax, @acc[3]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+
+	mulq	8*4($a_ptr)
+	add	%rax, @acc[4]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[5]
+
+	mulq	8*5($a_ptr)
+	add	%rax, @acc[5]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+___
+for (my $i=0; $i<6;) {
+my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1];
+$code.=<<___;
+	################################# reduction $i
+	mulq	8*0($n_ptr)
+	add	%rax, @acc[7]		# guaranteed to be zero
+	mov	@acc[0], %rax
+	adc	%rdx, @acc[7]
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*2($n_ptr)
+	add	%rax, @acc[2]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*3($n_ptr)
+	add	@acc[7], @acc[3]
+	adc	\$0, %rdx
+	add	%rax, @acc[3]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*4($n_ptr)
+	add	%rax, @acc[4]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[7]
+
+	mulq	8*5($n_ptr)
+	add	%rax, @acc[5]
+	mov	$b_next, %rax
+	adc	\$0, %rdx
+	add	@acc[7], @acc[5]
+	adc	%rdx, @acc[6]
+___
+    push(@acc,shift(@acc));
+$code.=<<___	if ($i++<5);
+	################################# Multiply by b[$i]
+	mov	%rax, $bi
+	mulq	8*0($a_ptr)
+	add	%rax, @acc[0]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+
+	mulq	8*1($a_ptr)
+	add	%rax, @acc[1]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[6], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+
+	mulq	8*2($a_ptr)
+	add	%rax, @acc[2]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[6], @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+
+	 mov	@acc[0], @acc[7]
+	 imulq	8(%rsp), @acc[0]
+
+	mulq	8*3($a_ptr)
+	add	%rax, @acc[3]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[6], @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+
+	mulq	8*4($a_ptr)
+	add	%rax, @acc[4]
+	mov	$bi, %rax
+	adc	\$0, %rdx
+	add	@acc[6], @acc[4]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+
+	mulq	8*5($a_ptr)
+	add	@acc[6], @acc[5]
+	adc	\$0, %rdx
+	add	%rax, @acc[5]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	mov	%rdx, @acc[6]
+___
+}
+$code.=<<___;
+	ret
+.size	__mulq_mont_383_nonred,.-__mulq_mont_383_nonred
+___
+}
+{ my $frame = 4*8 +	# place for argument off-load +
+	      2*384/8 +	# place for 2 384-bit temporary vectors
+	      8;	# align
+my @acc = (@acc,"%rax","%rdx","%rbx","%rbp");
+
+# omitting 3 reductions gives 8-11% better performance in add-chains
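+# The complex square below uses the same identities as sqr_382x above:
+#   ret->re = (a->re + a->im)*(a->re - a->im),  ret->im = 2*a->re*a->im
+# with both products run through __mulq_mont_383_nonred.  When
+# a->re - a->im borrows, the 6-limb difference is off by 2^384, so the
+# Montgomery product of t0 and t1 comes out too large by exactly
+# t0 = a->re + a->im; the tail of the routine subtracts the masked t0
+# and conditionally adds the modulus back to compensate.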
+$code.=<<___;
+.globl	sqr_mont_382x
+.hidden	sqr_mont_382x
+.type	sqr_mont_382x,\@function,4,"unwind"
+.align	32
+sqr_mont_382x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	mov	$n_ptr, 8*0(%rsp)	# n0
+	mov	$b_org, $n_ptr		# n_ptr
+	mov	$a_ptr, 8*2(%rsp)
+	mov	$r_ptr, 8*3(%rsp)
+
+	#################################
+	mov	8*0($a_ptr), @acc[0]	# a->re
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	mov	@acc[0], @acc[6]
+	add	8*6($a_ptr), @acc[0]	# a->re + a->im
+	mov	@acc[1], @acc[7]
+	adc	8*7($a_ptr), @acc[1]
+	mov	@acc[2], @acc[8]
+	adc	8*8($a_ptr), @acc[2]
+	mov	@acc[3], @acc[9]
+	adc	8*9($a_ptr), @acc[3]
+	mov	@acc[4], @acc[10]
+	adc	8*10($a_ptr), @acc[4]
+	mov	@acc[5], @acc[11]
+	adc	8*11($a_ptr), @acc[5]
+
+	sub	8*6($a_ptr), @acc[6]	# a->re - a->im
+	sbb	8*7($a_ptr), @acc[7]
+	sbb	8*8($a_ptr), @acc[8]
+	sbb	8*9($a_ptr), @acc[9]
+	sbb	8*10($a_ptr), @acc[10]
+	sbb	8*11($a_ptr), @acc[11]
+	sbb	$r_ptr, $r_ptr		# borrow flag as mask
+
+	mov	@acc[0], 32+8*0(%rsp)	# t0
+	mov	@acc[1], 32+8*1(%rsp)
+	mov	@acc[2], 32+8*2(%rsp)
+	mov	@acc[3], 32+8*3(%rsp)
+	mov	@acc[4], 32+8*4(%rsp)
+	mov	@acc[5], 32+8*5(%rsp)
+
+	mov	@acc[6], 32+8*6(%rsp)	# t1
+	mov	@acc[7], 32+8*7(%rsp)
+	mov	@acc[8], 32+8*8(%rsp)
+	mov	@acc[9], 32+8*9(%rsp)
+	mov	@acc[10], 32+8*10(%rsp)
+	mov	@acc[11], 32+8*11(%rsp)
+	mov	$r_ptr,   32+8*12(%rsp)
+
+	################################# mul_mont_384(ret->im, a->re, a->im, mod, n0);
+	#mov	8*2(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_ptr	# a->im
+
+	mov	48($a_ptr), %rax	# a->im
+	mov	8*0($a_ptr), @acc[6]	# a->re
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[4]
+	mov	8*3($a_ptr), @acc[5]
+
+	mov	8*3(%rsp), $r_ptr
+	call	__mulq_mont_383_nonred
+___
+{
+my @acc = map("%r$_",14,15,8..11,	# output from __mulq_mont_384
+                     12,13,"ax","bx","bp","si");
+$code.=<<___;
+	add	@acc[0], @acc[0]	# add with itself
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+
+	mov	@acc[0],  8*6($r_ptr)	# ret->im
+	mov	@acc[1],  8*7($r_ptr)
+	mov	@acc[2],  8*8($r_ptr)
+	mov	@acc[3],  8*9($r_ptr)
+	mov	@acc[4],  8*10($r_ptr)
+	mov	@acc[5],  8*11($r_ptr)
+___
+}
+$code.=<<___;
+	################################# mul_mont_384(ret->re, t0, t1, mod, n0);
+	lea	32(%rsp), $a_ptr	# t0
+	lea	32+8*6(%rsp), $b_ptr	# t1
+
+	mov	32+8*6(%rsp), %rax	# t1[0]
+	mov	32+8*0(%rsp), @acc[6]	# t0[0..3]
+	mov	32+8*1(%rsp), @acc[7]
+	mov	32+8*2(%rsp), @acc[4]
+	mov	32+8*3(%rsp), @acc[5]
+
+	call	__mulq_mont_383_nonred
+___
+{
+my @acc = map("%r$_",14,15,8..11,	# output from __mulq_mont_384
+                     12,13,"ax","bx","bp","si");
+$code.=<<___;
+	mov	32+8*12(%rsp), @acc[11]	# account for sign from a->re - a->im
+	mov	32+8*0(%rsp), @acc[6]
+	mov	32+8*1(%rsp), @acc[7]
+	and	@acc[11], @acc[6]
+	mov	32+8*2(%rsp), @acc[8]
+	and	@acc[11], @acc[7]
+	mov	32+8*3(%rsp), @acc[9]
+	and	@acc[11], @acc[8]
+	mov	32+8*4(%rsp), @acc[10]
+	and	@acc[11], @acc[9]
+	and	@acc[11], @acc[10]
+	and	32+8*5(%rsp), @acc[11]
+
+	sub	@acc[6], @acc[0]
+	mov	8*0($n_ptr), @acc[6]
+	sbb	@acc[7], @acc[1]
+	mov	8*1($n_ptr), @acc[7]
+	sbb	@acc[8], @acc[2]
+	mov	8*2($n_ptr), @acc[8]
+	sbb	@acc[9], @acc[3]
+	mov	8*3($n_ptr), @acc[9]
+	sbb	@acc[10], @acc[4]
+	mov	8*4($n_ptr), @acc[10]
+	sbb	@acc[11], @acc[5]
+	sbb	@acc[11], @acc[11]
+
+	and	@acc[11], @acc[6]
+	and	@acc[11], @acc[7]
+	and	@acc[11], @acc[8]
+	and	@acc[11], @acc[9]
+	and	@acc[11], @acc[10]
+	and	8*5($n_ptr), @acc[11]
+
+	add	@acc[6], @acc[0]
+	adc	@acc[7], @acc[1]
+	adc	@acc[8], @acc[2]
+	adc	@acc[9], @acc[3]
+	adc	@acc[10], @acc[4]
+	adc	@acc[11], @acc[5]
+
+	mov	@acc[0],  8*0($r_ptr)	# ret->re
+	mov	@acc[1],  8*1($r_ptr)
+	mov	@acc[2],  8*2($r_ptr)
+	mov	@acc[3],  8*3($r_ptr)
+	mov	@acc[4],  8*4($r_ptr)
+	mov	@acc[5],  8*5($r_ptr)
+___
+}
+$code.=<<___;
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqr_mont_382x,.-sqr_mont_382x
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/mulx_mont_256-x86_64.pl b/blst/asm/mulx_mont_256-x86_64.pl
new file mode 100755
index 0000000..0d6bf2e
--- /dev/null
+++ b/blst/asm/mulx_mont_256-x86_64.pl
@@ -0,0 +1,486 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# "Sparse" in subroutine names refers to most significant limb of the
+# modulus. Though "sparse" is a bit of misnomer, because limitation is
+# just not-all-ones. Or in other words not larger than 2^256-2^192-1.
+# In general Montgomery multiplication algorithm can handle one of the
+# inputs being non-reduced and capped by 1<<radix_width, 1<<256 in this
+# case, rather than the modulus. Whether or not mul_mont_sparse_256, a
+# *taylored* implementation of the algorithm, can handle such input can
+# be circumstantial. For example, in most general case it depends on
+# similar "bit sparsity" of individual limbs of the second, fully reduced
+# multiplicand. If you can't make such assumption about the limbs, then
+# non-reduced value shouldn't be larger than "same old" 2^256-2^192-1.
+# This requirement can be met by conditionally subtracting "bitwise
+# left-aligned" modulus. For example, if modulus is 200 bits wide, you
+# would need to conditionally subtract the value of modulus<<56. Common
+# source of non-reduced values is redc_mont_256 treating 512-bit inputs.
+# Well, more specifically ones with upper half not smaller than modulus.
+# Just in case, why limitation at all and not general-purpose 256-bit
+# subroutines? Unlike the 384-bit case, accounting for additional carry
+# has disproportionate impact on performance, especially in adcx/adox
+# implementation.
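+# As a rough reference point: any modulus whose most significant limb
+# is not 0xffffffffffffffff qualifies, and a 255-bit modulus (such as
+# the BLS12-381 group order these subroutines are presumably used
+# with) does so with plenty of margin.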
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$b_ptr = "%rbx";
+
+{ ############################################################## 255 bits
+my @acc=map("%r$_",(10..15));
+
+{ ############################################################## mulq
+my ($lo,$hi)=("%rbp","%r9");
+
+$code.=<<___;
+.text
+
+.globl	mulx_mont_sparse_256
+.hidden	mulx_mont_sparse_256
+.type	mulx_mont_sparse_256,\@function,5,"unwind"
+.align	32
+mulx_mont_sparse_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8,%rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $b_ptr		# evacuate from %rdx
+	mov	8*0($b_org), %rdx
+	mov	8*0($a_ptr), @acc[4]
+	mov	8*1($a_ptr), @acc[5]
+	mov	8*2($a_ptr), $lo
+	mov	8*3($a_ptr), $hi
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	@acc[4], %rax, @acc[1]	# a[0]*b[0]
+	call	__mulx_mont_sparse_256
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mulx_mont_sparse_256,.-mulx_mont_sparse_256
+
+.globl	sqrx_mont_sparse_256
+.hidden	sqrx_mont_sparse_256
+.type	sqrx_mont_sparse_256,\@function,4,"unwind"
+.align	32
+sqrx_mont_sparse_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8,%rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$a_ptr, $b_ptr
+	mov	$n_ptr, $n0
+	mov	$b_org, $n_ptr
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), @acc[5]
+	mov	8*2($a_ptr), $lo
+	mov	8*3($a_ptr), $hi
+	lea	-128($b_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	%rdx, %rax, @acc[1]	# a[0]*a[0]
+	call	__mulx_mont_sparse_256
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
+___
+{
+my @acc=@acc;
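+# __mulx_mont_sparse_256 interleaves multiplication and reduction using
+# two independent carry chains: mulx produces the 128-bit product
+# without touching the flags, while adcx carries through CF only and
+# adox through OF only, so the "multiply by b[i]" and "reduce" columns
+# can be folded together without saving and restoring carries.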
+$code.=<<___;
+.type	__mulx_mont_sparse_256,\@abi-omnipotent
+.align	32
+__mulx_mont_sparse_256:
+	mulx	@acc[5], @acc[5], @acc[2]
+	mulx	$lo, $lo, @acc[3]
+	add	@acc[5], @acc[1]
+	mulx	$hi, $hi, @acc[4]
+	 mov	8($b_ptr), %rdx
+	adc	$lo, @acc[2]
+	adc	$hi, @acc[3]
+	adc	\$0, @acc[4]
+
+___
+for (my $i=1; $i<4; $i++) {
+my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : "%rax";
+my $a5 = $i==1 ? @acc[5] : $lo;
+$code.=<<___;
+	 mov	%rax, @acc[0]
+	 imulq	$n0, %rax
+
+	################################# Multiply by b[$i]
+	xor	$a5, $a5		# [@acc[5]=0,] cf=0, of=0
+	mulx	8*0+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[1]
+	adcx	$hi, @acc[2]
+
+	mulx	8*1+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[2]
+	adcx	$hi, @acc[3]
+
+	mulx	8*2+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[3]
+	adcx	$hi, @acc[4]
+
+	mulx	8*3+128($a_ptr), $lo, $hi
+	 mov	%rax, %rdx
+	adox	$lo, @acc[4]
+	adcx	@acc[5], $hi 		# cf=0
+	adox	$hi, @acc[5]		# of=0
+
+	################################# reduction
+	mulx	8*0+128($n_ptr), $lo, %rax
+	adcx	$lo, @acc[0]		# guaranteed to be zero
+	adox	@acc[1], %rax
+
+	mulx	8*1+128($n_ptr), $lo, $hi
+	adcx	$lo, %rax		# @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3+128($n_ptr), $lo, $hi
+	 mov	$b_next, %rdx
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+	adcx	@acc[0], @acc[4]
+	adox	@acc[0], @acc[5]
+	adcx	@acc[0], @acc[5]
+	adox	@acc[0], @acc[0]	# acc[5] in next iteration
+	adc	\$0, @acc[0]		# cf=0, of=0
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	imulq	$n0, %rdx
+
+	################################# last reduction
+	xor	$lo, $lo		# cf=0, of=0
+	mulx	8*0+128($n_ptr), @acc[0], $hi
+	adcx	%rax, @acc[0]		# guaranteed to be zero
+	adox	$hi, @acc[1]
+
+	mulx	8*1+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3+128($n_ptr), $lo, $hi
+	 mov	@acc[1], %rdx
+	 lea	128($n_ptr), $n_ptr
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+	 mov	@acc[2], %rax
+	adcx	@acc[0], @acc[4]
+	adox	@acc[0], @acc[5]
+	adc	\$0, @acc[5]
+
+	#################################
+	# Branch-less conditional acc[1:5] - modulus
+
+	 mov	@acc[3], $lo
+	sub	8*0($n_ptr), @acc[1]
+	sbb	8*1($n_ptr), @acc[2]
+	sbb	8*2($n_ptr), @acc[3]
+	 mov	@acc[4], $hi
+	sbb	8*3($n_ptr), @acc[4]
+	sbb	\$0, @acc[5]
+
+	cmovc	%rdx, @acc[1]
+	cmovc	%rax, @acc[2]
+	cmovc	$lo,  @acc[3]
+	mov	@acc[1], 8*0($r_ptr)
+	cmovc	$hi,  @acc[4]
+	mov	@acc[2], 8*1($r_ptr)
+	mov	@acc[3], 8*2($r_ptr)
+	mov	@acc[4], 8*3($r_ptr)
+
+	ret
+.size	__mulx_mont_sparse_256,.-__mulx_mont_sparse_256
+___
+} }
+{ my ($n_ptr, $n0)=($b_ptr, $n_ptr);	# arguments are "shifted"
+
+$code.=<<___;
+.globl	fromx_mont_256
+.hidden	fromx_mont_256
+.type	fromx_mont_256,\@function,4,"unwind"
+.align	32
+fromx_mont_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulx_by_1_mont_256
+
+	#################################
+	# Branch-less conditional acc[0:3] - modulus
+
+	#mov	@acc[4], %rax		# __mulq_by_1_mont_256 does it
+	mov	@acc[5], %rdx
+	mov	@acc[0], @acc[2]
+	mov	@acc[1], @acc[3]
+
+	sub	8*0($n_ptr), @acc[4]
+	sbb	8*1($n_ptr), @acc[5]
+	sbb	8*2($n_ptr), @acc[0]
+	sbb	8*3($n_ptr), @acc[1]
+
+	cmovnc	@acc[4], %rax
+	cmovnc	@acc[5], %rdx
+	cmovnc	@acc[0], @acc[2]
+	mov	%rax,    8*0($r_ptr)
+	cmovnc	@acc[1], @acc[3]
+	mov	%rdx,    8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	fromx_mont_256,.-fromx_mont_256
+
+.globl	redcx_mont_256
+.hidden	redcx_mont_256
+.type	redcx_mont_256,\@function,4,"unwind"
+.align	32
+redcx_mont_256:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulx_by_1_mont_256
+
+	add	8*4($a_ptr), @acc[4]	# accumulate upper half
+	adc	8*5($a_ptr), @acc[5]
+	mov	@acc[4], %rax
+	adc	8*6($a_ptr), @acc[0]
+	mov	@acc[5], %rdx
+	adc	8*7($a_ptr), @acc[1]
+	sbb	$a_ptr, $a_ptr
+
+	#################################
+	# Branch-less conditional acc[0:4] - modulus
+
+	mov	@acc[0], @acc[2]
+	sub	8*0($n_ptr), @acc[4]
+	sbb	8*1($n_ptr), @acc[5]
+	sbb	8*2($n_ptr), @acc[0]
+	mov	@acc[1], @acc[3]
+	sbb	8*3($n_ptr), @acc[1]
+	sbb	\$0, $a_ptr
+
+	cmovnc	@acc[4], %rax 
+	cmovnc	@acc[5], %rdx
+	cmovnc	@acc[0], @acc[2]
+	mov	%rax,    8*0($r_ptr)
+	cmovnc	@acc[1], @acc[3]
+	mov	%rdx,    8*1($r_ptr)
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	redcx_mont_256,.-redcx_mont_256
+___
+{
+my @acc=@acc;
+
+$code.=<<___;
+.type	__mulx_by_1_mont_256,\@abi-omnipotent
+.align	32
+__mulx_by_1_mont_256:
+	mov	8*0($a_ptr), %rax
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+
+	mov	%rax, @acc[4]
+	imulq	$n0, %rax
+	mov	%rax, @acc[0]
+___
+for (my $i=0; $i<4; $i++) {
+my $hi = @acc[4];
+$code.=<<___;
+	################################# reduction $i
+	mulq	8*0($n_ptr)
+	add	%rax, @acc[4]		# guaranteed to be zero
+	mov	@acc[0], %rax
+	adc	%rdx, @acc[4]
+
+	mulq	8*1($n_ptr)
+	add	%rax, @acc[1]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	@acc[4], @acc[1]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*2($n_ptr)
+___
+$code.=<<___	if ($i<3);
+	 mov	@acc[1], @acc[5]
+	 imulq	$n0, @acc[1]
+___
+$code.=<<___;
+	add	%rax, @acc[2]
+	mov	@acc[0], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[2]
+	adc	\$0, %rdx
+	mov	%rdx, $hi
+
+	mulq	8*3($n_ptr)
+	add	%rax, @acc[3]
+	mov	@acc[1], %rax
+	adc	\$0, %rdx
+	add	$hi, @acc[3]
+	adc	\$0, %rdx
+	mov	%rdx, @acc[4]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	ret
+.size	__mulx_by_1_mont_256,.-__mulx_by_1_mont_256
+___
+} } }
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/mulx_mont_384-x86_64.pl b/blst/asm/mulx_mont_384-x86_64.pl
new file mode 100755
index 0000000..a762807
--- /dev/null
+++ b/blst/asm/mulx_mont_384-x86_64.pl
@@ -0,0 +1,2384 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+# common argument layout
+($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$b_ptr = "%rbx";
+
+# common accumulator layout
+@acc=map("%r$_",(8..15));
+
+########################################################################
+{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr);	# all registers are affected
+						# except for $n_ptr and $r_ptr
+$code.=<<___;
+.text
+
+########################################################################
+# Double-width subtraction modulo n<<384, as opposed to the naively
+# expected modulo n*n. It works because n<<384 is the actual
+# input boundary condition for Montgomery reduction, not n*n.
+# Just in case, this is duplicated, but only one module is
+# supposed to be linked...
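+#
+# In other words: a plain 768-bit subtraction is performed and, if it
+# borrows, n is added back into the upper six limbs only (i.e. n<<384
+# is added), which keeps the value in [0, n*2^384) and that is all
+# the subsequent Montgomery reduction requires.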
+.type	__sub_mod_384x384,\@abi-omnipotent
+.align	32
+__sub_mod_384x384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	mov	8*6($a_ptr), @acc[6]
+
+	sub	8*0($b_org), @acc[0]
+	mov	8*7($a_ptr), @acc[7]
+	sbb	8*1($b_org), @acc[1]
+	mov	8*8($a_ptr), @acc[8]
+	sbb	8*2($b_org), @acc[2]
+	mov	8*9($a_ptr), @acc[9]
+	sbb	8*3($b_org), @acc[3]
+	mov	8*10($a_ptr), @acc[10]
+	sbb	8*4($b_org), @acc[4]
+	mov	8*11($a_ptr), @acc[11]
+	sbb	8*5($b_org), @acc[5]
+	 mov	@acc[0], 8*0($r_ptr)
+	sbb	8*6($b_org), @acc[6]
+	 mov	8*0($n_ptr), @acc[0]
+	 mov	@acc[1], 8*1($r_ptr)
+	sbb	8*7($b_org), @acc[7]
+	 mov	8*1($n_ptr), @acc[1]
+	 mov	@acc[2], 8*2($r_ptr)
+	sbb	8*8($b_org), @acc[8]
+	 mov	8*2($n_ptr), @acc[2]
+	 mov	@acc[3], 8*3($r_ptr)
+	sbb	8*9($b_org), @acc[9]
+	 mov	8*3($n_ptr), @acc[3]
+	 mov	@acc[4], 8*4($r_ptr)
+	sbb	8*10($b_org), @acc[10]
+	 mov	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], 8*5($r_ptr)
+	sbb	8*11($b_org), @acc[11]
+	 mov	8*5($n_ptr), @acc[5]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[0]
+	and	$b_org, @acc[1]
+	and	$b_org, @acc[2]
+	and	$b_org, @acc[3]
+	and	$b_org, @acc[4]
+	and	$b_org, @acc[5]
+
+	add	@acc[0], @acc[6]
+	adc	@acc[1], @acc[7]
+	mov	@acc[6], 8*6($r_ptr)
+	adc	@acc[2], @acc[8]
+	mov	@acc[7], 8*7($r_ptr)
+	adc	@acc[3], @acc[9]
+	mov	@acc[8], 8*8($r_ptr)
+	adc	@acc[4], @acc[10]
+	mov	@acc[9], 8*9($r_ptr)
+	adc	@acc[5], @acc[11]
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	ret
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.type	__add_mod_384,\@abi-omnipotent
+.align	32
+__add_mod_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	add	8*0($b_org), @acc[0]
+	adc	8*1($b_org), @acc[1]
+	adc	8*2($b_org), @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	8*3($b_org), @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	8*4($b_org), @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	8*5($b_org), @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$b_org, $b_org
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $b_org
+
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	@acc[9],  @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[10], @acc[4]
+	mov	@acc[2], 8*2($r_ptr)
+	cmovc	@acc[11], @acc[5]
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.type	__sub_mod_384,\@abi-omnipotent
+.align	32
+__sub_mod_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+__sub_mod_384_a_is_loaded:
+	sub	8*0($b_org), @acc[0]
+	 mov	8*0($n_ptr), @acc[6]
+	sbb	8*1($b_org), @acc[1]
+	 mov	8*1($n_ptr), @acc[7]
+	sbb	8*2($b_org), @acc[2]
+	 mov	8*2($n_ptr), @acc[8]
+	sbb	8*3($b_org), @acc[3]
+	 mov	8*3($n_ptr), @acc[9]
+	sbb	8*4($b_org), @acc[4]
+	 mov	8*4($n_ptr), @acc[10]
+	sbb	8*5($b_org), @acc[5]
+	 mov	8*5($n_ptr), @acc[11]
+	sbb	$b_org, $b_org
+
+	and	$b_org, @acc[6]
+	and	$b_org, @acc[7]
+	and	$b_org, @acc[8]
+	and	$b_org, @acc[9]
+	and	$b_org, @acc[10]
+	and	$b_org, @acc[11]
+
+	add	@acc[6], @acc[0]
+	adc	@acc[7], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[8], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[9], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	adc	@acc[10], @acc[4]
+	mov	@acc[3], 8*3($r_ptr)
+	adc	@acc[11], @acc[5]
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+___
+}
+
+########################################################################
+# "Complex" multiplication and squaring. Use vanilla multiplication when
+# possible to fold reductions. I.e. instead of mul_mont, mul_mont
+# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod
+# followed by *common* reduction... For single multiplication disjoint
+# reduction is bad for performance for given vector length, yet overall
+# it's a win here, because it's one reduction less.
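+#
+# Roughly, for x = x0 + x1*i and y = y0 + y1*i with i^2 = -1, the
+# folded scheme below amounts to (informal sketch, names t0/t1/t2 and
+# redc mirror the comments in the code that follows):
+#
+#	t0 = x0*y0;  t1 = x1*y1;  t2 = (x0 + x1)*(y0 + y1);
+#	re = redc(t0 - t1);		# double-width sub, one reduction
+#	im = redc(t2 - t0 - t1);	# double-width subs, one reduction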
+{ my $frame = 5*8 +	# place for argument off-load +
+	      3*768/8;	# place for 3 768-bit temporary vectors
+$code.=<<___;
+.globl	mulx_mont_384x
+.hidden	mulx_mont_384x
+.type	mulx_mont_384x,\@function,5,"unwind"
+.align	32
+mulx_mont_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	mov	$b_org, $b_ptr
+	mov	$r_ptr, 8*4(%rsp)	# offload arguments
+	mov	$a_ptr, 8*3(%rsp)
+	mov	$b_org, 8*2(%rsp)
+	mov	$n_ptr, 8*1(%rsp)
+	mov	$n0,    8*0(%rsp)
+
+	################################# mul_384(t0, a->re, b->re);
+	#lea	0($b_ptr), $b_ptr	# b->re
+	#lea	0($a_ptr), $a_ptr	# a->re
+	lea	40(%rsp), $r_ptr	# t0
+	call	__mulx_384
+
+	################################# mul_384(t1, a->im, b->im);
+	lea	48($b_ptr), $b_ptr	# b->im
+	lea	128+48($a_ptr), $a_ptr	# a->im
+	lea	96($r_ptr), $r_ptr	# t1
+	call	__mulx_384
+
+	################################# mul_384(t2, a->re+a->im, b->re+b->im);
+	mov	8*1(%rsp), $n_ptr
+	lea	($b_ptr), $a_ptr	# b->im
+	lea	-48($b_ptr), $b_org	# b->re
+	lea	40+192+48(%rsp), $r_ptr
+	call	__add_mod_384
+
+	mov	8*3(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_org	# a->im
+	lea	-48($r_ptr), $r_ptr
+	call	__add_mod_384
+
+	lea	($r_ptr),$b_ptr
+	lea	48($r_ptr),$a_ptr
+	call	__mulx_384
+
+	################################# t2=t2-t0-t1
+	lea	($r_ptr), $a_ptr	# t2
+	lea	40(%rsp), $b_org	# t0
+	mov	8*1(%rsp), $n_ptr
+	call	__sub_mod_384x384	# t2-t0
+
+	lea	($r_ptr), $a_ptr	# t2
+	lea	-96($r_ptr), $b_org	# t1
+	call	__sub_mod_384x384	# t2-t0-t1
+
+	################################# t0=t0-t1
+	lea	40(%rsp), $a_ptr
+	lea	40+96(%rsp), $b_org
+	lea	40(%rsp), $r_ptr
+	call	__sub_mod_384x384	# t0-t1
+
+	lea	($n_ptr), $b_ptr	# n_ptr for redc_mont_384
+
+	################################# redc_mont_384(ret->re, t0, mod, n0);
+	lea	40(%rsp), $a_ptr	# t0
+	mov	8*0(%rsp), %rcx		# n0 for redc_mont_384
+	mov	8*4(%rsp), $r_ptr	# ret->re
+	call	__mulx_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	################################# redc_mont_384(ret->im, t2, mod, n0);
+	lea	40+192(%rsp), $a_ptr	# t2
+	mov	8*0(%rsp), %rcx		# n0 for redc_mont_384
+	lea	48($r_ptr), $r_ptr	# ret->im
+	call	__mulx_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mulx_mont_384x,.-mulx_mont_384x
+___
+}
+{ my $frame = 4*8 +	# place for argument off-load +
+	      2*384/8 +	# place for 2 384-bit temporary vectors
+	      8;	# alignment
+$code.=<<___;
+.globl	sqrx_mont_384x
+.hidden	sqrx_mont_384x
+.type	sqrx_mont_384x,\@function,4,"unwind"
+.align	32
+sqrx_mont_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	mov	$n_ptr, 8*0(%rsp)	# n0
+	mov	$b_org, $n_ptr		# n_ptr
+					# gap for __mulx_mont_384
+	mov	$r_ptr, 8*2(%rsp)
+	mov	$a_ptr, 8*3(%rsp)
+
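+	# (The steps below rely on the usual complex-squaring shortcut,
+	#  roughly: ret->re = (a->re + a->im)*(a->re - a->im) and
+	#  ret->im = 2*a->re*a->im, so only two Montgomery multiplications
+	#  are needed.)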
+	################################# add_mod_384(t0, a->re, a->im);
+	lea	48($a_ptr), $b_org	# a->im
+	lea	32(%rsp), $r_ptr	# t0
+	call	__add_mod_384
+
+	################################# sub_mod_384(t1, a->re, a->im);
+	mov	8*3(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_org	# a->im
+	lea	32+48(%rsp), $r_ptr	# t1
+	call	__sub_mod_384
+
+	################################# mul_mont_384(ret->im, a->re, a->im, mod, n0);
+	mov	8*3(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_ptr	# a->im
+
+	mov	48($a_ptr), %rdx
+	mov	8*0($a_ptr), %r14	# @acc[6]
+	mov	8*1($a_ptr), %r15	# @acc[7]
+	mov	8*2($a_ptr), %rax	# @acc[8]
+	mov	8*3($a_ptr), %r12	# @acc[4]
+	mov	8*4($a_ptr), %rdi	# $lo
+	mov	8*5($a_ptr), %rbp	# $hi
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	%r14, %r8, %r9
+	call	__mulx_mont_384
+___
+{
+my @acc = map("%r$_","dx",15,"ax",12,"di","bp",	# output from __mulx_mont_384
+                      8..11,13,14);
+$code.=<<___;
+	add	@acc[0], @acc[0]	# add with itself
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	 mov	@acc[0], @acc[6]
+	adc	@acc[3], @acc[3]
+	 mov	@acc[1], @acc[7]
+	adc	@acc[4], @acc[4]
+	 mov	@acc[2], @acc[8]
+	adc	@acc[5], @acc[5]
+	 mov	@acc[3], @acc[9]
+	sbb	$a_ptr, $a_ptr
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	 mov	@acc[4], @acc[10]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	 mov	@acc[5], @acc[11]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $a_ptr
+
+	cmovc	@acc[6],  @acc[0]
+	cmovc	@acc[7],  @acc[1]
+	cmovc	@acc[8],  @acc[2]
+	mov	@acc[0], 8*6($b_ptr)	# ret->im
+	cmovc	@acc[9],  @acc[3]
+	mov	@acc[1], 8*7($b_ptr)
+	cmovc	@acc[10], @acc[4]
+	mov	@acc[2], 8*8($b_ptr)
+	cmovc	@acc[11], @acc[5]
+	mov	@acc[3], 8*9($b_ptr)
+	mov	@acc[4], 8*10($b_ptr)
+	mov	@acc[5], 8*11($b_ptr)
+___
+}
+$code.=<<___;
+	################################# mul_mont_384(ret->re, t0, t1, mod, n0);
+	lea	32(%rsp), $a_ptr	# t0
+	lea	32+48(%rsp), $b_ptr	# t1
+
+	mov	32+48(%rsp), %rdx	# t1[0]
+	mov	32+8*0(%rsp), %r14	# @acc[6]
+	mov	32+8*1(%rsp), %r15	# @acc[7]
+	mov	32+8*2(%rsp), %rax	# @acc[8]
+	mov	32+8*3(%rsp), %r12	# @acc[4]
+	mov	32+8*4(%rsp), %rdi	# $lo
+	mov	32+8*5(%rsp), %rbp	# $hi
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	%r14, %r8, %r9
+	call	__mulx_mont_384
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_mont_384x,.-sqrx_mont_384x
+
+.globl	mulx_382x
+.hidden	mulx_382x
+.type	mulx_382x,\@function,4,"unwind"
+.align	32
+mulx_382x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	lea	96($r_ptr), $r_ptr	# ret->im
+	mov	$a_ptr, 8*0(%rsp)
+	mov	$b_org, 8*1(%rsp)
+	mov	$r_ptr, 8*2(%rsp)	# offload ret->im
+	mov	$n_ptr, 8*3(%rsp)
+
+	################################# t0 = a->re + a->im
+	mov	8*0($a_ptr), @acc[0]
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	add	8*6($a_ptr), @acc[0]
+	adc	8*7($a_ptr), @acc[1]
+	adc	8*8($a_ptr), @acc[2]
+	adc	8*9($a_ptr), @acc[3]
+	adc	8*10($a_ptr), @acc[4]
+	adc	8*11($a_ptr), @acc[5]
+
+	mov	@acc[0], 32+8*0(%rsp)
+	mov	@acc[1], 32+8*1(%rsp)
+	mov	@acc[2], 32+8*2(%rsp)
+	mov	@acc[3], 32+8*3(%rsp)
+	mov	@acc[4], 32+8*4(%rsp)
+	mov	@acc[5], 32+8*5(%rsp)
+
+	################################# t1 = b->re + b->im
+	mov	8*0($b_org), @acc[0]
+	mov	8*1($b_org), @acc[1]
+	mov	8*2($b_org), @acc[2]
+	mov	8*3($b_org), @acc[3]
+	mov	8*4($b_org), @acc[4]
+	mov	8*5($b_org), @acc[5]
+
+	add	8*6($b_org), @acc[0]
+	adc	8*7($b_org), @acc[1]
+	adc	8*8($b_org), @acc[2]
+	adc	8*9($b_org), @acc[3]
+	adc	8*10($b_org), @acc[4]
+	adc	8*11($b_org), @acc[5]
+
+	mov	@acc[0], 32+8*6(%rsp)
+	mov	@acc[1], 32+8*7(%rsp)
+	mov	@acc[2], 32+8*8(%rsp)
+	mov	@acc[3], 32+8*9(%rsp)
+	mov	@acc[4], 32+8*10(%rsp)
+	mov	@acc[5], 32+8*11(%rsp)
+
+	################################# mul_384(ret->im, t0, t1);
+	lea	32+8*0(%rsp), $a_ptr	# t0
+	lea	32+8*6(%rsp), $b_ptr	# t1
+	call	__mulx_384
+
+	################################# mul_384(ret->re, a->re, b->re);
+	mov	8*0(%rsp), $a_ptr
+	mov	8*1(%rsp), $b_ptr
+	lea	-96($r_ptr), $r_ptr	# ret->re
+	call	__mulx_384
+
+	################################# mul_384(tx, a->im, b->im);
+	lea	48+128($a_ptr), $a_ptr
+	lea	48($b_ptr), $b_ptr
+	lea	32(%rsp), $r_ptr
+	call	__mulx_384
+
+	################################# ret->im -= tx
+	mov	8*2(%rsp), $a_ptr	# restore ret->im
+	lea	32(%rsp), $b_org
+	mov	8*3(%rsp), $n_ptr
+	mov	$a_ptr, $r_ptr
+	call	__sub_mod_384x384
+
+	################################# ret->im -= ret->re
+	lea	0($r_ptr), $a_ptr
+	lea	-96($r_ptr), $b_org
+	call	__sub_mod_384x384
+
+	################################# ret->re -= tx
+	lea	-96($r_ptr), $a_ptr
+	lea	32(%rsp), $b_org
+	lea	-96($r_ptr), $r_ptr
+	call	__sub_mod_384x384
+
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mulx_382x,.-mulx_382x
+___
+}
+{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org);	# all registers are affected
+						# except for $n_ptr and $r_ptr
+$code.=<<___;
+.globl	sqrx_382x
+.hidden	sqrx_382x
+.type	sqrx_382x,\@function,3,"unwind"
+.align	32
+sqrx_382x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$a_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+
+	################################# t0 = a->re + a->im
+	mov	8*0($a_ptr), @acc[6]
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[9]
+	mov	8*4($a_ptr), @acc[10]
+	mov	8*5($a_ptr), @acc[11]
+
+	mov	@acc[6], @acc[0]
+	add	8*6($a_ptr), @acc[6]
+	mov	@acc[7], @acc[1]
+	adc	8*7($a_ptr), @acc[7]
+	mov	@acc[8], @acc[2]
+	adc	8*8($a_ptr), @acc[8]
+	mov	@acc[9], @acc[3]
+	adc	8*9($a_ptr), @acc[9]
+	mov	@acc[10], @acc[4]
+	adc	8*10($a_ptr), @acc[10]
+	mov	@acc[11], @acc[5]
+	adc	8*11($a_ptr), @acc[11]
+
+	mov	@acc[6], 8*0($r_ptr)
+	mov	@acc[7], 8*1($r_ptr)
+	mov	@acc[8], 8*2($r_ptr)
+	mov	@acc[9], 8*3($r_ptr)
+	mov	@acc[10], 8*4($r_ptr)
+	mov	@acc[11], 8*5($r_ptr)
+
+	################################# t1 = a->re - a->im
+	lea	48($a_ptr), $b_org
+	lea	48($r_ptr), $r_ptr
+	call	__sub_mod_384_a_is_loaded
+
+	################################# mul_384(ret->re, t0, t1);
+	lea	($r_ptr), $a_ptr
+	lea	-48($r_ptr), $b_ptr
+	lea	-48($r_ptr), $r_ptr
+	call	__mulx_384
+
+	################################# mul_384(ret->im, a->re, a->im);
+	mov	(%rsp), $a_ptr
+	lea	48($a_ptr), $b_ptr
+	lea	96($r_ptr), $r_ptr
+	call	__mulx_384
+
+	mov	8*0($r_ptr), @acc[0]	# double ret->im
+	mov	8*1($r_ptr), @acc[1]
+	mov	8*2($r_ptr), @acc[2]
+	mov	8*3($r_ptr), @acc[3]
+	mov	8*4($r_ptr), @acc[4]
+	mov	8*5($r_ptr), @acc[5]
+	mov	8*6($r_ptr), @acc[6]
+	mov	8*7($r_ptr), @acc[7]
+	mov	8*8($r_ptr), @acc[8]
+	mov	8*9($r_ptr), @acc[9]
+	mov	8*10($r_ptr), @acc[10]
+	add	@acc[0], @acc[0]
+	mov	8*11($r_ptr), @acc[11]
+	adc	@acc[1], @acc[1]
+	mov	@acc[0], 8*0($r_ptr)
+	adc	@acc[2], @acc[2]
+	mov	@acc[1], 8*1($r_ptr)
+	adc	@acc[3], @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	adc	@acc[4], @acc[4]
+	mov	@acc[3], 8*3($r_ptr)
+	adc	@acc[5], @acc[5]
+	mov	@acc[4], 8*4($r_ptr)
+	adc	@acc[6], @acc[6]
+	mov	@acc[5], 8*5($r_ptr)
+	adc	@acc[7], @acc[7]
+	mov	@acc[6], 8*6($r_ptr)
+	adc	@acc[8], @acc[8]
+	mov	@acc[7], 8*7($r_ptr)
+	adc	@acc[9], @acc[9]
+	mov	@acc[8], 8*8($r_ptr)
+	adc	@acc[10], @acc[10]
+	mov	@acc[9], 8*9($r_ptr)
+	adc	@acc[11], @acc[11]
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	mov	8*1(%rsp),%r15
+.cfi_restore	%r15
+	mov	8*2(%rsp),%r14
+.cfi_restore	%r14
+	mov	8*3(%rsp),%r13
+.cfi_restore	%r13
+	mov	8*4(%rsp),%r12
+.cfi_restore	%r12
+	mov	8*5(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	8*6(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	8*7(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*7
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_382x,.-sqrx_382x
+___
+}
+{ ########################################################## 384-bit mulx
+my ($a0, $a1) = @acc[6..7];
+my @acc = @acc[0..5];
+my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp");
+
+$code.=<<___;
+.globl	mulx_384
+.hidden	mulx_384
+.type	mulx_384,\@function,3,"unwind"
+.align	32
+mulx_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.cfi_end_prologue
+
+	mov	$b_org, $b_ptr		# evacuate from %rdx
+	call	__mulx_384
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mulx_384,.-mulx_384
+
+.type	__mulx_384,\@abi-omnipotent
+.align	32
+__mulx_384:
+	mov	8*0($b_ptr), %rdx
+	mov	8*0($a_ptr), $a0
+	mov	8*1($a_ptr), $a1
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+	lea	-128($a_ptr), $a_ptr
+
+	mulx	$a0, @acc[1], $hi
+	xor	$zr, $zr
+
+	mulx	$a1, @acc[0], $lo
+	adcx	$hi, @acc[0]
+	mov	@acc[1], 8*0($r_ptr)
+
+	mulx	@acc[2], @acc[1], $hi
+	adcx	$lo, @acc[1]
+
+	mulx	@acc[3], @acc[2], $lo
+	adcx	$hi, @acc[2]
+
+	mulx	@acc[4], @acc[3], $hi
+	adcx	$lo, @acc[3]
+
+	mulx	@acc[5], @acc[4], @acc[5]
+	mov	8*1($b_ptr), %rdx
+	adcx	$hi, @acc[4]
+	adcx	$zr, @acc[5]
+___
+for(my $i=1; $i<6; $i++) {
+my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax";
+$code.=<<___;
+	mulx	$a0, $lo, $hi
+	adcx	@acc[0], $lo
+	adox	$hi, @acc[1]
+	mov	$lo, 8*$i($r_ptr)
+
+	mulx	$a1, @acc[0], $hi
+	adcx	@acc[1], @acc[0]
+	adox	$hi, @acc[2]
+
+	mulx	128+8*2($a_ptr), @acc[1], $lo
+	adcx	@acc[2], @acc[1]
+	adox	$lo, @acc[3]
+
+	mulx	128+8*3($a_ptr), @acc[2], $hi
+	adcx	@acc[3], @acc[2]
+	adox	$hi, @acc[4]
+
+	mulx	128+8*4($a_ptr), @acc[3], $lo
+	adcx	@acc[4], @acc[3]
+	adox	@acc[5], $lo
+
+	mulx	128+8*5($a_ptr), @acc[4], @acc[5]
+	mov	$b_next, %rdx
+	adcx	$lo, @acc[4]
+	adox	$zr, @acc[5]
+	adcx	$zr, @acc[5]
+___
+}
+$code.=<<___;
+	mov	@acc[0], 8*6($r_ptr)
+	mov	@acc[1], 8*7($r_ptr)
+	mov	@acc[2], 8*8($r_ptr)
+	mov	@acc[3], 8*9($r_ptr)
+	mov	@acc[4], 8*10($r_ptr)
+	mov	@acc[5], 8*11($r_ptr)
+
+	ret
+.size	__mulx_384,.-__mulx_384
+___
+}
+{ ########################################################## 384-bit sqrx
+$code.=<<___;
+.globl	sqrx_384
+.hidden	sqrx_384
+.type	sqrx_384,\@function,2,"unwind"
+.align	32
+sqrx_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$r_ptr
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	call	__sqrx_384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_384,.-sqrx_384
+___
+if (0) {
+# up to 5% slower than below variant
+my @acc=map("%r$_",("no",8..15,"cx","bx"));
+   push(@acc, $a_ptr);
+my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno");
+
+$code.=<<___;
+.type	__sqrx_384,\@abi-omnipotent
+.align	32
+__sqrx_384:
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[9]
+	mov	8*4($a_ptr), @acc[10]
+
+	#########################################
+	mulx	@acc[7], @acc[1], $lo		# a[1]*a[0]
+	 mov	8*5($a_ptr), @acc[11]
+	mulx	@acc[8], @acc[2], $hi		# a[2]*a[0]
+	add	$lo, @acc[2]
+	mulx	@acc[9], @acc[3], $lo		# a[3]*a[0]
+	adc	$hi, @acc[3]
+	mulx	@acc[10], @acc[4], $hi		# a[4]*a[0]
+	adc	$lo, @acc[4]
+	mulx	@acc[11], @acc[5], @acc[6]	# a[5]*a[0]
+	adc	$hi, @acc[5]
+	adc	\$0, @acc[6]
+
+	mulx	%rdx, $lo, $hi			# a[0]*a[0]
+	 mov	@acc[7], %rdx
+	xor	@acc[7], @acc[7]
+	add	@acc[1], @acc[1]		# double acc[1]
+	adc	\$0, @acc[7]
+	add	$hi, @acc[1]
+	adc	\$0, @acc[7]
+	mov	$lo, 8*0($r_ptr)
+	mov	@acc[1], 8*1($r_ptr)
+___
+($carry, @acc[7]) = (@acc[7], @acc[1]);
+$code.=<<___;
+	#########################################
+	xor	@acc[7], @acc[7]
+	mulx	@acc[8], $lo, $hi		# a[2]*a[1]
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+
+	mulx	@acc[9], $lo, $hi		# a[3]*a[1]
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+
+	mulx	@acc[10], $lo, $hi		# a[4]*a[1]
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+
+	mulx	@acc[11], $lo, $hi		# a[5]*a[1]
+	adcx	$lo, @acc[6]
+	adox	@acc[7], $hi
+	adcx	$hi, @acc[7]
+
+	mulx	%rdx, $lo, $hi			# a[1]*a[1]
+	 mov	@acc[8], %rdx
+	xor	@acc[8], @acc[8]
+	adox	@acc[2], @acc[2]		# double acc[2:3]
+	adcx	$carry, $lo			# can't carry
+	adox	@acc[3], @acc[3]
+	adcx	$lo, @acc[2]
+	adox	@acc[8], @acc[8]
+	adcx	$hi, @acc[3]
+	adc	\$0, @acc[8]
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+___
+($carry,@acc[8])=(@acc[8],$carry);
+$code.=<<___;
+	#########################################
+	xor	@acc[8], @acc[8]
+	mulx	@acc[9], $lo, $hi		# a[3]*a[2]
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+
+	mulx	@acc[10], $lo, $hi		# a[4]*a[2]
+	adcx	$lo, @acc[6]
+	adox	$hi, @acc[7]
+
+	mulx	@acc[11], $lo, $hi		# a[5]*a[2]
+	adcx	$lo, @acc[7]
+	adox	@acc[8], $hi
+	adcx	$hi, @acc[8]
+
+	mulx	%rdx, $lo, $hi			# a[2]*a[2]
+	 mov	@acc[9], %rdx
+	xor	@acc[9], @acc[9]
+	adox	@acc[4], @acc[4]		# double acc[4:5]
+	adcx	$carry, $lo			# can't carry
+	adox	@acc[5], @acc[5]
+	adcx	$lo, @acc[4]
+	adox	@acc[9], @acc[9]
+	adcx	$hi, @acc[5]
+	adc	\$0, @acc[9]
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+___
+($carry,@acc[9])=(@acc[9],$carry);
+$code.=<<___;
+	#########################################
+	xor	@acc[9], @acc[9]
+	mulx	@acc[10], $lo, $hi		# a[4]*a[3]
+	adcx	$lo, @acc[7]
+	adox	$hi, @acc[8]
+
+	mulx	@acc[11], $lo, $hi		# a[5]*a[3]
+	adcx	$lo, @acc[8]
+	adox	@acc[9], $hi
+	adcx	$hi, @acc[9]
+
+	mulx	%rdx, $lo, $hi
+	 mov	@acc[10], %rdx
+	xor	@acc[10], @acc[10]
+	adox	@acc[6], @acc[6]		# double acc[6:7]
+	adcx	$carry, $lo			# can't carry
+	adox	@acc[7], @acc[7]
+	adcx	$lo, @acc[6]
+	adox	@acc[10], @acc[10]
+	adcx	$hi, @acc[7]
+	adc	\$0, @acc[10]
+	mov	@acc[6], 8*6($r_ptr)
+	mov	@acc[7], 8*7($r_ptr)
+___
+($carry,@acc[10])=(@acc[10],$carry);
+$code.=<<___;
+	#########################################
+	mulx	@acc[11], $lo, @acc[10]		# a[5]*a[4]
+	add	$lo, @acc[9]
+	adc	\$0, @acc[10]
+
+	mulx	%rdx, $lo, $hi			# a[4]*a[4]
+	 mov	@acc[11], %rdx
+	xor	@acc[11], @acc[11]
+	adox	@acc[8], @acc[8]		# double acc[8:10]
+	adcx	$carry, $lo			# can't carry
+	adox	@acc[9], @acc[9]
+	adcx	$lo, @acc[8]
+	adox	@acc[10], @acc[10]
+	adcx	$hi, @acc[9]
+	adox	@acc[11], @acc[11]
+	mov	@acc[8], 8*8($r_ptr)
+	mov	@acc[9], 8*9($r_ptr)
+
+	#########################################
+	mulx	%rdx, $lo, $hi			# a[5]*a[5]
+	adcx	$lo, @acc[10]
+	adcx	$hi, @acc[11]
+
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	ret
+.size	__sqrx_384,.-__sqrx_384
+___
+} else {
+my @acc=map("%r$_",("no",8..15,"cx","bx","bp"));
+my ($lo, $hi)=($r_ptr, "%rax");
+
+$code.=<<___;
+.type	__sqrx_384,\@abi-omnipotent
+.align	32
+__sqrx_384:
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[9]
+	mov	8*4($a_ptr), @acc[10]
+
+	#########################################
+	mulx	@acc[7], @acc[1], $lo		# a[1]*a[0]
+	 mov	8*5($a_ptr), @acc[11]
+	mulx	@acc[8], @acc[2], $hi		# a[2]*a[0]
+	add	$lo, @acc[2]
+	mulx	@acc[9], @acc[3], $lo		# a[3]*a[0]
+	adc	$hi, @acc[3]
+	mulx	@acc[10], @acc[4], $hi		# a[4]*a[0]
+	adc	$lo, @acc[4]
+	mulx	@acc[11], @acc[5], @acc[6]	# a[5]*a[0]
+	 mov	@acc[7], %rdx
+	adc	$hi, @acc[5]
+	adc	\$0, @acc[6]
+
+	#########################################
+	xor	@acc[7], @acc[7]
+	mulx	@acc[8], $lo, $hi		# a[2]*a[1]
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+
+	mulx	@acc[9], $lo, $hi		# a[3]*a[1]
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+
+	mulx	@acc[10], $lo, $hi		# a[4]*a[1]
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+
+	mulx	@acc[11], $lo, $hi		# a[5]*a[1]
+	 mov	@acc[8], %rdx
+	adcx	$lo, @acc[6]
+	adox	@acc[7], $hi
+	adcx	$hi, @acc[7]
+
+	#########################################
+	xor	@acc[8], @acc[8]
+	mulx	@acc[9], $lo, $hi		# a[3]*a[2]
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+
+	mulx	@acc[10], $lo, $hi		# a[4]*a[2]
+	adcx	$lo, @acc[6]
+	adox	$hi, @acc[7]
+
+	mulx	@acc[11], $lo, $hi		# a[5]*a[2]
+	 mov	@acc[9], %rdx
+	adcx	$lo, @acc[7]
+	adox	@acc[8], $hi
+	adcx	$hi, @acc[8]
+
+	#########################################
+	xor	@acc[9], @acc[9]
+	mulx	@acc[10], $lo, $hi		# a[4]*a[3]
+	adcx	$lo, @acc[7]
+	adox	$hi, @acc[8]
+
+	mulx	@acc[11], $lo, $hi		# a[5]*a[3]
+	 mov	@acc[10], %rdx
+	adcx	$lo, @acc[8]
+	adox	@acc[9], $hi
+	adcx	$hi, @acc[9]
+
+	#########################################
+	mulx	@acc[11], $lo, @acc[10]		# a[5]*a[4]
+	 mov	8*0($a_ptr), %rdx
+	add	$lo, @acc[9]
+	 mov	8(%rsp), $r_ptr			# restore $r_ptr
+	adc	\$0, @acc[10]
+
+	######################################### double acc[1:10]
+	xor	@acc[11], @acc[11]
+	adcx	@acc[1], @acc[1]
+	adcx	@acc[2], @acc[2]
+	adcx	@acc[3], @acc[3]
+	adcx	@acc[4], @acc[4]
+	adcx	@acc[5], @acc[5]
+
+	######################################### accumulate a[i]*a[i]
+	mulx	%rdx, %rdx, $hi 		# a[0]*a[0]
+	mov	%rdx, 8*0($r_ptr)
+	mov	8*1($a_ptr), %rdx
+	adox	$hi, @acc[1]
+	mov	@acc[1], 8*1($r_ptr)
+
+	mulx	%rdx, @acc[1], $hi		# a[1]*a[1]
+	mov	8*2($a_ptr), %rdx
+	adox	@acc[1], @acc[2]
+	adox	$hi,     @acc[3]
+	mov	@acc[2], 8*2($r_ptr)
+	mov	@acc[3], 8*3($r_ptr)
+
+	mulx	%rdx, @acc[1], @acc[2]		# a[2]*a[2]
+	mov	8*3($a_ptr), %rdx
+	adox	@acc[1], @acc[4]
+	adox	@acc[2], @acc[5]
+	adcx	@acc[6], @acc[6]
+	adcx	@acc[7], @acc[7]
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	mulx	%rdx, @acc[1], @acc[2]		# a[3]*a[3]
+	mov	8*4($a_ptr), %rdx
+	adox	@acc[1], @acc[6]
+	adox	@acc[2], @acc[7]
+	adcx	@acc[8], @acc[8]
+	adcx	@acc[9], @acc[9]
+	mov	@acc[6], 8*6($r_ptr)
+	mov	@acc[7], 8*7($r_ptr)
+
+	mulx	%rdx, @acc[1], @acc[2]		# a[4]*a[4]
+	mov	8*5($a_ptr), %rdx
+	adox	@acc[1], @acc[8]
+	adox	@acc[2], @acc[9]
+	adcx	@acc[10], @acc[10]
+	adcx	@acc[11], @acc[11]
+	mov	@acc[8], 8*8($r_ptr)
+	mov	@acc[9], 8*9($r_ptr)
+
+	mulx	%rdx, @acc[1], @acc[2]		# a[5]*a[5]
+	adox	@acc[1], @acc[10]
+	adox	@acc[2], @acc[11]
+
+	mov	@acc[10], 8*10($r_ptr)
+	mov	@acc[11], 8*11($r_ptr)
+
+	ret
+.size	__sqrx_384,.-__sqrx_384
+___
+}
+
+{ ########################################################## 384-bit redcx_mont
+my ($n_ptr, $n0)=($b_ptr, $n_ptr);      # arguments are "shifted"
+my ($lo, $hi) = ("%rax", "%rbp");
+
+$code.=<<___;
+########################################################################
+# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12],
+#                     uint64_t m[6], uint64_t n0);
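+#
+# Conceptually ret = a * 2^-384 mod m (an informal reading, matching the
+# split below): __mulx_by_1_mont_384 clears the low 384 bits of a, and
+# __redc_tail_mont_384 folds in the upper half and performs the final
+# conditional subtraction of m.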
+.globl	redcx_mont_384
+.hidden	redcx_mont_384
+.type	redcx_mont_384,\@function,4,"unwind"
+.align	32
+redcx_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulx_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	redcx_mont_384,.-redcx_mont_384
+
+########################################################################
+# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6],
+#                    uint64_t m[6], uint64_t n0);
+.globl	fromx_mont_384
+.hidden	fromx_mont_384
+.type	fromx_mont_384,\@function,4,"unwind"
+.align	32
+fromx_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$b_org, $n_ptr
+	call	__mulx_by_1_mont_384
+
+	#################################
+	# Branch-less conditional acc[0:6] - modulus
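+	# (roughly: t = acc - m; acc = borrow ? acc : t, done with cmovc so
+	#  the branch pattern does not depend on the data)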
+
+	mov	@acc[6], %rax
+	mov	@acc[7], %rcx
+	mov	@acc[0], %rdx
+	mov	@acc[1], %rbp
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[7]
+	mov	@acc[2], @acc[5]
+	sbb	8*2($n_ptr), @acc[0]
+	sbb	8*3($n_ptr), @acc[1]
+	sbb	8*4($n_ptr), @acc[2]
+	mov	@acc[3], $a_ptr
+	sbb	8*5($n_ptr), @acc[3]
+
+	cmovc	%rax, @acc[6]
+	cmovc	%rcx, @acc[7]
+	cmovc	%rdx, @acc[0]
+	mov	@acc[6], 8*0($r_ptr)
+	cmovc	%rbp, @acc[1]
+	mov	@acc[7], 8*1($r_ptr)
+	cmovc	@acc[5], @acc[2]
+	mov	@acc[0], 8*2($r_ptr)
+	cmovc	$a_ptr,  @acc[3]
+	mov	@acc[1], 8*3($r_ptr)
+	mov	@acc[2], 8*4($r_ptr)
+	mov	@acc[3], 8*5($r_ptr)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	fromx_mont_384,.-fromx_mont_384
+___
+{ my @acc=@acc;				# will be rotated locally
+
+$code.=<<___;
+.type	__mulx_by_1_mont_384,\@abi-omnipotent
+.align	32
+__mulx_by_1_mont_384:
+	mov	8*0($a_ptr), @acc[0]
+	mov	$n0, %rdx
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+___
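+# Each of the six iterations below computes m = acc[0]*n0 mod 2^64 and
+# accumulates m times the modulus, which is guaranteed to clear the
+# lowest limb; rotating @acc then, in effect, shifts the value right by
+# one 64-bit limb, so after all six passes it has been divided by 2^384.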
+for (my $i=0; $i<6; $i++) {
+$code.=<<___;
+	imulq	@acc[0], %rdx
+
+	################################# reduction $i
+	xor	@acc[6], @acc[6]	# @acc[6]=0, cf=0, of=0
+	mulx	8*0($n_ptr), $lo, $hi
+	adcx	$lo, @acc[0]		# guaranteed to be zero
+	adox	$hi, @acc[1]
+
+	mulx	8*1($n_ptr), $lo, $hi
+	adcx	$lo, @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3($n_ptr), $lo, $hi
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+
+	mulx	8*4($n_ptr), $lo, $hi
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+
+	mulx	8*5($n_ptr), $lo, $hi
+	 mov	$n0, %rdx
+	adcx	$lo, @acc[5]
+	adox	@acc[6], $hi
+	adcx	$hi, @acc[6]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	ret
+.size	__mulx_by_1_mont_384,.-__mulx_by_1_mont_384
+
+.type	__redc_tail_mont_384,\@abi-omnipotent
+.align	32
+__redc_tail_mont_384:
+	add	8*6($a_ptr), @acc[0]	# accumulate upper half
+	mov	@acc[0], %rax
+	adc	8*7($a_ptr), @acc[1]
+	adc	8*8($a_ptr), @acc[2]
+	adc	8*9($a_ptr), @acc[3]
+	mov	@acc[1], %rcx
+	adc	8*10($a_ptr), @acc[4]
+	adc	8*11($a_ptr), @acc[5]
+	sbb	@acc[6], @acc[6]
+
+	#################################
+	# Branch-less conditional acc[0:6] - modulus
+
+	mov	@acc[2], %rdx
+	mov	@acc[3], %rbp
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	mov	@acc[4], @acc[7]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	mov	@acc[5], $a_ptr
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, @acc[6]
+
+	cmovc	%rax, @acc[0]
+	cmovc	%rcx, @acc[1]
+	cmovc	%rdx, @acc[2]
+	mov	@acc[0], 8*0($r_ptr)
+	cmovc	%rbp, @acc[3]
+	mov	@acc[1], 8*1($r_ptr)
+	cmovc	@acc[7], @acc[4]
+	mov	@acc[2], 8*2($r_ptr)
+	cmovc	$a_ptr,  @acc[5]
+	mov	@acc[3], 8*3($r_ptr)
+	mov	@acc[4], 8*4($r_ptr)
+	mov	@acc[5], 8*5($r_ptr)
+
+	ret
+.size	__redc_tail_mont_384,.-__redc_tail_mont_384
+
+.globl	sgn0x_pty_mont_384
+.hidden	sgn0x_pty_mont_384
+.type	sgn0x_pty_mont_384,\@function,3,"unwind"
+.align	32
+sgn0x_pty_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$a_ptr, $n_ptr
+	lea	0($r_ptr), $a_ptr
+	mov	$b_org, $n0
+	call	__mulx_by_1_mont_384
+
+	xor	%rax, %rax
+	mov	@acc[0], @acc[7]
+	add	@acc[0], @acc[0]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, %rax
+
+	sub	8*0($n_ptr), @acc[0]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, %rax
+
+	not	%rax			# 2*x > p, which means "negative"
+	and	\$1, @acc[7]
+	and	\$2, %rax
+	or	@acc[7], %rax		# pack sign and parity
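+	# (informally: bit 0 of the return value is the parity of the
+	#  canonical value x, bit 1 is set when 2*x > p, i.e. when x falls
+	#  in the "negative" half of the field)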
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sgn0x_pty_mont_384,.-sgn0x_pty_mont_384
+
+.globl	sgn0x_pty_mont_384x
+.hidden	sgn0x_pty_mont_384x
+.type	sgn0x_pty_mont_384x,\@function,3,"unwind"
+.align	32
+sgn0x_pty_mont_384x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$8, %rsp
+.cfi_adjust_cfa_offset	8
+.cfi_end_prologue
+
+	mov	$a_ptr, $n_ptr
+	lea	48($r_ptr), $a_ptr	# sgn0(a->im)
+	mov	$b_org, $n0
+	call	__mulx_by_1_mont_384
+
+	mov	@acc[0], @acc[6]
+	or	@acc[1], @acc[0]
+	or	@acc[2], @acc[0]
+	or	@acc[3], @acc[0]
+	or	@acc[4], @acc[0]
+	or	@acc[5], @acc[0]
+
+	lea	0($r_ptr), $a_ptr	# sgn0(a->re)
+	xor	$r_ptr, $r_ptr
+	mov	@acc[6], @acc[7]
+	add	@acc[6], @acc[6]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, $r_ptr
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, $r_ptr
+
+	mov	@acc[0], 0(%rsp)	# a->im is zero or not
+	not	$r_ptr			# 2*x > p, which means "negative"
+	and	\$1, @acc[7]
+	and	\$2, $r_ptr
+	or	@acc[7], $r_ptr		# pack sign and parity
+
+	call	__mulx_by_1_mont_384
+
+	mov	@acc[0], @acc[6]
+	or	@acc[1], @acc[0]
+	or	@acc[2], @acc[0]
+	or	@acc[3], @acc[0]
+	or	@acc[4], @acc[0]
+	or	@acc[5], @acc[0]
+
+	xor	%rax, %rax
+	mov	@acc[6], @acc[7]
+	add	@acc[6], @acc[6]
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+	adc	\$0, %rax
+
+	sub	8*0($n_ptr), @acc[6]
+	sbb	8*1($n_ptr), @acc[1]
+	sbb	8*2($n_ptr), @acc[2]
+	sbb	8*3($n_ptr), @acc[3]
+	sbb	8*4($n_ptr), @acc[4]
+	sbb	8*5($n_ptr), @acc[5]
+	sbb	\$0, %rax
+
+	mov	0(%rsp), @acc[6]
+
+	not	%rax			# 2*x > p, which means "negative"
+
+	test	@acc[0], @acc[0]
+	cmovz	$r_ptr, @acc[7]		# a->re==0? prty(a->im) : prty(a->re)
+
+	test	@acc[6], @acc[6]
+	cmovnz	$r_ptr, %rax		# a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	\$1, @acc[7]
+	and	\$2, %rax
+	or	@acc[7], %rax		# pack sign and parity
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	48(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x
+___
+} }
+
+{ ########################################################## mulx/sqrx_mont
+my @acc = (@acc, "%rax");
+my ($lo,$hi)=("%rdi","%rbp");
+
+$code.=<<___;
+.globl	mulx_mont_384
+.hidden	mulx_mont_384
+.type	mulx_mont_384,\@function,5,"unwind"
+.align	32
+mulx_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	lea	-8*3(%rsp), %rsp
+.cfi_adjust_cfa_offset	8*3
+.cfi_end_prologue
+
+	mov	$b_org, $b_ptr		# evacuate from %rdx
+	mov	8*0($b_org), %rdx
+	mov	8*0($a_ptr), @acc[6]
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[4]
+	mov	$r_ptr, 8*2(%rsp)
+	mov	8*4($a_ptr), $lo
+	mov	8*5($a_ptr), $hi
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+	mov	$n0, (%rsp)
+
+	mulx	@acc[6],@acc[0],@acc[1]	# a[0]*b[0]
+	call	__mulx_mont_384
+
+	mov	8*3(%rsp),%r15
+.cfi_restore	%r15
+	mov	8*4(%rsp),%r14
+.cfi_restore	%r14
+	mov	8*5(%rsp),%r13
+.cfi_restore	%r13
+	mov	8*6(%rsp),%r12
+.cfi_restore	%r12
+	mov	8*7(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	8*8(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	8*9(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*9
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	mulx_mont_384,.-mulx_mont_384
+___
+{ my @acc=@acc;				# will be rotated locally
+
+$code.=<<___;
+.type	__mulx_mont_384,\@abi-omnipotent
+.align	32
+__mulx_mont_384:
+.cfi_startproc
+	mulx	@acc[7], @acc[6], @acc[2]
+	mulx	@acc[8], @acc[7], @acc[3]
+	add	@acc[6], @acc[1]
+	mulx	@acc[4], @acc[8], @acc[4]
+	adc	@acc[7], @acc[2]
+	mulx	$lo, $lo, @acc[5]
+	adc	@acc[8], @acc[3]
+	mulx	$hi, $hi, @acc[6]
+	 mov	8($b_ptr), %rdx
+	adc	$lo, @acc[4]
+	adc	$hi, @acc[5]
+	adc	\$0, @acc[6]
+	xor	@acc[7], @acc[7]
+
+___
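+# Each pass of the loop below interleaves multiplication by the next
+# limb of b with one Montgomery reduction step (essentially a CIOS-style
+# pass), using adcx/adox to keep two independent carry chains in flight.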
+for (my $i=1; $i<6; $i++) {
+my $tt = $i==1 ? @acc[7] : $hi;
+my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1];
+$code.=<<___;
+	 mov	@acc[0], 16(%rsp)
+	 imulq	8(%rsp), @acc[0]
+
+	################################# Multiply by b[$i]
+	xor	@acc[8], @acc[8]	# @acc[8]=0, cf=0, of=0
+	mulx	8*0+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[1]
+	adcx	$hi, @acc[2]
+
+	mulx	8*1+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[2]
+	adcx	$hi, @acc[3]
+
+	mulx	8*2+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[3]
+	adcx	$hi, @acc[4]
+
+	mulx	8*3+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[4]
+	adcx	$hi, @acc[5]
+
+	mulx	8*4+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[5]
+	adcx	$hi, @acc[6]
+
+	mulx	8*5+128($a_ptr), $lo, $hi
+	 mov	@acc[0], %rdx
+	adox	$lo, @acc[6]
+	adcx	$hi, @acc[7]		# cf=0
+	adox	@acc[8], @acc[7]
+	adox	@acc[8], @acc[8]
+
+	################################# reduction
+	xor	@acc[0], @acc[0]	# acc[0]=0, cf=0, of=0
+	mulx	8*0+128($n_ptr), $lo, $hi
+	adcx	16(%rsp), $lo		# guaranteed to be zero
+	adox	$hi, @acc[1]
+
+	mulx	8*1+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+
+	mulx	8*4+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+
+	mulx	8*5+128($n_ptr), $lo, $hi
+	 mov	$b_next, %rdx
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+	adcx	@acc[0], @acc[6]
+	adox	@acc[0], @acc[7]
+	adcx	@acc[0], @acc[7]
+	adox	@acc[0], @acc[8]
+	adcx	@acc[0], @acc[8]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	imulq	8(%rsp), %rdx
+	mov	8*3(%rsp), $b_ptr	# restore $r_ptr
+
+	################################# last reduction
+	xor	@acc[8], @acc[8]	# @acc[8]=0, cf=0, of=0
+	mulx	8*0+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[0]		# guaranteed to be zero
+	adox	$hi, @acc[1]
+
+	mulx	8*1+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+	 mov	@acc[2], @acc[0]
+
+	mulx	8*4+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+	 mov	@acc[3], $a_ptr
+
+	mulx	8*5+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+	 mov	@acc[1], %rdx
+	adcx	@acc[8], @acc[6]
+	adox	@acc[8], @acc[7]
+	 lea	128($n_ptr), $n_ptr
+	 mov	@acc[4], @acc[8]
+	adc	\$0, @acc[7]
+
+	#################################
+	# Branch-less conditional acc[1:7] - modulus
+
+	sub	8*0($n_ptr), @acc[1]
+	sbb	8*1($n_ptr), @acc[2]
+	 mov	@acc[5], $lo
+	sbb	8*2($n_ptr), @acc[3]
+	sbb	8*3($n_ptr), @acc[4]
+	sbb	8*4($n_ptr), @acc[5]
+	 mov	@acc[6], $hi
+	sbb	8*5($n_ptr), @acc[6]
+	sbb	\$0, @acc[7]
+
+	cmovnc	@acc[1], %rdx
+	cmovc	@acc[0], @acc[2]
+	cmovc	$a_ptr, @acc[3]
+	cmovnc	@acc[4], @acc[8]
+	mov	%rdx, 8*0($b_ptr)
+	cmovnc	@acc[5], $lo
+	mov	@acc[2], 8*1($b_ptr)
+	cmovnc	@acc[6], $hi
+	mov	@acc[3], 8*2($b_ptr)
+	mov	@acc[8], 8*3($b_ptr)
+	mov	$lo, 8*4($b_ptr)
+	mov	$hi, 8*5($b_ptr)
+
+	ret
+.cfi_endproc
+.size	__mulx_mont_384,.-__mulx_mont_384
+___
+}
+$code.=<<___;
+.globl	sqrx_mont_384
+.hidden	sqrx_mont_384
+.type	sqrx_mont_384,\@function,4,"unwind"
+.align	32
+sqrx_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	lea	-8*3(%rsp), %rsp
+.cfi_adjust_cfa_offset	8*3
+.cfi_end_prologue
+
+	mov	$n_ptr, $n0		# n0
+	lea	-128($b_org), $n_ptr	# control u-op density
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	8*3($a_ptr), @acc[4]
+	mov	$r_ptr, 8*2(%rsp)
+	mov	8*4($a_ptr), $lo
+	mov	8*5($a_ptr), $hi
+
+	lea	($a_ptr), $b_ptr
+	mov	$n0, (%rsp)		# n0
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+
+	mulx	%rdx, @acc[0], @acc[1]	# a[0]*a[0]
+	call	__mulx_mont_384		# as fast as dedicated squaring
+
+	mov	8*3(%rsp),%r15
+.cfi_restore	%r15
+	mov	8*4(%rsp),%r14
+.cfi_restore	%r14
+	mov	8*5(%rsp),%r13
+.cfi_restore	%r13
+	mov	8*6(%rsp),%r12
+.cfi_restore	%r12
+	mov	8*7(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	8*8(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	8*9(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*9
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_mont_384,.-sqrx_mont_384
+
+.globl	sqrx_n_mul_mont_384
+.hidden	sqrx_n_mul_mont_384
+.type	sqrx_n_mul_mont_384,\@function,6,"unwind"
+.align	32
+sqrx_n_mul_mont_384:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	lea	-8*5(%rsp), %rsp
+.cfi_adjust_cfa_offset	8*5
+.cfi_end_prologue
+
+	mov	$b_org, @acc[2]		# loop counter
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	$a_ptr, $b_ptr
+	mov	8*3($a_ptr), @acc[4]
+	mov	$r_ptr, 8*2(%rsp)	# to __mulx_mont_384
+	mov	8*4($a_ptr), $lo
+	mov	8*5($a_ptr), $hi
+
+	mov	$n0, (%rsp)
+	mov	%r9, 8*3(%rsp)		# 6th, multiplicand argument
+	movq	8*0(%r9), %xmm2		# prefetch b[0]
+
+.Loop_sqrx_384:
+	movd	@acc[2]d, %xmm1
+	lea	-128($b_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	%rdx, @acc[0], @acc[1]	# a[0]*a[0]
+	call	__mulx_mont_384
+
+	movd	%xmm1, @acc[2]d
+	dec	@acc[2]d
+	jnz	.Loop_sqrx_384
+
+	mov	%rdx, @acc[6]
+	movq	%xmm2, %rdx		# b[0]
+	lea	-128($b_ptr), $a_ptr	# control u-op density
+	mov	8*3(%rsp), $b_ptr	# 6th, multiplicand argument
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	@acc[6],@acc[0],@acc[1]	# a[0]*b[0]
+	call	__mulx_mont_384
+
+	mov	8*5(%rsp),%r15
+.cfi_restore	%r15
+	mov	8*6(%rsp),%r14
+.cfi_restore	%r14
+	mov	8*7(%rsp),%r13
+.cfi_restore	%r13
+	mov	8*8(%rsp),%r12
+.cfi_restore	%r12
+	mov	8*9(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	8*10(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	8*11(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*11
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384
+
+.globl	sqrx_n_mul_mont_383
+.hidden	sqrx_n_mul_mont_383
+.type	sqrx_n_mul_mont_383,\@function,6,"unwind"
+.align	32
+sqrx_n_mul_mont_383:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	lea	-8*5(%rsp), %rsp
+.cfi_adjust_cfa_offset	8*5
+.cfi_end_prologue
+
+	mov	$b_org, @acc[2]		# loop counter
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), @acc[7]
+	mov	8*2($a_ptr), @acc[8]
+	mov	$a_ptr, $b_ptr
+	mov	8*3($a_ptr), @acc[4]
+	mov	$r_ptr, 8*2(%rsp)	# to __mulx_mont_383_nonred
+	mov	8*4($a_ptr), $lo
+	mov	8*5($a_ptr), $hi
+
+	mov	$n0, (%rsp)
+	mov	%r9, 8*3(%rsp)		# 6th, multiplicand argument
+	movq	8*0(%r9), %xmm2		# prefetch b[0]
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+.Loop_sqrx_383:
+	movd	@acc[2]d, %xmm1
+	lea	-128($b_ptr), $a_ptr	# control u-op density
+
+	mulx	%rdx, @acc[0], @acc[1]	# a[0]*a[0]
+	call	__mulx_mont_383_nonred	# omitting full reduction gives ~15%
+					# better performance in addition-chains
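+					# (the nonred variant skips the final
+					# conditional subtraction, so results
+					# stay below 2*p rather than p; for a
+					# 383-bit modulus that is still a
+					# valid input to the next squaring)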
+	movd	%xmm1, @acc[2]d
+	dec	@acc[2]d
+	jnz	.Loop_sqrx_383
+
+	mov	%rdx, @acc[6]
+	movq	%xmm2, %rdx		# b[0]
+	lea	-128($b_ptr), $a_ptr	# control u-op density
+	mov	8*3(%rsp), $b_ptr	# 6th, multiplicand argument
+
+	mulx	@acc[6], @acc[0], @acc[1]	# a[0]*b[0]
+	call	__mulx_mont_384
+
+	mov	8*5(%rsp),%r15
+.cfi_restore	%r15
+	mov	8*6(%rsp),%r14
+.cfi_restore	%r14
+	mov	8*7(%rsp),%r13
+.cfi_restore	%r13
+	mov	8*8(%rsp),%r12
+.cfi_restore	%r12
+	mov	8*9(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	8*10(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	8*11(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*11
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383
+___
+{ my @acc=@acc;				# will be rotated locally
+
+$code.=<<___;
+.type	__mulx_mont_383_nonred,\@abi-omnipotent
+.align	32
+__mulx_mont_383_nonred:
+.cfi_startproc
+	mulx	@acc[7], @acc[6], @acc[2]
+	mulx	@acc[8], @acc[7], @acc[3]
+	add	@acc[6], @acc[1]
+	mulx	@acc[4], @acc[8], @acc[4]
+	adc	@acc[7], @acc[2]
+	mulx	$lo, $lo, @acc[5]
+	adc	@acc[8], @acc[3]
+	mulx	$hi, $hi, @acc[6]
+	 mov	8($b_ptr), %rdx
+	adc	$lo, @acc[4]
+	adc	$hi, @acc[5]
+	adc	\$0, @acc[6]
+___
+for (my $i=1; $i<6; $i++) {
+my $tt = $i==1 ? @acc[7] : $hi;
+my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1];
+$code.=<<___;
+	 mov	@acc[0], @acc[8]
+	 imulq	8(%rsp), @acc[0]
+
+	################################# Multiply by b[$i]
+	xor	@acc[7], @acc[7]	# @acc[7]=0, cf=0, of=0
+	mulx	8*0+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[1]
+	adcx	$hi, @acc[2]
+
+	mulx	8*1+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[2]
+	adcx	$hi, @acc[3]
+
+	mulx	8*2+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[3]
+	adcx	$hi, @acc[4]
+
+	mulx	8*3+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[4]
+	adcx	$hi, @acc[5]
+
+	mulx	8*4+128($a_ptr), $lo, $hi
+	adox	$lo, @acc[5]
+	adcx	$hi, @acc[6]
+
+	mulx	8*5+128($a_ptr), $lo, $hi
+	 mov	@acc[0], %rdx
+	adox	$lo, @acc[6]
+	adcx	@acc[7], $hi
+	adox	$hi, @acc[7]
+
+	################################# reduction
+	xor	@acc[0], @acc[0]	# acc[0]=0, cf=0, of=0
+	mulx	8*0+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[8]		# guaranteed to be zero
+	adox	$hi, @acc[1]
+
+	mulx	8*1+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+
+	mulx	8*4+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+
+	mulx	8*5+128($n_ptr), $lo, $hi
+	 mov	$b_next, %rdx
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+	adcx	@acc[8], @acc[6]
+	adox	@acc[8], @acc[7]
+	adcx	@acc[8], @acc[7]
+___
+    push(@acc,shift(@acc));
+}
+$code.=<<___;
+	imulq	8(%rsp), %rdx
+	mov	8*3(%rsp), $b_ptr	# restore $r_ptr
+
+	################################# last reduction
+	xor	@acc[8], @acc[8]	# @acc[8]=0, cf=0, of=0
+	mulx	8*0+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[0]		# guaranteed to be zero
+	adox	$hi, @acc[1]
+
+	mulx	8*1+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[1]
+	adox	$hi, @acc[2]
+
+	mulx	8*2+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[2]
+	adox	$hi, @acc[3]
+
+	mulx	8*3+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[3]
+	adox	$hi, @acc[4]
+
+	mulx	8*4+128($n_ptr), $lo, $hi
+	adcx	$lo, @acc[4]
+	adox	$hi, @acc[5]
+
+	mulx	8*5+128($n_ptr), $lo, $hi
+	 mov	@acc[1], %rdx
+	adcx	$lo, @acc[5]
+	adox	$hi, @acc[6]
+	adc	\$0, @acc[6]
+	 mov	@acc[4], @acc[8]
+
+	mov	@acc[1], 8*0($b_ptr)
+	mov	@acc[2], 8*1($b_ptr)
+	mov	@acc[3], 8*2($b_ptr)
+	 mov	@acc[5], $lo
+	mov	@acc[4], 8*3($b_ptr)
+	mov	@acc[5], 8*4($b_ptr)
+	mov	@acc[6], 8*5($b_ptr)
+	 mov	@acc[6], $hi
+
+	ret
+.cfi_endproc
+.size	__mulx_mont_383_nonred,.-__mulx_mont_383_nonred
+___
+} } }
+{ my $frame = 4*8 +	# place for argument off-load +
+	      2*384/8 +	# place for 2 384-bit temporary vectors
+	      8;	# align
+my @acc = (@acc,"%rax","%rdx","%rbx","%rbp");
+
+# omitting 3 reductions gives ~10% better performance in add-chains
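+# Note that t1 = a->re - a->im below is formed without conditionally
+# adding the modulus back; the borrow is kept as an all-ones mask and
+# the matching correction is applied to the second product afterwards
+# (see the "account for sign" block further down).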
+$code.=<<___;
+.globl	sqrx_mont_382x
+.hidden	sqrx_mont_382x
+.type	sqrx_mont_382x,\@function,4,"unwind"
+.align	32
+sqrx_mont_382x:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	sub	\$$frame, %rsp
+.cfi_adjust_cfa_offset	$frame
+.cfi_end_prologue
+
+	mov	$n_ptr, 8*0(%rsp)	# n0
+	mov	$b_org, $n_ptr		# n_ptr
+	mov	$r_ptr, 8*2(%rsp)
+	mov	$a_ptr, 8*3(%rsp)
+
+	#################################
+	mov	8*0($a_ptr), @acc[0]	# a->re
+	mov	8*1($a_ptr), @acc[1]
+	mov	8*2($a_ptr), @acc[2]
+	mov	8*3($a_ptr), @acc[3]
+	mov	8*4($a_ptr), @acc[4]
+	mov	8*5($a_ptr), @acc[5]
+
+	mov	@acc[0], @acc[6]
+	add	8*6($a_ptr), @acc[0]	# a->re + a->im
+	mov	@acc[1], @acc[7]
+	adc	8*7($a_ptr), @acc[1]
+	mov	@acc[2], @acc[8]
+	adc	8*8($a_ptr), @acc[2]
+	mov	@acc[3], @acc[9]
+	adc	8*9($a_ptr), @acc[3]
+	mov	@acc[4], @acc[10]
+	adc	8*10($a_ptr), @acc[4]
+	mov	@acc[5], @acc[11]
+	adc	8*11($a_ptr), @acc[5]
+
+	sub	8*6($a_ptr), @acc[6]	# a->re - a->im
+	sbb	8*7($a_ptr), @acc[7]
+	sbb	8*8($a_ptr), @acc[8]
+	sbb	8*9($a_ptr), @acc[9]
+	sbb	8*10($a_ptr), @acc[10]
+	sbb	8*11($a_ptr), @acc[11]
+	sbb	$r_ptr, $r_ptr		# borrow flag as mask
+
+	mov	@acc[0], 32+8*0(%rsp)	# t0
+	mov	@acc[1], 32+8*1(%rsp)
+	mov	@acc[2], 32+8*2(%rsp)
+	mov	@acc[3], 32+8*3(%rsp)
+	mov	@acc[4], 32+8*4(%rsp)
+	mov	@acc[5], 32+8*5(%rsp)
+
+	mov	@acc[6], 32+8*6(%rsp)	# t1
+	mov	@acc[7], 32+8*7(%rsp)
+	mov	@acc[8], 32+8*8(%rsp)
+	mov	@acc[9], 32+8*9(%rsp)
+	mov	@acc[10], 32+8*10(%rsp)
+	mov	@acc[11], 32+8*11(%rsp)
+	mov	$r_ptr,   32+8*12(%rsp)
+
+	################################# mul_mont_384(ret->im, a->re, a->im, mod, n0);
+	#mov	8*3(%rsp), $a_ptr	# a->re
+	lea	48($a_ptr), $b_ptr	# a->im
+
+	mov	48($a_ptr), %rdx
+	mov	8*0($a_ptr), %r14	# @acc[6]
+	mov	8*1($a_ptr), %r15	# @acc[7]
+	mov	8*2($a_ptr), %rax	# @acc[8]
+	mov	8*3($a_ptr), %r12	# @acc[4]
+	mov	8*4($a_ptr), %rdi	# $lo
+	mov	8*5($a_ptr), %rbp	# $hi
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	%r14, %r8, %r9
+	call	__mulx_mont_383_nonred
+___
+{
+my @acc = map("%r$_","dx",15,"ax",12,"di","bp",	# output from __mulx_mont_384
+                      8..11,13,14);
+$code.=<<___;
+	add	@acc[0], @acc[0]	# add with itself
+	adc	@acc[1], @acc[1]
+	adc	@acc[2], @acc[2]
+	adc	@acc[3], @acc[3]
+	adc	@acc[4], @acc[4]
+	adc	@acc[5], @acc[5]
+
+	mov	@acc[0],  8*6($b_ptr)	# ret->im
+	mov	@acc[1],  8*7($b_ptr)
+	mov	@acc[2],  8*8($b_ptr)
+	mov	@acc[3],  8*9($b_ptr)
+	mov	@acc[4],  8*10($b_ptr)
+	mov	@acc[5],  8*11($b_ptr)
+___
+}
+$code.=<<___;
+	################################# mul_mont_384(ret->re, t0, t1, mod, n0);
+	lea	32-128(%rsp), $a_ptr	# t0 [+u-op density]
+	lea	32+8*6(%rsp), $b_ptr	# t1
+
+	mov	32+8*6(%rsp), %rdx	# t1[0]
+	mov	32+8*0(%rsp), %r14	# @acc[6]
+	mov	32+8*1(%rsp), %r15	# @acc[7]
+	mov	32+8*2(%rsp), %rax	# @acc[8]
+	mov	32+8*3(%rsp), %r12	# @acc[4]
+	mov	32+8*4(%rsp), %rdi	# $lo
+	mov	32+8*5(%rsp), %rbp	# $hi
+	#lea	-128($a_ptr), $a_ptr	# control u-op density
+	#lea	-128($n_ptr), $n_ptr	# control u-op density
+
+	mulx	%r14, %r8, %r9
+	call	__mulx_mont_383_nonred
+___
+{
+my @acc = map("%r$_","dx",15,"ax",12,"di","bp",	# output from __mulx_mont_384
+                      8..11,13,14);
+$code.=<<___;
+	mov	32+8*12(%rsp), @acc[11]	# account for sign from a->re - a->im
+	lea	128($n_ptr), $n_ptr
+	mov	32+8*0(%rsp), @acc[6]
+	and	@acc[11], @acc[6]
+	mov	32+8*1(%rsp), @acc[7]
+	and	@acc[11], @acc[7]
+	mov	32+8*2(%rsp), @acc[8]
+	and	@acc[11], @acc[8]
+	mov	32+8*3(%rsp), @acc[9]
+	and	@acc[11], @acc[9]
+	mov	32+8*4(%rsp), @acc[10]
+	and	@acc[11], @acc[10]
+	and	32+8*5(%rsp), @acc[11]
+
+	sub	@acc[6], @acc[0]
+	mov	8*0($n_ptr), @acc[6]
+	sbb	@acc[7], @acc[1]
+	mov	8*1($n_ptr), @acc[7]
+	sbb	@acc[8], @acc[2]
+	mov	8*2($n_ptr), @acc[8]
+	sbb	@acc[9], @acc[3]
+	mov	8*3($n_ptr), @acc[9]
+	sbb	@acc[10], @acc[4]
+	mov	8*4($n_ptr), @acc[10]
+	sbb	@acc[11], @acc[5]
+	sbb	@acc[11], @acc[11]
+
+	and	@acc[11], @acc[6]
+	and	@acc[11], @acc[7]
+	and	@acc[11], @acc[8]
+	and	@acc[11], @acc[9]
+	and	@acc[11], @acc[10]
+	and	8*5($n_ptr), @acc[11]
+
+	add	@acc[6], @acc[0]
+	adc	@acc[7], @acc[1]
+	adc	@acc[8], @acc[2]
+	adc	@acc[9], @acc[3]
+	adc	@acc[10], @acc[4]
+	adc	@acc[11], @acc[5]
+
+	mov	@acc[0],  8*0($b_ptr)	# ret->re
+	mov	@acc[1],  8*1($b_ptr)
+	mov	@acc[2],  8*2($b_ptr)
+	mov	@acc[3],  8*3($b_ptr)
+	mov	@acc[4],  8*4($b_ptr)
+	mov	@acc[5],  8*5($b_ptr)
+___
+}
+$code.=<<___;
+	lea	$frame(%rsp), %r8	# size optimization
+	mov	8*0(%r8),%r15
+.cfi_restore	%r15
+	mov	8*1(%r8),%r14
+.cfi_restore	%r14
+	mov	8*2(%r8),%r13
+.cfi_restore	%r13
+	mov	8*3(%r8),%r12
+.cfi_restore	%r12
+	mov	8*4(%r8),%rbx
+.cfi_restore	%rbx
+	mov	8*5(%r8),%rbp
+.cfi_restore	%rbp
+	lea	8*6(%r8),%rsp
+.cfi_adjust_cfa_offset	-$frame-8*6
+.cfi_epilogue
+	ret
+.cfi_endproc
+.size	sqrx_mont_382x,.-sqrx_mont_382x
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/blst/asm/sha256-armv8.pl b/blst/asm/sha256-armv8.pl
new file mode 100755
index 0000000..1de27c7
--- /dev/null
+++ b/blst/asm/sha256-armv8.pl
@@ -0,0 +1,541 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# sha256_block procedure for ARMv8.
+#
+# This module is stripped of scalar code paths, with the rationale that
+# all known processors are NEON-capable.
+#
+# See original module at CRYPTOGAMS for further details.
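+#
+# The exported routine processes whole 64-byte blocks: informally,
+# blst_sha256_block_armv8(ctx, inp, num) updates the eight 32-bit state
+# words at ctx with num consecutive blocks read from inp. (This is a
+# reading of the register usage below, not a formal prototype.)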
+
+$flavour = shift;
+$output  = shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+        or die "can't call $xlate: $!";
+} else {
+    open STDOUT,">$output";
+}
+
+$BITS=256;
+$SZ=4;
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+$rounds=64;
+$reg_t="w";
+$pre="blst_";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+$code.=<<___;
+.text
+
+.align	6
+.type	.LK$BITS,%object
+.LK$BITS:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0	//terminator
+.size	.LK$BITS,.-.LK$BITS
+.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+.globl	${pre}sha256_block_armv8
+.type	${pre}sha256_block_armv8,%function
+.align	6
+${pre}sha256_block_armv8:
+.Lv8_entry:
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+
+	ld1.32		{$ABCD,$EFGH},[$ctx]
+	adr		$Ktbl,.LK256
+
+.Loop_hw:
+	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
+	sub		$num,$num,#1
+	ld1.32		{$W0},[$Ktbl],#16
+	rev32		@MSG[0],@MSG[0]
+	rev32		@MSG[1],@MSG[1]
+	rev32		@MSG[2],@MSG[2]
+	rev32		@MSG[3],@MSG[3]
+	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
+	orr		$EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	ld1.32		{$W0},[$Ktbl],#16
+	add.i32		$W1,$W1,@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	ld1.32		{$W1},[$Ktbl]
+	add.i32		$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	add.i32		$W1,$W1,@MSG[3]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	add.i32		$ABCD,$ABCD,$ABCD_SAVE
+	add.i32		$EFGH,$EFGH,$EFGH_SAVE
+
+	cbnz		$num,.Loop_hw
+
+	st1.32		{$ABCD,$EFGH},[$ctx]
+
+	ldr		x29,[sp],#16
+	ret
+.size	${pre}sha256_block_armv8,.-${pre}sha256_block_armv8
+___
+}
+
+if ($SZ==4) {	######################################### NEON stuff #
+# You'll surely note a lot of similarities with the sha256-armv4 module,
+# and of course it's not a coincidence. sha256-armv4 was used as
+# initial template, but was adapted for ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&mov		(&Dscalar($T7),&Dhi(@X[3]));	# X[14..15]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ushr_32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	&ushr_32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	&sli_32		($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ushr_32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&eor_8		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&sli_32		($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T4,$T7,$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_32	($T4,$T7,32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T5,$T7,$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T3,$T7,$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_u32	($T3,$T7,32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &eor_8	($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &eor_8	($T5,$T5,$T3);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T6,@X[0],$sigma1[0]);
+	 eval(shift(@insns));
+	  &ushr_32	($T7,@X[0],$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_32	($T6,@X[0],32-$sigma1[0]);
+	 eval(shift(@insns));
+	  &ushr_32	($T5,@X[0],$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &eor_8	($T7,$T7,$T6);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_32	($T5,@X[0],32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ld1_32		("{$T0}","[$Ktbl], #16");
+	 eval(shift(@insns));
+	  &eor_8	($T7,$T7,$T5);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&eor_8		($T5,$T5,$T5);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&mov		(&Dhi($T5), &Dlo($T7));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		(@X[0],@X[0],$T5);	# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		($T0,$T0,@X[0]);
+	 while($#insns>=1) { eval(shift(@insns)); }
+	&st1_32		("{$T0}","[$Xfer], #16");
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ld1_8		("{@X[0]}","[$inp],#16");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ld1_32		("{$T0}","[$Ktbl],#16");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&rev32		(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		($T0,$T0,@X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&st1_32		("{$T0}","[$Xfer], #16");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past
+	'&and	($t1,$f,$e)',
+	'&bic	($t4,$g,$e)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g)
+	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ror	($t0,$t0,"#$Sigma1[0]")',
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t0)',			# h+=Sigma1(e)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&ror	($t4,$t4,"#$Sigma0[0]")',
+	'&add	($d,$d,$h)',			# d+=h
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
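+# The scalar round above is returned as a list of perl snippets; Xupdate
+# and Xpreload eval() them one at a time, interleaving four integer rounds
+# with each step of the NEON message schedule.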
+
+$code.=<<___;
+.globl	${pre}sha256_block_data_order
+.type	${pre}sha256_block_data_order,%function
+.align	4
+${pre}sha256_block_data_order:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	sub	sp,sp,#16*4
+
+	adr	$Ktbl,.LK256
+	add	$num,$inp,$num,lsl#6	// len to point at the end of inp
+
+	ld1.8	{@X[0]},[$inp], #16
+	ld1.8	{@X[1]},[$inp], #16
+	ld1.8	{@X[2]},[$inp], #16
+	ld1.8	{@X[3]},[$inp], #16
+	ld1.32	{$T0},[$Ktbl], #16
+	ld1.32	{$T1},[$Ktbl], #16
+	ld1.32	{$T2},[$Ktbl], #16
+	ld1.32	{$T3},[$Ktbl], #16
+	rev32	@X[0],@X[0]		// yes, even on
+	rev32	@X[1],@X[1]		// big-endian
+	rev32	@X[2],@X[2]
+	rev32	@X[3],@X[3]
+	mov	$Xfer,sp
+	add.32	$T0,$T0,@X[0]
+	add.32	$T1,$T1,@X[1]
+	add.32	$T2,$T2,@X[2]
+	st1.32	{$T0-$T1},[$Xfer], #32
+	add.32	$T3,$T3,@X[3]
+	st1.32	{$T2-$T3},[$Xfer]
+	sub	$Xfer,$Xfer,#32
+
+	ldp	$A,$B,[$ctx]
+	ldp	$C,$D,[$ctx,#8]
+	ldp	$E,$F,[$ctx,#16]
+	ldp	$G,$H,[$ctx,#24]
+	ldr	$t1,[sp,#0]
+	mov	$t2,wzr
+	eor	$t3,$B,$C
+	mov	$t4,wzr
+	b	.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	cmp	$t1,#0				// check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	sub	$Ktbl,$Ktbl,#256		// rewind $Ktbl
+	cmp	$inp,$num
+	mov	$Xfer, #64
+	csel	$Xfer, $Xfer, xzr, eq
+	sub	$inp,$inp,$Xfer			// avoid SEGV
+	mov	$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	add	$A,$A,$t4			// h+=Sigma0(a) from the past
+	ldp	$t0,$t1,[$ctx,#0]
+	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past
+	ldp	$t2,$t3,[$ctx,#8]
+	add	$A,$A,$t0			// accumulate
+	add	$B,$B,$t1
+	ldp	$t0,$t1,[$ctx,#16]
+	add	$C,$C,$t2
+	add	$D,$D,$t3
+	ldp	$t2,$t3,[$ctx,#24]
+	add	$E,$E,$t0
+	add	$F,$F,$t1
+	 ldr	$t1,[sp,#0]
+	stp	$A,$B,[$ctx,#0]
+	add	$G,$G,$t2
+	 mov	$t2,wzr
+	stp	$C,$D,[$ctx,#8]
+	add	$H,$H,$t3
+	stp	$E,$F,[$ctx,#16]
+	 eor	$t3,$B,$C
+	stp	$G,$H,[$ctx,#24]
+	 mov	$t4,wzr
+	 mov	$Xfer,sp
+	b.ne	.L_00_48
+
+	ldr	x29,[x29]
+	add	sp,sp,#16*4+16
+	ret
+.size	${pre}sha256_block_data_order,.-${pre}sha256_block_data_order
+___
+}
+
+{
+my ($out,$inp,$len) = map("x$_",(0..2));
+
+$code.=<<___;
+.globl	${pre}sha256_emit
+.hidden	${pre}sha256_emit
+.type	${pre}sha256_emit,%function
+.align	4
+${pre}sha256_emit:
+	ldp	x4,x5,[$inp]
+	ldp	x6,x7,[$inp,#16]
+#ifndef	__AARCH64EB__
+	rev	x4,x4
+	rev	x5,x5
+	rev	x6,x6
+	rev	x7,x7
+#endif
+	str	w4,[$out,#4]
+	lsr	x4,x4,#32
+	str	w5,[$out,#12]
+	lsr	x5,x5,#32
+	str	w6,[$out,#20]
+	lsr	x6,x6,#32
+	str	w7,[$out,#28]
+	lsr	x7,x7,#32
+	str	w4,[$out,#0]
+	str	w5,[$out,#8]
+	str	w6,[$out,#16]
+	str	w7,[$out,#24]
+	ret
+.size	${pre}sha256_emit,.-${pre}sha256_emit
+
+.globl	${pre}sha256_bcopy
+.hidden	${pre}sha256_bcopy
+.type	${pre}sha256_bcopy,%function
+.align	4
+${pre}sha256_bcopy:
+.Loop_bcopy:
+	ldrb	w3,[$inp],#1
+	sub	$len,$len,#1
+	strb	w3,[$out],#1
+	cbnz	$len,.Loop_bcopy
+	ret
+.size	${pre}sha256_bcopy,.-${pre}sha256_bcopy
+
+.globl	${pre}sha256_hcopy
+.hidden	${pre}sha256_hcopy
+.type	${pre}sha256_hcopy,%function
+.align	4
+${pre}sha256_hcopy:
+	ldp	x4,x5,[$inp]
+	ldp	x6,x7,[$inp,#16]
+	stp	x4,x5,[$out]
+	stp	x6,x7,[$out,#16]
+	ret
+.size	${pre}sha256_hcopy,.-${pre}sha256_hcopy
+___
+}
+
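+# The sha256* crypto-extension mnemonics are not handed to the assembler
+# directly; unsha256() below packs Rd, Rn and (optionally) Rm into bits
+# 0-4, 5-9 and 16-20 of the base opcode and emits the instruction as a raw
+# .inst word, so the output assembles even with toolchains that predate
+# the ARMv8 crypto extension.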
+{   my  %opcode = (
+	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
+	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+			$mnemonic,$arg;
+    }
+}
+
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge	or
+	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
+
+	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
+
+	s/\.[ui]?8(\s)/$1/;
+	s/\.\w?64\b//		and s/\.16b/\.2d/g	or
+	s/\.\w?32\b//		and s/\.16b/\.4s/g;
+	m/\bext\b/		and s/\.2d/\.16b/g	or
+	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/blst/asm/sha256-portable-x86_64.pl b/blst/asm/sha256-portable-x86_64.pl
new file mode 100755
index 0000000..eca0564
--- /dev/null
+++ b/blst/asm/sha256-portable-x86_64.pl
@@ -0,0 +1,337 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# sha256_block procedure for x86_64.
+#
+# Scalar-only version with minor twist minimizing 'lea' instructions.
+
+$flavour = shift;
+$output  = pop;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+$pre="blst_";
+$func="${pre}sha256_block_data_order";
+$TABLE="K256";
+$SZ=4;
+@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
+				"%r8d","%r9d","%r10d","%r11d");
+($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+$rounds=64;
+
+$ctx="%rdi";	# 1st arg, zapped by $a3
+$inp="%rsi";	# 2nd arg
+$Tbl="%rbp";
+
+$_ctx="16*$SZ+0*8(%rsp)";
+$_inp="16*$SZ+1*8(%rsp)";
+$_end="16*$SZ+2*8(%rsp)";
+$framesz="16*$SZ+3*8";
+
+sub ROUND_00_15()
+{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+  my $STRIDE=$SZ;
+  #   $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
+
+$code.=<<___;
+	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
+	mov	$f,$a2
+
+	xor	$e,$a0
+	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
+	xor	$g,$a2			# f^g
+
+	mov	$T1,`$SZ*($i&0xf)`(%rsp)
+	xor	$a,$a1
+	and	$e,$a2			# (f^g)&e
+
+	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
+	add	$h,$T1			# T1+=h
+	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
+
+	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
+	xor	$e,$a0
+	add	$a2,$T1			# T1+=Ch(e,f,g)
+
+	mov	$a,$a2
+	add	`$SZ*$i`($Tbl),$T1	# T1+=K[round]
+	xor	$a,$a1
+
+	xor	$b,$a2			# a^b, b^c in next round
+	ror	\$$Sigma1[0],$a0	# Sigma1(e)
+	mov	$b,$h
+
+	and	$a2,$a3
+	ror	\$$Sigma0[0],$a1	# Sigma0(a)
+	add	$a0,$T1			# T1+=Sigma1(e)
+
+	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
+	add	$T1,$d			# d+=T1
+	add	$T1,$h			# h+=T1
+___
+$code.=<<___ if ($i==31);
+	lea	`16*$SZ`($Tbl),$Tbl	# round+=16
+___
+$code.=<<___ if ($i<15);
+	add	$a1,$h			# h+=Sigma0(a)
+___
+	($a2,$a3) = ($a3,$a2);
+}
+
+sub ROUND_16_XX()
+{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
+	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
+
+	mov	$a0,$T1
+	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
+	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
+	mov	$a2,$a1
+	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
+
+	xor	$T1,$a0
+	shr	\$$sigma0[2],$T1
+	ror	\$$sigma0[0],$a0
+	xor	$a1,$a2
+	shr	\$$sigma1[2],$a1
+
+	ror	\$$sigma1[0],$a2
+	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
+	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
+	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
+
+	add	`$SZ*($i&0xf)`(%rsp),$T1
+	mov	$e,$a0
+	add	$a2,$T1
+	mov	$a,$a1
+___
+	&ROUND_00_15(@_);
+}
+
+$code=<<___;
+.text
+
+.globl	$func
+.type	$func,\@function,3,"unwind"
+.align	16
+$func:
+.cfi_startproc
+	push	%rbx
+.cfi_push	%rbx
+	push	%rbp
+.cfi_push	%rbp
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	shl	\$4,%rdx		# num*16
+	sub	\$$framesz,%rsp
+.cfi_adjust_cfa_offset	$framesz
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	mov	$ctx,$_ctx		# save ctx, 1st arg
+	mov	$inp,$_inp		# save inp, 2nd arg
+	mov	%rdx,$_end		# save end pointer, "3rd" arg
+.cfi_end_prologue
+
+	mov	$SZ*0($ctx),$A
+	mov	$SZ*1($ctx),$B
+	mov	$SZ*2($ctx),$C
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	mov	$B,$a3
+	lea	$TABLE(%rip),$Tbl
+	xor	$C,$a3			# magic
+___
+	for($i=0;$i<16;$i++) {
+		$code.="	mov	$SZ*$i($inp),$T1\n";
+		$code.="	mov	@ROT[4],$a0\n";
+		$code.="	mov	@ROT[0],$a1\n";
+		$code.="	bswap	$T1\n";
+		&ROUND_00_15($i,@ROT);
+		unshift(@ROT,pop(@ROT));
+	}
+$code.=<<___;
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+___
+	for(;$i<32;$i++) {
+		&ROUND_16_XX($i,@ROT);
+		unshift(@ROT,pop(@ROT));
+	}
+
+$code.=<<___;
+	cmpb	\$0x19,`$SZ-1`($Tbl)
+	jnz	.Lrounds_16_xx
+
+	mov	$_ctx,$ctx
+	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
+	lea	16*$SZ($inp),$inp
+
+	add	$SZ*0($ctx),$A
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+	jb	.Lloop
+
+	lea	$framesz+6*8(%rsp),%r11
+.cfi_def_cfa	%r11,8
+	mov	$framesz(%rsp),%r15
+.cfi_restore	%r15
+	mov	-40(%r11),%r14
+.cfi_restore	%r14
+	mov	-32(%r11),%r13
+.cfi_restore	%r13
+	mov	-24(%r11),%r12
+.cfi_restore	%r12
+	mov	-16(%r11),%rbp
+.cfi_restore	%rbp
+	mov	-8(%r11),%rbx
+.cfi_restore	%rbx
+.cfi_epilogue
+	lea	(%r11),%rsp
+	ret
+.cfi_endproc
+.size	$func,.-$func
+
+.align	64
+.type	$TABLE,\@object
+$TABLE:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
+___
+{
+my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") :  # Win64 order
+                               ("%rdi","%rsi","%rdx");  # Unix order
+$code.=<<___;
+.globl	${pre}sha256_emit
+.hidden	${pre}sha256_emit
+.type	${pre}sha256_emit,\@abi-omnipotent
+.align	16
+${pre}sha256_emit:
+	mov	0($inp), %r8
+	mov	8($inp), %r9
+	mov	16($inp), %r10
+	bswap	%r8
+	mov	24($inp), %r11
+	bswap	%r9
+	mov	%r8d, 4($out)
+	bswap	%r10
+	mov	%r9d, 12($out)
+	bswap	%r11
+	mov	%r10d, 20($out)
+	shr	\$32, %r8
+	mov	%r11d, 28($out)
+	shr	\$32, %r9
+	mov	%r8d, 0($out)
+	shr	\$32, %r10
+	mov	%r9d, 8($out)
+	shr	\$32, %r11
+	mov	%r10d, 16($out)
+	mov	%r11d, 24($out)
+	ret
+.size	${pre}sha256_emit,.-${pre}sha256_emit
+
+.globl	${pre}sha256_bcopy
+.hidden	${pre}sha256_bcopy
+.type	${pre}sha256_bcopy,\@abi-omnipotent
+.align	16
+${pre}sha256_bcopy:
+	sub	$inp, $out
+.Loop_bcopy:
+	movzb	($inp), %eax
+	lea	1($inp), $inp
+	mov	%al, -1($out,$inp)
+	dec	$len
+	jnz	.Loop_bcopy
+	ret
+.size	${pre}sha256_bcopy,.-${pre}sha256_bcopy
+
+.globl	${pre}sha256_hcopy
+.hidden	${pre}sha256_hcopy
+.type	${pre}sha256_hcopy,\@abi-omnipotent
+.align	16
+${pre}sha256_hcopy:
+	mov	0($inp), %r8
+	mov	8($inp), %r9
+	mov	16($inp), %r10
+	mov	24($inp), %r11
+	mov	%r8, 0($out)
+	mov	%r9, 8($out)
+	mov	%r10, 16($out)
+	mov	%r11, 24($out)
+	ret
+.size	${pre}sha256_hcopy,.-${pre}sha256_hcopy
+___
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/blst/asm/sha256-x86_64.pl b/blst/asm/sha256-x86_64.pl
new file mode 100755
index 0000000..22b3763
--- /dev/null
+++ b/blst/asm/sha256-x86_64.pl
@@ -0,0 +1,789 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# sha256_block procedure for x86_64.
+#
+# This module is stripped of AVX and even scalar code paths, with the
+# rationale that
+#
+# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one*
+#    processor, venerable Sandy Bridge;
+# b) AVX2 incurs costly power transitions, which would be justifiable
+#    if AVX2 code was executing most of the time, which is not the
+#    case in the context;
+# c) all contemporary processors support SSSE3, so that nobody would
+#    actually use scalar code path anyway;
+#
+# See original module at CRYPTOGAMS for further details.
+
+$flavour = shift;
+$output  = pop;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+$pre="blst_";
+$func="${pre}sha256_block_data_order";
+$TABLE="K256";
+$SZ=4;
+@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
+				"%r8d","%r9d","%r10d","%r11d");
+($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+$rounds=64;
+
+$ctx="%rdi";	# 1st arg, zapped by $a3
+$inp="%rsi";	# 2nd arg
+$Tbl="%rbp";
+
+$_ctx="16*$SZ+0*8(%rsp)";
+$_inp="16*$SZ+1*8(%rsp)";
+$_end="16*$SZ+2*8(%rsp)";
+$framesz="16*$SZ+3*8";
+
+$code=<<___;
+.text
+
+.align	64
+.type	$TABLE,\@object
+$TABLE:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
+___
+
+######################################################################
+# SIMD code paths
+#
+{{{
+######################################################################
+# Intel SHA Extensions implementation of SHA256 update function.
+#
+my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
+
+my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
+my @MSG=map("%xmm$_",(3..6));
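+# Note that $Wi is pinned to %xmm0 on purpose: sha256rnds2 reads its two
+# rounds' worth of message+constant words from %xmm0 implicitly, and the
+# "pshufd \$0x0e,$Wi,$Wi" between each pair of sha256rnds2 instructions
+# moves the upper two words down for the second issue.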
+
+$code.=<<___;
+.globl	${pre}sha256_block_data_order_shaext
+.hidden	${pre}sha256_block_data_order_shaext
+.type	${pre}sha256_block_data_order_shaext,\@function,3,"unwind"
+.align	64
+${pre}sha256_block_data_order_shaext:
+.cfi_startproc
+___
+$code.=<<___ if ($win64);
+	sub	\$0x58,%rsp
+.cfi_adjust_cfa_offset	0x58
+	movaps	%xmm6,-0x58(%r11)
+.cfi_offset	%xmm6,-0x60
+	movaps	%xmm7,-0x48(%r11)
+.cfi_offset	%xmm7,-0x50
+	movaps	%xmm8,-0x38(%r11)
+.cfi_offset	%xmm8,-0x40
+	movaps	%xmm9,-0x28(%r11)
+.cfi_offset	%xmm9,-0x30
+	movaps	%xmm10,-0x18(%r11)
+.cfi_offset	%xmm10,-0x20
+.cfi_end_prologue
+___
+$code.=<<___;
+	lea		K256+0x80(%rip),$Tbl
+	movdqu		($ctx),$ABEF		# DCBA
+	movdqu		16($ctx),$CDGH		# HGFE
+	movdqa		0x100-0x80($Tbl),$TMP	# byte swap mask
+
+	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
+	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
+	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
+	movdqa		$TMP,$BSWAP		# offload
+	palignr		\$8,$CDGH,$ABEF		# ABEF
+	punpcklqdq	$Wi,$CDGH		# CDGH
+	jmp		.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu		($inp),@MSG[0]
+	movdqu		0x10($inp),@MSG[1]
+	movdqu		0x20($inp),@MSG[2]
+	pshufb		$TMP,@MSG[0]
+	movdqu		0x30($inp),@MSG[3]
+
+	movdqa		0*16-0x80($Tbl),$Wi
+	paddd		@MSG[0],$Wi
+	pshufb		$TMP,@MSG[1]
+	movdqa		$CDGH,$CDGH_SAVE	# offload
+	sha256rnds2	$ABEF,$CDGH		# 0-3
+	pshufd		\$0x0e,$Wi,$Wi
+	nop
+	movdqa		$ABEF,$ABEF_SAVE	# offload
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		1*16-0x80($Tbl),$Wi
+	paddd		@MSG[1],$Wi
+	pshufb		$TMP,@MSG[2]
+	sha256rnds2	$ABEF,$CDGH		# 4-7
+	pshufd		\$0x0e,$Wi,$Wi
+	lea		0x40($inp),$inp
+	sha256msg1	@MSG[1],@MSG[0]
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		2*16-0x80($Tbl),$Wi
+	paddd		@MSG[2],$Wi
+	pshufb		$TMP,@MSG[3]
+	sha256rnds2	$ABEF,$CDGH		# 8-11
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[3],$TMP
+	palignr		\$4,@MSG[2],$TMP
+	nop
+	paddd		$TMP,@MSG[0]
+	sha256msg1	@MSG[2],@MSG[1]
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		3*16-0x80($Tbl),$Wi
+	paddd		@MSG[3],$Wi
+	sha256msg2	@MSG[3],@MSG[0]
+	sha256rnds2	$ABEF,$CDGH		# 12-15
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[0],$TMP
+	palignr		\$4,@MSG[3],$TMP
+	nop
+	paddd		$TMP,@MSG[1]
+	sha256msg1	@MSG[3],@MSG[2]
+	sha256rnds2	$CDGH,$ABEF
+___
+for($i=4;$i<16-3;$i++) {
+$code.=<<___;
+	movdqa		$i*16-0x80($Tbl),$Wi
+	paddd		@MSG[0],$Wi
+	sha256msg2	@MSG[0],@MSG[1]
+	sha256rnds2	$ABEF,$CDGH		# 16-19...
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[1],$TMP
+	palignr		\$4,@MSG[0],$TMP
+	nop
+	paddd		$TMP,@MSG[2]
+	sha256msg1	@MSG[0],@MSG[3]
+	sha256rnds2	$CDGH,$ABEF
+___
+	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	movdqa		13*16-0x80($Tbl),$Wi
+	paddd		@MSG[0],$Wi
+	sha256msg2	@MSG[0],@MSG[1]
+	sha256rnds2	$ABEF,$CDGH		# 52-55
+	pshufd		\$0x0e,$Wi,$Wi
+	movdqa		@MSG[1],$TMP
+	palignr		\$4,@MSG[0],$TMP
+	sha256rnds2	$CDGH,$ABEF
+	paddd		$TMP,@MSG[2]
+
+	movdqa		14*16-0x80($Tbl),$Wi
+	paddd		@MSG[1],$Wi
+	sha256rnds2	$ABEF,$CDGH		# 56-59
+	pshufd		\$0x0e,$Wi,$Wi
+	sha256msg2	@MSG[1],@MSG[2]
+	movdqa		$BSWAP,$TMP
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa		15*16-0x80($Tbl),$Wi
+	paddd		@MSG[2],$Wi
+	nop
+	sha256rnds2	$ABEF,$CDGH		# 60-63
+	pshufd		\$0x0e,$Wi,$Wi
+	dec		$num
+	nop
+	sha256rnds2	$CDGH,$ABEF
+
+	paddd		$CDGH_SAVE,$CDGH
+	paddd		$ABEF_SAVE,$ABEF
+	jnz		.Loop_shaext
+
+	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
+	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
+	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
+	punpckhqdq	$CDGH,$ABEF		# DCBA
+	palignr		\$8,$TMP,$CDGH		# HGFE
+
+	movdqu	$ABEF,($ctx)
+	movdqu	$CDGH,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	-0x58(%r11),%xmm6
+	movaps	-0x48(%r11),%xmm7
+	movaps	-0x38(%r11),%xmm8
+	movaps	-0x28(%r11),%xmm9
+	movaps	-0x18(%r11),%xmm10
+	mov	%r11,%rsp
+.cfi_def_cfa	%r11,8
+.cfi_epilogue
+___
+$code.=<<___;
+	ret
+.cfi_endproc
+.size	${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext
+___
+}}}
+{{{
+
+my $a4=$T1;
+my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
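+# e.g. &ror($a0,$Sigma1[0]) lands here and appends "ror	$6,%r13d" to $code:
+# a trailing numeric argument becomes an immediate and the operand list is
+# reversed into AT&T source,destination order.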
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
+	'&mov	($a,$a1)',
+	'&mov	($a4,$f)',
+
+	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
+	'&xor	($a0,$e)',
+	'&xor	($a4,$g)',			# f^g
+
+	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
+	'&xor	($a1,$a)',
+	'&and	($a4,$e)',			# (f^g)&e
+
+	'&xor	($a0,$e)',
+	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
+	'&mov	($a2,$a)',
+
+	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
+	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
+	'&xor	($a2,$b)',			# a^b, b^c in next round
+
+	'&add	($h,$a4)',			# h+=Ch(e,f,g)
+	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
+	'&and	($a3,$a2)',			# (b^c)&(a^b)
+
+	'&xor	($a1,$a)',
+	'&add	($h,$a0)',			# h+=Sigma1(e)
+	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
+
+	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
+	'&add	($d,$h)',			# d+=h
+	'&add	($h,$a3)',			# h+=Maj(a,b,c)
+
+	'&mov	($a0,$d)',
+	'&add	($a1,$h);'.			# h+=Sigma0(a)
+	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+	);
+}
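+# The round above is returned as a list of perl snippets; SSSE3_256_00_47
+# below eval()s them one at a time, interleaving four scalar rounds with
+# each 16-byte step of the SIMD message schedule.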
+
+######################################################################
+# SSSE3 code path
+#
+{
+my $Tbl = $inp;
+my $_ctx="0(%rbp)";
+my $_inp="8(%rbp)";
+my $_end="16(%rbp)";
+my $framesz=4*8+$win64*16*4+8;
+
+my @X = map("%xmm$_",(0..3));
+my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
+
+$code.=<<___;
+.globl	${func}
+.hidden	${func}
+.type	${func},\@function,3,"unwind"
+.align	64
+${func}:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	shl	\$4,%rdx		# num*16
+	sub	\$$framesz,%rsp
+.cfi_adjust_cfa_offset	$framesz
+	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
+	mov	$ctx,0(%rsp)		# save ctx, 1st arg
+	#mov	$inp,8(%rsp)		# save inp, 2nd arg
+	mov	%rdx,16(%rsp)		# save end pointer, "3rd" arg
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,0x20(%rsp)
+.cfi_offset	%xmm6,-0x78
+	movaps	%xmm7,0x30(%rsp)
+.cfi_offset	%xmm7,-0x68
+	movaps	%xmm8,0x40(%rsp)
+.cfi_offset	%xmm8,-0x58
+	movaps	%xmm9,0x50(%rsp)
+.cfi_offset	%xmm9,-0x48
+___
+$code.=<<___;
+	mov	%rsp,%rbp
+.cfi_def_cfa_register	%rbp
+.cfi_end_prologue
+
+	lea	-16*$SZ(%rsp),%rsp
+	mov	$SZ*0($ctx),$A
+	and	\$-64,%rsp		# align stack
+	mov	$SZ*1($ctx),$B
+	mov	$SZ*2($ctx),$C
+	mov	$SZ*3($ctx),$D
+	mov	$SZ*4($ctx),$E
+	mov	$SZ*5($ctx),$F
+	mov	$SZ*6($ctx),$G
+	mov	$SZ*7($ctx),$H
+___
+
+$code.=<<___;
+	#movdqa	$TABLE+`$SZ*$rounds`+32(%rip),$t4
+	#movdqa	$TABLE+`$SZ*$rounds`+64(%rip),$t5
+	jmp	.Lloop_ssse3
+.align	16
+.Lloop_ssse3:
+	movdqa	$TABLE+`$SZ*$rounds`(%rip),$t3
+	mov	$inp,$_inp		# offload $inp
+	movdqu	0x00($inp),@X[0]
+	movdqu	0x10($inp),@X[1]
+	movdqu	0x20($inp),@X[2]
+	pshufb	$t3,@X[0]
+	movdqu	0x30($inp),@X[3]
+	lea	$TABLE(%rip),$Tbl
+	pshufb	$t3,@X[1]
+	movdqa	0x00($Tbl),$t0
+	movdqa	0x10($Tbl),$t1
+	pshufb	$t3,@X[2]
+	paddd	@X[0],$t0
+	movdqa	0x20($Tbl),$t2
+	pshufb	$t3,@X[3]
+	movdqa	0x30($Tbl),$t3
+	paddd	@X[1],$t1
+	paddd	@X[2],$t2
+	paddd	@X[3],$t3
+	movdqa	$t0,0x00(%rsp)
+	mov	$A,$a1
+	movdqa	$t1,0x10(%rsp)
+	mov	$B,$a3
+	movdqa	$t2,0x20(%rsp)
+	xor	$C,$a3			# magic
+	movdqa	$t3,0x30(%rsp)
+	mov	$E,$a0
+	jmp	.Lssse3_00_47
+
+.align	16
+.Lssse3_00_47:
+	sub	\$`-16*$SZ`,$Tbl	# size optimization
+___
+sub Xupdate_256_SSSE3 () {
+	(
+	'&movdqa	($t0,@X[1]);',
+	'&movdqa	($t3,@X[3])',
+	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
+	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
+	'&movdqa	($t1,$t0)',
+	'&movdqa	($t2,$t0);',
+	'&psrld		($t0,$sigma0[2])',
+	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
+	'&psrld		($t2,$sigma0[0])',
+	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
+	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
+	'&pxor		($t0,$t2)',
+	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
+	'&pxor		($t0,$t1)',
+	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
+	'&pxor		($t0,$t2);',
+	 '&movdqa	($t2,$t3)',
+	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
+	 '&psrld	($t3,$sigma1[2])',
+	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
+	 '&psrlq	($t2,$sigma1[0])',
+	 '&pxor		($t3,$t2);',
+	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
+	 '&pxor		($t3,$t2)',
+	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
+	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
+	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
+	 '&movdqa	($t2,$t3);',
+	 '&psrld	($t3,$sigma1[2])',
+	 '&psrlq	($t2,$sigma1[0])',
+	 '&pxor		($t3,$t2);',
+	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
+	 '&pxor		($t3,$t2);',
+	'&movdqa	($t2,16*$j."($Tbl)")',
+	 '&pshufb	($t3,$t5)',
+	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
+	);
+}
+
+sub SSSE3_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
+
+    if (0) {
+	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
+	    eval;
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	    eval(shift(@insns));
+	}
+    } else {			# squeeze extra 4% on Westmere and 19% on Atom
+	  eval(shift(@insns));	#@
+	&movdqa		($t0,@X[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t3,@X[3]);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	&palignr	($t0,@X[0],$SZ);	# X[1..4]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&movdqa		($t1,$t0);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t2,$t0);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	&psrld		($t0,$sigma0[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	&psrld		($t2,$sigma0[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&pslld		($t1,8*$SZ-$sigma0[1]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pxor		($t0,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&psrld		($t2,$sigma0[1]-$sigma0[0]);
+	  eval(shift(@insns));
+	&pxor		($t0,$t1);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pslld		($t1,$sigma0[1]-$sigma0[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pxor		($t0,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 &movdqa	($t2,$t3);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&pxor		($t0,$t1);		# sigma0(X[1..4])
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrld		($t3,$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &psrlq		($t2,$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
+	 &pshufd	($t3,$t3,0b10000000);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrldq	($t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &movdqa	($t2,$t3);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &psrld		($t3,$sigma1[2]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 &psrlq		($t2,$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	 &pxor		($t3,$t2);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));	#@
+	 #&pshufb	($t3,$t5);
+	 &pshufd	($t3,$t3,0b00001000);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&movdqa		($t2,16*$j."($Tbl)");
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	 &pslldq	($t3,8);
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
+	  eval(shift(@insns));	#@
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+    }
+	&paddd		($t2,@X[0]);
+	  foreach (@insns) { eval; }		# remaining instructions
+	&movdqa		(16*$j."(%rsp)",$t2);
+}
+
+    for ($i=0,$j=0; $j<4; $j++) {
+	&SSSE3_256_00_47($j,\&body_00_15,@X);
+	push(@X,shift(@X));			# rotate(@X)
+    }
+	&cmpb	($SZ-1+16*$SZ."($Tbl)",0);
+	&jne	(".Lssse3_00_47");
+
+    for ($i=0; $i<16; ) {
+	foreach(body_00_15()) { eval; }
+    }
+$code.=<<___;
+	mov	$_ctx,$ctx
+	mov	$a1,$A
+	mov	$_inp,$inp
+
+	add	$SZ*0($ctx),$A
+	add	$SZ*1($ctx),$B
+	add	$SZ*2($ctx),$C
+	add	$SZ*3($ctx),$D
+	add	$SZ*4($ctx),$E
+	add	$SZ*5($ctx),$F
+	add	$SZ*6($ctx),$G
+	add	$SZ*7($ctx),$H
+
+	lea	16*$SZ($inp),$inp
+	cmp	$_end,$inp
+
+	mov	$A,$SZ*0($ctx)
+	mov	$B,$SZ*1($ctx)
+	mov	$C,$SZ*2($ctx)
+	mov	$D,$SZ*3($ctx)
+	mov	$E,$SZ*4($ctx)
+	mov	$F,$SZ*5($ctx)
+	mov	$G,$SZ*6($ctx)
+	mov	$H,$SZ*7($ctx)
+	jb	.Lloop_ssse3
+
+	xorps	%xmm0, %xmm0
+	lea	$framesz+6*8(%rbp),%r11
+.cfi_def_cfa	%r11,8
+	movaps	%xmm0, 0x00(%rsp)	# scrub the stack
+	movaps	%xmm0, 0x10(%rsp)
+	movaps	%xmm0, 0x20(%rsp)
+	movaps	%xmm0, 0x30(%rsp)
+___
+$code.=<<___ if ($win64);
+	movaps	0x20(%rbp),%xmm6
+	movaps	0x30(%rbp),%xmm7
+	movaps	0x40(%rbp),%xmm8
+	movaps	0x50(%rbp),%xmm9
+___
+$code.=<<___;
+	mov	$framesz(%rbp),%r15
+.cfi_restore	%r15
+	mov	-40(%r11),%r14
+.cfi_restore	%r14
+	mov	-32(%r11),%r13
+.cfi_restore	%r13
+	mov	-24(%r11),%r12
+.cfi_restore	%r12
+	mov	-16(%r11),%rbx
+.cfi_restore	%rbx
+	mov	-8(%r11),%rbp
+.cfi_restore	%rbp
+.cfi_epilogue
+	lea	(%r11),%rsp
+	ret
+.cfi_endproc
+.size	${func},.-${func}
+___
+}
+}}}
+{
+my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") :  # Win64 order
+                               ("%rdi","%rsi","%rdx");  # Unix order
+$code.=<<___;
+.globl	${pre}sha256_emit
+.hidden	${pre}sha256_emit
+.type	${pre}sha256_emit,\@abi-omnipotent
+.align	16
+${pre}sha256_emit:
+	mov	0($inp), %r8
+	mov	8($inp), %r9
+	mov	16($inp), %r10
+	bswap	%r8
+	mov	24($inp), %r11
+	bswap	%r9
+	mov	%r8d, 4($out)
+	bswap	%r10
+	mov	%r9d, 12($out)
+	bswap	%r11
+	mov	%r10d, 20($out)
+	shr	\$32, %r8
+	mov	%r11d, 28($out)
+	shr	\$32, %r9
+	mov	%r8d, 0($out)
+	shr	\$32, %r10
+	mov	%r9d, 8($out)
+	shr	\$32, %r11
+	mov	%r10d, 16($out)
+	mov	%r11d, 24($out)
+	ret
+.size	${pre}sha256_emit,.-${pre}sha256_emit
+
+.globl	${pre}sha256_bcopy
+.hidden	${pre}sha256_bcopy
+.type	${pre}sha256_bcopy,\@abi-omnipotent
+.align	16
+${pre}sha256_bcopy:
+	sub	$inp, $out
+.Loop_bcopy:
+	movzb	($inp), %eax
+	lea	1($inp), $inp
+	mov	%al, -1($out,$inp)
+	dec	$len
+	jnz	.Loop_bcopy
+	ret
+.size	${pre}sha256_bcopy,.-${pre}sha256_bcopy
+
+.globl	${pre}sha256_hcopy
+.hidden	${pre}sha256_hcopy
+.type	${pre}sha256_hcopy,\@abi-omnipotent
+.align	16
+${pre}sha256_hcopy:
+	mov	0($inp), %r8
+	mov	8($inp), %r9
+	mov	16($inp), %r10
+	mov	24($inp), %r11
+	mov	%r8, 0($out)
+	mov	%r9, 8($out)
+	mov	%r10, 16($out)
+	mov	%r11, 24($out)
+	ret
+.size	${pre}sha256_hcopy,.-${pre}sha256_hcopy
+___
+}
+
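+# sha256op38() hand-encodes the SHA-NI instructions as .byte sequences
+# (0x0f,0x38,<opcode> plus a ModR/M byte built from the two xmm operands),
+# so the module assembles even with toolchains that do not recognize the
+# sha256rnds2/sha256msg1/sha256msg2 mnemonics.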
+sub sha256op38 {
+    my $instr = shift;
+    my %opcodelet = (
+		"sha256rnds2" => 0xcb,
+  		"sha256msg1"  => 0xcc,
+		"sha256msg2"  => 0xcd	);
+
+    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
+      my @opcode=(0x0f,0x38);
+	push @opcode,$opcodelet{$instr};
+	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
+	return ".byte\t".join(',',@opcode);
+    } else {
+	return $instr."\t".@_[0];
+    }
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/blst/asm/x86_64-xlate.pl b/blst/asm/x86_64-xlate.pl
new file mode 100755
index 0000000..62be619
--- /dev/null
+++ b/blst/asm/x86_64-xlate.pl
@@ -0,0 +1,1781 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm.
+#
+# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T
+# format is way easier to parse. Because it's simpler to "gear" from
+# Unix ABI to Windows one [see cross-reference "card" at the end of
+# file]. Because Linux targets were available first...
+#
+# In addition the script also "distills" code suitable for GNU
+# assembler, so that it can be compiled with more rigid assemblers,
+# such as Solaris /usr/ccs/bin/as.
+#
+# This translator is not designed to convert *arbitrary* assembler
+# code from AT&T format to MASM one. It's designed to convert just
+# enough to provide for dual-ABI OpenSSL modules development...
+# There *are* limitations and you might have to modify your assembler
+# code or this script to achieve the desired result...
+#
+# Currently recognized limitations:
+#
+# - can't use multiple ops per line;
+#
+# Dual-ABI styling rules.
+#
+# 1. Adhere to Unix register and stack layout [see cross-reference
+#    ABI "card" at the end for explanation].
+# 2. Forget about "red zone," stick to more traditional blended
+#    stack frame allocation. If volatile storage is actually required
+#    that is. If not, just leave the stack as is.
+# 3. Functions tagged with ".type name,@function" get crafted with
+#    unified Win64 prologue and epilogue automatically. If you want
+#    to take care of ABI differences yourself, tag functions as
+#    ".type name,@abi-omnipotent" instead.
+# 4. To optimize the Win64 prologue you can specify number of input
+#    arguments as ".type name,@function,N." Keep in mind that if N is
+#    larger than 6, then you *have to* write "abi-omnipotent" code,
+#    because >6 cases can't be addressed with unified prologue.
+# 5. Name local labels as .L*, do *not* use dynamic labels such as 1:
+#    (sorry about latter).
+# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is
+#    required to identify the spots, where to inject Win64 epilogue!
+#    But on the pros, it's then prefixed with rep automatically:-)
+# 7. Stick to explicit ip-relative addressing. If you have to use
+#    GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??.
+#    Both are recognized and translated to proper Win64 addressing
+#    modes.
+#
+# 8. In order to provide for structured exception handling unified
+#    Win64 prologue copies %rsp value to %rax. [Unless function is
+#    tagged with additional .type tag.] For further details see SEH
+#    paragraph at the end.
+# 9. .init segment is allowed to contain calls to functions only.
+# a. If function accepts more than 4 arguments *and* >4th argument
+#    is declared as non 64-bit value, do clear its upper part.
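+#
+# As a purely illustrative sketch of rules 3 and 4 (the name "foo" is
+# made up), a routine taking two arguments would be declared as
+#
+#	.globl	foo
+#	.type	foo,@function,2
+#	.align	16
+#	foo:
+#		...
+#		ret
+#	.size	foo,.-foo
+#
+# and the translator supplies the matching Win64 prologue/epilogue; see
+# the sha256 modules in this directory for real ".type ...,@function" and
+# ".type ...,@abi-omnipotent" uses.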
+
+
+use strict;
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+open STDOUT,">$output" || die "can't open $output: $!"
+	if (defined($output));
+
+my $gas=1;	$gas=0 if ($output =~ /\.asm$/);
+my $elf=1;	$elf=0 if (!$gas);
+my $dwarf=$elf;
+my $win64=0;
+my $prefix="";
+my $decor=".L";
+
+my $masmref=8 + 50727*2**-32;	# 8.00.50727 shipped with VS2005
+my $masm=0;
+my $PTR=" PTR";
+
+my $nasmref=2.03;
+my $nasm=0;
+
+if    ($flavour eq "mingw64")	{ $gas=1; $elf=0; $win64=1;
+				  $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`;
+				  $prefix =~ s|\R$||; # Better chomp
+				}
+elsif ($flavour eq "macosx")	{ $gas=1; $elf=0; $prefix="_"; $decor="L\$"; }
+elsif ($flavour eq "masm")	{ $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; }
+elsif ($flavour eq "nasm")	{ $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; }
+elsif (!$gas)
+{   if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i)
+    {	$nasm = $1 + $2*0.01; $PTR="";  }
+    elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/)
+    {	$masm = $1 + $2*2**-16 + $4*2**-32;   }
+    die "no assembler found on %PATH%" if (!($nasm || $masm));
+    $win64=1;
+    $elf=0;
+    $decor="\$L\$";
+}
+
+$dwarf=0 if($win64);
+
+my $current_segment;
+my $current_function;
+my %globals;
+
+{ package opcode;	# pick up opcodes
+    sub re {
+	my	($class, $line) = @_;
+	my	$self = {};
+	my	$ret;
+
+	if ($$line =~ /^([a-z][a-z0-9]*)/i) {
+	    bless $self,$class;
+	    $self->{op} = $1;
+	    $ret = $self;
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+	    undef $self->{sz};
+	    if ($self->{op} =~ /^(movz)x?([bw]).*/) {	# movz is pain...
+		$self->{op} = $1;
+		$self->{sz} = $2;
+	    } elsif ($self->{op} =~ /cmov[n]?[lb]$/) {
+		# pass through
+	    } elsif ($self->{op} =~ /call|jmp/) {
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) {
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
+		$self->{op} = $1;
+		$self->{sz} = $2;
+	    }
+	}
+	$ret;
+    }
+    sub size {
+	my ($self, $sz) = @_;
+	$self->{sz} = $sz if (defined($sz) && !defined($self->{sz}));
+	$self->{sz};
+    }
+    sub out {
+	my $self = shift;
+	if ($gas) {
+	    if ($self->{op} eq "movz") {	# movz is pain...
+		sprintf "%s%s%s",$self->{op},$self->{sz},shift;
+	    } elsif ($self->{op} =~ /^set/) {
+		"$self->{op}";
+	    } elsif ($self->{op} eq "ret") {
+		my $epilogue = "";
+		if ($win64 && $current_function->{abi} eq "svr4"
+			   && !$current_function->{unwind}) {
+		    $epilogue = "movq	8(%rsp),%rdi\n\t" .
+				"movq	16(%rsp),%rsi\n\t";
+		}
+		$epilogue . ".byte	0xf3,0xc3";
+	    } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") {
+		".p2align\t3\n\t.quad";
+	    } else {
+		"$self->{op}$self->{sz}";
+	    }
+	} else {
+	    $self->{op} =~ s/^movz/movzx/;
+	    if ($self->{op} eq "ret") {
+		$self->{op} = "";
+		if ($win64 && $current_function->{abi} eq "svr4"
+			   && !$current_function->{unwind}) {
+		    $self->{op} = "mov	rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t".
+				  "mov	rsi,QWORD$PTR\[16+rsp\]\n\t";
+	    	}
+		$self->{op} .= "DB\t0F3h,0C3h\t\t;repret";
+	    } elsif ($self->{op} =~ /^(pop|push)f/) {
+		$self->{op} .= $self->{sz};
+	    } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
+		$self->{op} = "\tDQ";
+	    }
+	    $self->{op};
+	}
+    }
+    sub mnemonic {
+	my ($self, $op) = @_;
+	$self->{op}=$op if (defined($op));
+	$self->{op};
+    }
+}
+{ package const;	# pick up constants, which start with $
+    sub re {
+	my	($class, $line) = @_;
+	my	$self = {};
+	my	$ret;
+
+	if ($$line =~ /^\$([^,]+)/) {
+	    bless $self, $class;
+	    $self->{value} = $1;
+	    $ret = $self;
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+	}
+	$ret;
+    }
+    sub out {
+    	my $self = shift;
+
+	$self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig;
+	if ($gas) {
+	    # Solaris /usr/ccs/bin/as can't handle multiplications
+	    # in $self->{value}
+	    my $value = $self->{value};
+	    no warnings;    # oct might complain about overflow, ignore here...
+	    $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+	    if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+		$self->{value} = $value;
+	    }
+	    sprintf "\$%s",$self->{value};
+	} else {
+	    my $value = $self->{value};
+	    $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm);
+	    sprintf "%s",$value;
+	}
+    }
+}
+{ package ea;		# pick up effective addresses: expr(%reg,%reg,scale)
+
+    my %szmap = (	b=>"BYTE$PTR",    w=>"WORD$PTR",
+			l=>"DWORD$PTR",   d=>"DWORD$PTR",
+			q=>"QWORD$PTR",   o=>"OWORD$PTR",
+			x=>"XMMWORD$PTR", y=>"YMMWORD$PTR",
+			z=>"ZMMWORD$PTR" ) if (!$gas);
+
+    my %sifmap = (	ss=>"d",	sd=>"q",	# broadcast only
+			i32x2=>"q",	f32x2=>"q",
+			i32x4=>"x",	i64x2=>"x",	i128=>"x",
+			f32x4=>"x",	f64x2=>"x",	f128=>"x",
+			i32x8=>"y",	i64x4=>"y",
+			f32x8=>"y",	f64x4=>"y" ) if (!$gas);
+
+    sub re {
+	my	($class, $line, $opcode) = @_;
+	my	$self = {};
+	my	$ret;
+
+	# optional * ----vvv--- appears in indirect jmp/call
+	if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) {
+	    bless $self, $class;
+	    $self->{asterisk} = $1;
+	    $self->{label} = $2;
+	    ($self->{base},$self->{index},$self->{scale})=split(/,/,$3);
+	    $self->{scale} = 1 if (!defined($self->{scale}));
+	    $self->{opmask} = $4;
+	    $ret = $self;
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+	    if ($win64 && $self->{label} =~ s/\@GOTPCREL//) {
+		die if ($opcode->mnemonic() ne "mov");
+		$opcode->mnemonic("lea");
+	    }
+	    $self->{base}  =~ s/^%//;
+	    $self->{index} =~ s/^%// if (defined($self->{index}));
+	    $self->{opcode} = $opcode;
+	}
+	$ret;
+    }
+    sub size {}
+    sub out {
+	my ($self, $sz) = @_;
+
+	$self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+	$self->{label} =~ s/\.L/$decor/g;
+
+	# Silently convert all EAs to 64-bit. This is required for
+	# elder GNU assembler and results in more compact code,
+	# *but* most importantly AES module depends on this feature!
+	$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+	$self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+
+	# Solaris /usr/ccs/bin/as can't handle multiplications
+	# in $self->{label}...
+	use integer;
+	$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+	$self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
+
+	# Some assemblers insist on signed presentation of 32-bit
+	# offsets, but sign extension is a tricky business in perl...
+	$self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg;
+
+	# if base register is %rbp or %r13, see if it's possible to
+	# flip base and index registers [for better performance]
+	if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
+	    $self->{base} =~ /(rbp|r13)/) {
+		$self->{base} = $self->{index}; $self->{index} = $1;
+	}
+
+	if ($gas) {
+	    $self->{label} =~ s/^___imp_/__imp__/   if ($flavour eq "mingw64");
+
+	    if (defined($self->{index})) {
+		sprintf "%s%s(%s,%%%s,%d)%s",
+					$self->{asterisk},$self->{label},
+					$self->{base}?"%$self->{base}":"",
+					$self->{index},$self->{scale},
+					$self->{opmask};
+	    } else {
+		sprintf "%s%s(%%%s)%s",	$self->{asterisk},$self->{label},
+					$self->{base},$self->{opmask};
+	    }
+	} else {
+	    $self->{label} =~ s/\./\$/g;
+	    $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
+	    $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
+
+	    my $mnemonic = $self->{opcode}->mnemonic();
+	    ($self->{asterisk})				&& ($sz="q") ||
+	    ($mnemonic =~ /^v?mov([qd])$/)		&& ($sz=$1)  ||
+	    ($mnemonic =~ /^v?pinsr([qdwb])$/)		&& ($sz=$1)  ||
+	    ($mnemonic =~ /^vpbroadcast([qdwb])$/)	&& ($sz=$1)  ||
+	    ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/)
+							&& ($sz=$sifmap{$1});
+
+	    $self->{opmask}  =~ s/%(k[0-7])/$1/;
+
+	    if (defined($self->{index})) {
+		sprintf "%s[%s%s*%d%s]%s",$szmap{$sz},
+					$self->{label}?"$self->{label}+":"",
+					$self->{index},$self->{scale},
+					$self->{base}?"+$self->{base}":"",
+					$self->{opmask};
+	    } elsif ($self->{base} eq "rip") {
+		sprintf "%s[%s]",$szmap{$sz},$self->{label};
+	    } else {
+		sprintf "%s[%s%s]%s",	$szmap{$sz},
+					$self->{label}?"$self->{label}+":"",
+					$self->{base},$self->{opmask};
+	    }
+	}
+    }
+}
+{ package register;	# pick up registers, which start with %.
+    sub re {
+	my	($class, $line, $opcode) = @_;
+	my	$self = {};
+	my	$ret;
+
+	# optional * ----vvv--- appears in indirect jmp/call
+	if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) {
+	    bless $self,$class;
+	    $self->{asterisk} = $1;
+	    $self->{value} = $2;
+	    $self->{opmask} = $3;
+	    $opcode->size($self->size());
+	    $ret = $self;
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+	}
+	$ret;
+    }
+    sub size {
+	my	$self = shift;
+	my	$ret;
+
+	if    ($self->{value} =~ /^r[\d]+b$/i)	{ $ret="b"; }
+	elsif ($self->{value} =~ /^r[\d]+w$/i)	{ $ret="w"; }
+	elsif ($self->{value} =~ /^r[\d]+d$/i)	{ $ret="l"; }
+	elsif ($self->{value} =~ /^r[\w]+$/i)	{ $ret="q"; }
+	elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; }
+	elsif ($self->{value} =~ /^[\w]{2}l$/i)	{ $ret="b"; }
+	elsif ($self->{value} =~ /^[\w]{2}$/i)	{ $ret="w"; }
+	elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; }
+
+	$ret;
+    }
+    sub out {
+    	my $self = shift;
+	if ($gas)	{ sprintf "%s%%%s%s",	$self->{asterisk},
+						$self->{value},
+						$self->{opmask}; }
+	else		{ $self->{opmask} =~ s/%(k[0-7])/$1/;
+			  $self->{value}.$self->{opmask}; }
+    }
+}
+{ package label;	# pick up labels, which end with :
+    sub re {
+	my	($class, $line) = @_;
+	my	$self = {};
+	my	$ret;
+
+	if ($$line =~ /(^[\.\w]+)\:/) {
+	    bless $self,$class;
+	    $self->{value} = $1;
+	    $ret = $self;
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+	    $self->{value} =~ s/^\.L/$decor/;
+	}
+	$ret;
+    }
+    sub out {
+	my $self = shift;
+
+	if ($gas) {
+	    my $func = ($globals{$self->{value}} or $self->{value}) . ":";
+	    if ($current_function->{name} eq $self->{value}) {
+		$func .= "\n.cfi_".cfi_directive::startproc()   if ($dwarf);
+		$func .= "\n	.byte	0xf3,0x0f,0x1e,0xfa\n";	# endbranch
+		if ($win64 && $current_function->{abi} eq "svr4") {
+		    my $fp = $current_function->{unwind} ? "%r11" : "%rax";
+		    $func .= "	movq	%rdi,8(%rsp)\n";
+		    $func .= "	movq	%rsi,16(%rsp)\n";
+		    $func .= "	movq	%rsp,$fp\n";
+		    $func .= "${decor}SEH_begin_$current_function->{name}:\n";
+		    my $narg = $current_function->{narg};
+		    $narg=6 if (!defined($narg));
+		    $func .= "	movq	%rcx,%rdi\n" if ($narg>0);
+		    $func .= "	movq	%rdx,%rsi\n" if ($narg>1);
+		    $func .= "	movq	%r8,%rdx\n"  if ($narg>2);
+		    $func .= "	movq	%r9,%rcx\n"  if ($narg>3);
+		    $func .= "	movq	40(%rsp),%r8\n" if ($narg>4);
+		    $func .= "	movq	48(%rsp),%r9\n" if ($narg>5);
+		}
+	    }
+	    $func;
+	} elsif ($self->{value} ne "$current_function->{name}") {
+	    # Make all labels in masm global.
+	    $self->{value} .= ":" if ($masm);
+	    $self->{value} . ":";
+	} elsif ($win64 && $current_function->{abi} eq "svr4") {
+	    my $func =	"$current_function->{name}" .
+			($nasm ? ":" : "\tPROC $current_function->{scope}") .
+			"\n";
+	    my $fp = $current_function->{unwind} ? "r11" : "rax";
+	    $func .= "	DB	243,15,30,250\n";	# endbranch
+	    $func .= "	mov	QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n";
+	    $func .= "	mov	QWORD$PTR\[16+rsp\],rsi\n";
+	    $func .= "	mov	$fp,rsp\n";
+	    $func .= "${decor}SEH_begin_$current_function->{name}:";
+	    $func .= ":" if ($masm);
+	    $func .= "\n";
+	    my $narg = $current_function->{narg};
+	    $narg=6 if (!defined($narg));
+	    $func .= "	mov	rdi,rcx\n" if ($narg>0);
+	    $func .= "	mov	rsi,rdx\n" if ($narg>1);
+	    $func .= "	mov	rdx,r8\n"  if ($narg>2);
+	    $func .= "	mov	rcx,r9\n"  if ($narg>3);
+	    $func .= "	mov	r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4);
+	    $func .= "	mov	r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5);
+	    $func .= "\n";
+	} else {
+	   "$current_function->{name}".
+			($nasm ? ":" : "\tPROC $current_function->{scope}").
+	   "\n	DB	243,15,30,250";			# endbranch
+	}
+    }
+}
+{ package expr;		# pick up expressions
+    sub re {
+	my	($class, $line, $opcode) = @_;
+	my	$self = {};
+	my	$ret;
+
+	if ($$line =~ /(^[^,]+)/) {
+	    bless $self,$class;
+	    $self->{value} = $1;
+	    $ret = $self;
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+	    $self->{value} =~ s/\@PLT// if (!$elf);
+	    $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+	    $self->{value} =~ s/\.L/$decor/g;
+	    $self->{opcode} = $opcode;
+	}
+	$ret;
+    }
+    sub out {
+	my $self = shift;
+	$self->{value};
+    }
+}
+
+my @xdata_seg = (".section	.xdata", ".align	8");
+my @pdata_seg = (".section	.pdata", ".align	4");
+
+{ package cfi_directive;
+    # CFI directives annotate instructions that are significant for
+    # stack unwinding procedure compliant with DWARF specification,
+    # see http://dwarfstd.org/. Besides naturally expected for this
+    # script platform-specific filtering function, this module adds
+    # three auxiliary synthetic directives not recognized by [GNU]
+    # assembler:
+    #
+    # - .cfi_push to annotate push instructions in prologue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_offset;
+    # - .cfi_pop to annotate pop instructions in epilogue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_restore;
+    # - [and most notably] .cfi_cfa_expression which encodes
+    #   DW_CFA_def_cfa_expression and passes it to .cfi_escape as
+    #   byte vector;
+    #
+    # CFA expressions were introduced in DWARF specification version
+    # 3 and describe how to deduce CFA, Canonical Frame Address. This
+    # becomes handy if your stack frame is variable and you can't
+    # spare register for [previous] frame pointer. Suggested directive
+    # syntax is made-up mix of DWARF operator suffixes [subset of]
+    # and references to registers with optional bias. Following example
+    # describes offloaded *original* stack pointer at specific offset
+    # from *current* stack pointer:
+    #
+    #   .cfi_cfa_expression     %rsp+40,deref,+8
+    #
+    # Final +8 has everything to do with the fact that CFA is defined
+    # as reference to top of caller's stack, and on x86_64 call to
+    # subroutine pushes 8-byte return address. In other words original
+    # stack pointer upon entry to a subroutine is 8 bytes off from CFA.
+    #
+    # In addition the .cfi directives are re-purposed even for Win64
+    # stack unwinding. Two more synthetic directives were added:
+    #
+    # - .cfi_end_prologue to denote point when all non-volatile
+    #   registers are saved and stack or [chosen] frame pointer is
+    #   stable;
+    # - .cfi_epilogue to denote point when all non-volatile registers
+    #   are restored [and it even adds missing .cfi_restore-s];
+    #
+    # Though it's not universal "miracle cure," it has its limitations.
+    # Most notably .cfi_cfa_expression won't start working... For more
+    # information see the end of this file.
+
+    # Below constants are taken from "DWARF Expressions" section of the
+    # DWARF specification, section is numbered 7.7 in versions 3 and 4.
+    my %DW_OP_simple = (	# no-arg operators, mapped directly
+	deref	=> 0x06,	dup	=> 0x12,
+	drop	=> 0x13,	over	=> 0x14,
+	pick	=> 0x15,	swap	=> 0x16,
+	rot	=> 0x17,	xderef	=> 0x18,
+
+	abs	=> 0x19,	and	=> 0x1a,
+	div	=> 0x1b,	minus	=> 0x1c,
+	mod	=> 0x1d,	mul	=> 0x1e,
+	neg	=> 0x1f,	not	=> 0x20,
+	or	=> 0x21,	plus	=> 0x22,
+	shl	=> 0x24,	shr	=> 0x25,
+	shra	=> 0x26,	xor	=> 0x27,
+	);
+
+    my %DW_OP_complex = (	# used in specific subroutines
+	constu		=> 0x10,	# uleb128
+	consts		=> 0x11,	# sleb128
+	plus_uconst	=> 0x23,	# uleb128
+	lit0 		=> 0x30,	# add 0-31 to opcode
+	reg0		=> 0x50,	# add 0-31 to opcode
+	breg0		=> 0x70,	# add 0-31 to opcode, sleb128
+	regx		=> 0x90,	# uleb128
+	fbreg		=> 0x91,	# sleb128
+	bregx		=> 0x92,	# uleb128, sleb128
+	piece		=> 0x93,	# uleb128
+	);
+
+    # Following constants are defined in x86_64 ABI supplement, for
+    # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf,
+    # see section 3.7 "Stack Unwind Algorithm".
+    my %DW_reg_idx = (
+	"%rax"=>0,  "%rdx"=>1,  "%rcx"=>2,  "%rbx"=>3,
+	"%rsi"=>4,  "%rdi"=>5,  "%rbp"=>6,  "%rsp"=>7,
+	"%r8" =>8,  "%r9" =>9,  "%r10"=>10, "%r11"=>11,
+	"%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+	);
+
+    my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs);
+    my @cfa_stack;
+
+    # [us]leb128 format is a variable-length integer representation, base
+    # 128 (seven payload bits per byte), with most significant bit of each
+    # byte being 0 denoting
+    # *last* most significant digit. See "Variable Length Data" in the
+    # DWARF specification, numbered 7.6 at least in versions 3 and 4.
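+    #
+    # For example, uleb128(624485) returns (0xe5, 0x8e, 0x26): 624485 is
+    # 0x65 + (0x0e<<7) + (0x26<<14), and only the last byte has its most
+    # significant bit clear.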
+    sub sleb128 {
+	use integer;	# get right shift extend sign
+
+	my $val = shift;
+	my $sign = ($val < 0) ? -1 : 0;
+	my @ret = ();
+
+	while(1) {
+	    push @ret, $val&0x7f;
+
+	    # see if remaining bits are same and equal to most
+	    # significant bit of the current digit, if so, it's
+	    # last digit...
+	    last if (($val>>6) == $sign);
+
+	    @ret[-1] |= 0x80;
+	    $val >>= 7;
+	}
+
+	return @ret;
+    }
+    sub uleb128 {
+	my $val = shift;
+	my @ret = ();
+
+	while(1) {
+	    push @ret, $val&0x7f;
+
+	    # see if it's last significant digit...
+	    last if (($val >>= 7) == 0);
+
+	    @ret[-1] |= 0x80;
+	}
+
+	return @ret;
+    }
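+    # To give an example of what the two loops above produce:
+    # uleb128(300) comes out as (0xac,0x02), while sleb128(-8) and
+    # sleb128(40) come out as the single bytes 0x78 and 0x28
+    # respectively.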
+    sub const {
+	my $val = shift;
+
+	if ($val >= 0 && $val < 32) {
+            return ($DW_OP_complex{lit0}+$val);
+	}
+	return ($DW_OP_complex{consts}, sleb128($val));
+    }
+    sub reg {
+	my $val = shift;
+
+	return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+	my $reg = $DW_reg_idx{$1};
+	my $off = eval ("0 $2 $3");
+
+	return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+	# Yes, we use DW_OP_bregX+0 to push register value and not
+	# DW_OP_regX, because the latter would also require DW_OP_piece,
+	# which would be a waste under the circumstances. If you have
+	# to use DW_OP_regX, use "regx:N"...
+    }
+    sub cfa_expression {
+	my $line = shift;
+	my @ret;
+
+	foreach my $token (split(/,\s*/,$line)) {
+	    if ($token =~ /^%r/) {
+		push @ret,reg($token);
+	    } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) {
+		push @ret,reg("$2+$1");
+	    } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+		my $i = 1*eval($2);
+		push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+	    } elsif (my $i = 1*eval($token) or $token eq "0") {
+		if ($token =~ /^\+/) {
+		    push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+		} else {
+		    push @ret,const($i);
+		}
+	    } else {
+		push @ret,$DW_OP_simple{$token};
+	    }
+	}
+
+	# Finally we return DW_CFA_def_cfa_expression, 15, followed by
+	# length of the expression and of course the expression itself.
+	return (15,scalar(@ret),@ret);
+    }
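+    # To give an example: the directive quoted near the top of this
+    # comment block,
+    #
+    #   .cfi_cfa_expression     %rsp+40,deref,+8
+    #
+    # goes through reg("%rsp+40") -> (0x77,0x28), "deref" -> 0x06 and
+    # "+8" -> (0x23,0x08), so the /cfa_expression/ case further below
+    # should emit
+    #
+    #   .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+    #
+    # i.e. DW_CFA_def_cfa_expression, length 5, then the expression.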
+
+    # Following constants are defined in "x64 exception handling" at
+    # https://docs.microsoft.com/ and match the register sequence in
+    # CONTEXT structure defined in winnt.h.
+    my %WIN64_reg_idx = (
+	"%rax"=>0,  "%rcx"=>1,  "%rdx"=>2,  "%rbx"=>3,
+	"%rsp"=>4,  "%rbp"=>5,  "%rsi"=>6,  "%rdi"=>7,
+	"%r8" =>8,  "%r9" =>9,  "%r10"=>10, "%r11"=>11,
+	"%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+	);
+    sub xdata {
+	our @dat = ();
+	our $len = 0;
+
+	sub allocstack {
+	    my $offset = shift;
+
+	    if ($offset) {
+		if ($offset <= 128) {
+	            $offset = ($offset - 8) >> 3;
+		    push @dat, [0,$offset<<4|2];	# UWOP_ALLOC_SMALL
+		} elsif ($offset < 0x80000) {
+		    push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))];
+		} else {
+		    push @dat, [0,0x11,unpack("C4",pack("V",$offset))];
+		}
+		$len += $#{@dat[-1]}+1;
+	    }
+	}
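+	# For instance, allocstack(48) should push [0,0x52], that is
+	# UWOP_ALLOC_SMALL (op code 2) with op info 5 for (5+1)*8 = 48
+	# bytes, at prologue offset 0.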
+
+	# allocate stack frame
+	if (my $offset = -8 - $cfa_rsp) {
+	    # but see if frame pointer is among saved registers
+	    if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) {
+	        $fp_off = -8 - $fp_off;
+		allocstack($fp_off-8);
+		$offset -= $fp_off;
+		push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL
+		$len += $#{@dat[-1]}+1;
+	    }
+	    allocstack($offset);
+	}
+	# set up frame pointer
+	my $fp_info = 0;
+	if ($cfa_reg ne "%rsp") {
+	    my $offset = $cfa_off - $cfa_rsp;
+	    ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset";
+	    $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg};
+	    push @dat, [0,3];				# UWOP_SET_FPREG
+	    $len += $#{@dat[-1]}+1;
+	}
+	# save registers
+	foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} }
+			      keys(%saved_regs)) {
+	    next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key);
+	    my $offset = $saved_regs{$key} - $cfa_rsp;
+	    if ($key =~ /%xmm([0-9]+)/) {
+		if ($offset < 0x100000) {
+		    push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))];
+		} else {
+		    push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))];
+		}
+	    } else {
+		if ($offset < 0x80000) {
+		    push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4,
+				unpack("C2",pack("v",$offset>>3))];
+		} else {
+		    push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5,
+				unpack("C4",pack("V",$offset))];
+		}
+	    }
+	    $len += $#{@dat[-1]}+1;
+	}
+
+	my @ret;
+	# generate 4-byte descriptor
+	push @ret, ".byte	1,0,".($len/2).",$fp_info";
+	$len += 4;
+	# pad to 8*n
+	unshift @dat, [(0)x((-$len)&7)] if ($len&7);
+	# emit data
+	while(defined(my $row = pop @dat)) {
+	    push @ret, ".byte	". join(",",
+					map { sprintf "0x%02x",$_ } @{$row});
+	}
+
+	return @ret;
+    }
+    sub startproc {
+	return if ($cfa_rsp == -8);
+	($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8);
+	%saved_regs = ();
+	return "startproc";
+    }
+    sub endproc {
+	return if ($cfa_rsp == 0);
+	($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0);
+	%saved_regs = ();
+	return "endproc";
+    }
+    sub re {
+	my	($class, $line) = @_;
+	my	$self = {};
+	my	$ret;
+
+	if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) {
+	    bless $self,$class;
+	    $ret = $self;
+	    undef $self->{value};
+	    my $dir = $1;
+
+	    SWITCH: for ($dir) {
+	    # What is $cfa_rsp? Effectively it's difference between %rsp
+	    # value and current CFA, Canonical Frame Address, which is
+	    # why it starts with -8. Recall that CFA is top of caller's
+	    # stack...
+	    /startproc/	&& do {	$dir = startproc(); last; };
+	    /endproc/	&& do {	$dir = endproc();
+				# .cfi_remember_state directives that are not
+				# matched with .cfi_restore_state are
+				# unnecessary.
+				die "unpaired .cfi_remember_state" if (@cfa_stack);
+				last;
+			      };
+	    /def_cfa_register/
+			&& do {	$cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp");
+				$cfa_reg = $$line;
+				last;
+			      };
+	    /def_cfa_offset/
+			&& do {	$cfa_off = -1*eval($$line);
+				$cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp");
+				last;
+			      };
+	    /adjust_cfa_offset/
+			&& do { my $val = 1*eval($$line);
+				$cfa_off -= $val;
+				if ($cfa_reg eq "%rsp") {
+				    $cfa_rsp -= $val;
+				}
+				last;
+			      };
+	    /def_cfa/	&& do {	if ($$line =~ /(%r\w+)\s*,\s*(.+)/) {
+				    $cfa_reg = $1;
+				    $cfa_off = -1*eval($2);
+				    $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp");
+				}
+				last;
+			      };
+	    /push/	&& do {	$dir = undef;
+				$cfa_rsp -= 8;
+				if ($cfa_reg eq "%rsp") {
+				    $cfa_off = $cfa_rsp;
+				    $self->{value} = ".cfi_adjust_cfa_offset\t8\n";
+				}
+				$saved_regs{$$line} = $cfa_rsp;
+				$self->{value} .= ".cfi_offset\t$$line,$cfa_rsp";
+				last;
+			      };
+	    /pop/	&& do {	$dir = undef;
+				$cfa_rsp += 8;
+				if ($cfa_reg eq "%rsp") {
+				    $cfa_off = $cfa_rsp;
+				    $self->{value} = ".cfi_adjust_cfa_offset\t-8\n";
+				}
+				$self->{value} .= ".cfi_restore\t$$line";
+				delete $saved_regs{$$line};
+				last;
+			      };
+	    /cfa_expression/
+			&& do {	$dir = undef;
+				$self->{value} = ".cfi_escape\t" .
+					join(",", map(sprintf("0x%02x", $_),
+						      cfa_expression($$line)));
+				last;
+			      };
+	    /remember_state/
+			&& do {	push @cfa_stack,
+				     [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs];
+				last;
+			      };
+	    /restore_state/
+			&& do {	     ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs)
+				= @{pop @cfa_stack};
+				last;
+			      };
+	    /offset/	&& do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) {
+				    $saved_regs{$1} = 1*eval($2);
+				    $dir = undef if ($1 =~ /%xmm/);
+				}
+				last;
+			      };
+	    /restore/	&& do {	delete $saved_regs{$$line}; last; };
+	    /end_prologue/
+			&& do {	$dir = undef;
+				$self->{win64} = ".endprolog";
+				last;
+			      };
+	    /epilogue/	&& do {	$dir = undef;
+				$self->{win64} = ".epilogue";
+				$self->{value} = join("\n",
+						      map { ".cfi_restore\t$_" }
+						      sort keys(%saved_regs));
+				%saved_regs = ();
+				last;
+			      };
+	    }
+
+	    $self->{value} = ".cfi_$dir\t$$line" if ($dir);
+
+	    $$line = "";
+	}
+
+	return $ret;
+    }
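+    # To trace one case through the SWITCH above: right after
+    # .cfi_startproc ($cfa_rsp is -8), a synthetic ".cfi_push %rbp"
+    # takes the /push/ branch and should come out as
+    #
+    #   .cfi_adjust_cfa_offset  8
+    #   .cfi_offset     %rbp,-16
+    #
+    # with %rbp recorded in %saved_regs at -16 for later use by xdata().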
+    sub out {
+	my $self = shift;
+	return $self->{value} if ($dwarf);
+
+	if ($win64 and $current_function->{unwind}
+		   and my $ret = $self->{win64}) {
+	    my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/)  ? ($',    $cfa_off)
+							: ("rsp", $cfa_rsp);
+	    my $fname = $current_function->{name};
+
+	    if ($ret eq ".endprolog") {
+		$saved_regs{"%rdi"} = 0;	# relative to CFA, remember?
+		$saved_regs{"%rsi"} = 8;
+
+		push @pdata_seg,
+		    ".rva	.LSEH_begin_${fname}",
+		    ".rva	.LSEH_body_${fname}",
+		    ".rva	.LSEH_info_${fname}_prologue","";
+		push @xdata_seg,
+		    ".LSEH_info_${fname}_prologue:",
+		    ".byte	1,0,5,0x0b",	# 5 unwind codes, %r11 is FP
+		    ".byte	0,0x74,1,0",	# %rdi at 8(%rsp)
+		    ".byte	0,0x64,2,0",	# %rsi at 16(%rsp)
+		    ".byte	0,0x03",	# set frame pointer
+		    ".byte	0,0"		# padding
+		    ;
+		push @pdata_seg,
+		    ".rva	.LSEH_body_${fname}",
+		    ".rva	.LSEH_epilogue_${fname}",
+		    ".rva	.LSEH_info_${fname}_body","";
+		push @xdata_seg,".LSEH_info_${fname}_body:", xdata();
+		$ret  = "${decor}SEH_body_${fname}:";
+		$ret .= ":" if ($masm); $ret .= "\n";
+	    } elsif ($ret eq ".epilogue") {
+		%saved_regs = ();
+		$saved_regs{"%rdi"} = 0;	# relative to CFA, remember?
+		$saved_regs{"%rsi"} = 8;
+		$cfa_rsp = $cfa_off;
+
+		push @pdata_seg,
+		    ".rva	.LSEH_epilogue_${fname}",
+		    ".rva	.LSEH_end_${fname}",
+		    ".rva	.LSEH_info_${fname}_epilogue","";
+		push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), "";
+		$ret  = "${decor}SEH_epilogue_${fname}:";
+		$ret .= ":" if ($masm); $ret .= "\n";
+		if ($gas) {
+		    $ret .= "	mov	".(0-$off)."(%$reg),%rdi\n";
+		    $ret .= "	mov	".(8-$off)."(%$reg),%rsi\n";
+		} else {
+		    $ret .= "	mov	rdi,QWORD$PTR\[".(0-$off)."+$reg\]";
+		    $ret .= "	;WIN64 epilogue\n";
+		    $ret .= "	mov	rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n";
+		}
+	    }
+	    return $ret;
+	}
+	return;
+    }
+}
+{ package directive;	# pick up directives, which start with .
+    sub re {
+	my	($class, $line) = @_;
+	my	$self = {};
+	my	$ret;
+	my	$dir;
+
+	# chain-call to cfi_directive
+	$ret = cfi_directive->re($line) and return $ret;
+
+	if ($$line =~ /^\s*(\.\w+)/) {
+	    bless $self,$class;
+	    $dir = $1;
+	    $ret = $self;
+	    undef $self->{value};
+	    $$line = substr($$line,@+[0]); $$line =~ s/^\s+//;
+
+	    SWITCH: for ($dir) {
+		/\.global|\.globl|\.extern/
+			    && do { $globals{$$line} = $prefix . $$line;
+				    $$line = $globals{$$line} if ($prefix);
+				    last;
+				  };
+		/\.type/    && do { my ($sym,$type,$narg,$unwind) = split(',',$$line);
+				    if ($type eq "\@function") {
+					undef $current_function;
+					$current_function->{name} = $sym;
+					$current_function->{abi}  = "svr4";
+					$current_function->{narg} = $narg;
+					$current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE";
+					$current_function->{unwind} = $unwind;
+				    } elsif ($type eq "\@abi-omnipotent") {
+					undef $current_function;
+					$current_function->{name} = $sym;
+					$current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE";
+				    }
+				    $$line =~ s/\@abi\-omnipotent/\@function/;
+				    $$line =~ s/\@function.*/\@function/;
+				    last;
+				  };
+		/\.asciz/   && do { if ($$line =~ /^"(.*)"$/) {
+					$dir  = ".byte";
+					$$line = join(",",unpack("C*",$1),0);
+				    }
+				    last;
+				  };
+		/\.rva|\.long|\.quad/
+			    && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
+				    $$line =~ s/\.L/$decor/g;
+				    last;
+				  };
+	    }
+
+	    if ($gas) {
+		$self->{value} = $dir . "\t" . $$line;
+
+		if ($dir =~ /\.extern/) {
+		    $self->{value} = ""; # swallow extern
+		} elsif (!$elf && $dir =~ /\.type/) {
+		    $self->{value} = "";
+		    $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" .
+				(defined($globals{$1})?".scl 2;":".scl 3;") .
+				"\t.type 32;\t.endef"
+				if ($win64 && $$line =~ /([^,]+),\@function/);
+		} elsif ($dir =~ /\.size/) {
+		    $self->{value} = "" if (!$elf);
+		    if ($dwarf and my $endproc = cfi_directive::endproc()) {
+			$self->{value} = ".cfi_$endproc\n$self->{value}";
+		    } elsif (!$elf && defined($current_function)) {
+			$self->{value} .= "${decor}SEH_end_$current_function->{name}:"
+				if ($win64 && $current_function->{abi} eq "svr4");
+			undef $current_function;
+		    }
+		} elsif (!$elf && $dir =~ /\.align/) {
+		    $self->{value} = ".p2align\t" . (log($$line)/log(2));
+		} elsif ($dir eq ".section") {
+		    $current_segment=$$line;
+		    if (!$elf && $current_segment eq ".init") {
+			if	($flavour eq "macosx")	{ $self->{value} = ".mod_init_func"; }
+			elsif	($flavour eq "mingw64")	{ $self->{value} = ".section\t.ctors"; }
+		    }
+		} elsif ($dir =~ /\.(text|data)/) {
+		    $current_segment=".$1";
+		} elsif ($dir =~ /\.hidden/) {
+		    if    ($flavour eq "macosx")  { $self->{value} = ".private_extern\t$prefix$$line"; }
+		    elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+		} elsif ($dir =~ /\.comm/) {
+		    $self->{value} = "$dir\t$prefix$$line";
+		    $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
+		}
+		$$line = "";
+		return $self;
+	    }
+
+	    # non-gas case or nasm/masm
+	    SWITCH: for ($dir) {
+		/\.text/    && do { my $v=undef;
+				    if ($nasm) {
+					$v="section	.text code align=64\n";
+				    } else {
+					$v="$current_segment\tENDS\n" if ($current_segment);
+					$current_segment = ".text\$";
+					$v.="$current_segment\tSEGMENT ";
+					$v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE";
+					$v.=" 'CODE'";
+				    }
+				    $self->{value} = $v;
+				    last;
+				  };
+		/\.data/    && do { my $v=undef;
+				    if ($nasm) {
+					$v="section	.data data align=8\n";
+				    } else {
+					$v="$current_segment\tENDS\n" if ($current_segment);
+					$current_segment = "_DATA";
+					$v.="$current_segment\tSEGMENT";
+				    }
+				    $self->{value} = $v;
+				    last;
+				  };
+		/\.section/ && do { my $v=undef;
+				    $$line =~ s/([^,]*).*/$1/;
+				    $$line = ".CRT\$XCU" if ($$line eq ".init");
+				    if ($nasm) {
+					$v="section	$$line";
+					if ($$line=~/\.([px])data/) {
+					    $v.=" rdata align=";
+					    $v.=$1 eq "p"? 4 : 8;
+					} elsif ($$line=~/\.CRT\$/i) {
+					    $v.=" rdata align=8";
+					}
+				    } else {
+					$v="$current_segment\tENDS\n" if ($current_segment);
+					$v.="$$line\tSEGMENT";
+					if ($$line=~/\.([px])data/) {
+					    $v.=" READONLY";
+					    $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
+					} elsif ($$line=~/\.CRT\$/i) {
+					    $v.=" READONLY ";
+					    $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
+					}
+				    }
+				    $current_segment = $$line;
+				    $self->{value} = $v;
+				    last;
+				  };
+		/\.extern/  && do { $self->{value}  = "EXTERN\t".$$line;
+				    $self->{value} .= ":NEAR" if ($masm);
+				    last;
+				  };
+		/\.globl|.global/
+			    && do { $self->{value}  = $masm?"PUBLIC":"global";
+				    $self->{value} .= "\t".$$line;
+				    last;
+				  };
+		/\.size/    && do { if (defined($current_function)) {
+					undef $self->{value};
+					if ($current_function->{abi} eq "svr4") {
+					    $self->{value}="${decor}SEH_end_$current_function->{name}:";
+					    $self->{value}.=":\n" if($masm);
+					}
+					$self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
+					undef $current_function;
+				    }
+				    last;
+				  };
+		/\.align/   && do { my $max = ($masm && $masm>=$masmref) ? 256 : 4096;
+				    $self->{value} = "ALIGN\t".($$line>$max?$max:$$line);
+				    last;
+				  };
+		/\.(value|long|rva|quad)/
+			    && do { my $sz  = substr($1,0,1);
+				    my @arr = split(/,\s*/,$$line);
+				    my $last = pop(@arr);
+				    my $conv = sub  {	my $var=shift;
+							$var=~s/^(0b[0-1]+)/oct($1)/eig;
+							$var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm);
+							if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
+							{ $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
+							$var;
+						    };
+
+				    $sz =~ tr/bvlrq/BWDDQ/;
+				    $self->{value} = "\tD$sz\t";
+				    for (@arr) { $self->{value} .= &$conv($_).","; }
+				    $self->{value} .= &$conv($last);
+				    last;
+				  };
+		/\.byte/    && do { my @str=split(/,\s*/,$$line);
+				    map(s/(0b[0-1]+)/oct($1)/eig,@str);
+				    map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
+				    while ($#str>15) {
+					$self->{value}.="DB\t"
+						.join(",",@str[0..15])."\n";
+					foreach (0..15) { shift @str; }
+				    }
+				    $self->{value}.="DB\t"
+						.join(",",@str) if (@str);
+				    last;
+				  };
+		/\.comm/    && do { my @str=split(/,\s*/,$$line);
+				    my $v=undef;
+				    if ($nasm) {
+					$v.="common	$prefix@str[0] @str[1]";
+				    } else {
+					$v="$current_segment\tENDS\n" if ($current_segment);
+					$current_segment = "_DATA";
+					$v.="$current_segment\tSEGMENT\n";
+					$v.="COMM	@str[0]:DWORD:".@str[1]/4;
+				    }
+				    $self->{value} = $v;
+				    last;
+				  };
+	    }
+	    $$line = "";
+	}
+
+	$ret;
+    }
+    sub out {
+	my $self = shift;
+	$self->{value};
+    }
+}
+
+# When x86_64 was initially introduced, SSE extensions beyond SSE2 did
+# not exist yet. In order not to be bothered by tracing exact assembler
+# versions, but at the same time to provide a bare security minimum of
+# AES-NI, we hard-code some instructions. Extensions past AES-NI, on
+# the other hand, are traced by examining the assembler version in
+# individual perlasm modules...
+
+my %regrm = (	"%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+		"%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7	);
+
+sub rex {
+ my $opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+   $rex|=0x04 if($dst>=8);
+   $rex|=0x01 if($src>=8);
+   push @$opcode,($rex|0x40) if ($rex);
+}
+
+my $movq = sub {	# elderly gas can't handle inter-register movq
+  my $arg = shift;
+  my @opcode=(0x66);
+    if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+	my ($src,$dst)=($1,$2);
+	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,$src,$dst,0x8);
+	push @opcode,0x0f,0x7e;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	@opcode;
+    } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+	my ($src,$dst)=($2,$1);
+	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,$src,$dst,0x8);
+	push @opcode,0x0f,0x6e;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	@opcode;
+    } else {
+	();
+    }
+};
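+# To give an example of the hand-coding: "movq %xmm5,%r10" is matched
+# by the first branch above and should come out as the byte sequence
+# 0x66,0x49,0x0f,0x7e,0xea, i.e. 66 REX.W+B 0F 7E followed by ModR/M
+# 0xea (xmm5 -> r10); process() below emits such sequences via .byte/DB.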
+
+my $pextrd = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+      my @opcode=(0x66);
+	my $imm=$1;
+	my $src=$2;
+	my $dst=$3;
+	if ($dst =~ /%r([0-9]+)d/)	{ $dst = $1; }
+	elsif ($dst =~ /%e/)		{ $dst = $regrm{$dst}; }
+	rex(\@opcode,$src,$dst);
+	push @opcode,0x0f,0x3a,0x16;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	push @opcode,$imm;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pinsrd = sub {
+    if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	my $imm=$1;
+	my $src=$2;
+	my $dst=$3;
+	if ($src =~ /%r([0-9]+)/)	{ $src = $1; }
+	elsif ($src =~ /%e/)		{ $src = $regrm{$src}; }
+	rex(\@opcode,$dst,$src);
+	push @opcode,0x0f,0x3a,0x22;
+	push @opcode,0xc0|(($dst&7)<<3)|($src&7);	# ModR/M
+	push @opcode,$imm;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pshufb = sub {
+    if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$2,$1);
+	push @opcode,0x0f,0x38,0x00;
+	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $palignr = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x3a,0x0f;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	push @opcode,$1;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pclmulqdq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x3a,0x44;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	my $c=$1;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $rdrand = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,0,$dst,8);
+	push @opcode,0x0f,0xc7,0xf0|($dst&7);
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $rdseed = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,0,$dst,8);
+	push @opcode,0x0f,0xc7,0xf8|($dst&7);
+	@opcode;
+    } else {
+	();
+    }
+};
+
+# Not all AVX-capable assemblers recognize the AMD XOP extension. Since
+# we are using only two such instructions, we hand-code them in order
+# to be excused from chasing assembler versions...
+
+sub rxb {
+ my $opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+   $rxb|=0x7<<5;
+   $rxb&=~(0x04<<5) if($dst>=8);
+   $rxb&=~(0x01<<5) if($src1>=8);
+   $rxb&=~(0x02<<5) if($src2>=8);
+   push @$opcode,$rxb;
+}
+
+my $vprotd = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x8f);
+	rxb(\@opcode,$3,$2,-1,0x08);
+	push @opcode,0x78,0xc2;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	my $c=$1;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $vprotq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x8f);
+	rxb(\@opcode,$3,$2,-1,0x08);
+	push @opcode,0x78,0xc3;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	my $c=$1;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	@opcode;
+    } else {
+	();
+    }
+};
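+# For instance, "vprotd $29,%xmm0,%xmm1" should come out here as the
+# bytes 0x8f (XOP escape), 0xe8 (built by rxb()), 0x78, 0xc2 (opcode),
+# 0xc8 (ModR/M for xmm0 -> xmm1) and the immediate 29.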
+
+# Intel Control-flow Enforcement Technology extension. All functions and
+# indirect branch targets will have to start with this instruction...
+# However, it should not be used in functions' prologues explicitly, as
+# it's added automatically [and in the right spot]. Which leaves only
+# non-function indirect branch targets, such as in a case-like dispatch
+# table, as application area.
+
+my $endbr64 = sub {
+    (0xf3,0x0f,0x1e,0xfa);
+};
+
+########################################################################
+
+if ($nasm) {
+    print <<___;
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+___
+} elsif ($masm) {
+    print <<___;
+OPTION	DOTNAME
+___
+}
+
+sub process {
+    my $line = shift;
+
+    $line =~ s|\R$||;           # Better chomp
+
+    $line =~ s|[#!].*$||;	# get rid of asm-style comments...
+    $line =~ s|/\*.*\*/||;	# ... and C-style comments...
+    $line =~ s|^\s+||;		# ... and skip white spaces in beginning
+    $line =~ s|\s+$||;		# ... and at the end
+
+    if (my $label=label->re(\$line))	{ print $label->out(); }
+
+    if (my $directive=directive->re(\$line)) {
+	printf "%s",$directive->out();
+    } elsif (my $opcode=opcode->re(\$line)) {
+	my $asm = eval("\$".$opcode->mnemonic());
+
+	if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
+	    print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+	    next;
+	}
+
+	my @args;
+	ARGUMENT: while (1) {
+	    my $arg;
+
+	    ($arg=register->re(\$line, $opcode))||
+	    ($arg=const->re(\$line))		||
+	    ($arg=ea->re(\$line, $opcode))	||
+	    ($arg=expr->re(\$line, $opcode))	||
+	    last ARGUMENT;
+
+	    push @args,$arg;
+
+	    last ARGUMENT if ($line !~ /^,/);
+
+	    $line =~ s/^,\s*//;
+	} # ARGUMENT:
+
+	if ($#args>=0) {
+	    my $insn;
+	    my $sz=$opcode->size();
+
+	    if ($gas) {
+		$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+		@args = map($_->out($sz),@args);
+		printf "\t%s\t%s",$insn,join(",",@args);
+	    } else {
+		$insn = $opcode->out();
+		foreach (@args) {
+		    my $arg = $_->out();
+		    # $insn.=$sz compensates for movq, pinsrw, ...
+		    if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+		    if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
+		    if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; }
+		    if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
+		}
+		@args = reverse(@args);
+		undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+		printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
+	    }
+	} else {
+	    printf "\t%s",$opcode->out();
+	}
+    }
+
+    print $line,"\n";
+}
+
+while(<>) { process($_); }
+
+map { process($_) } @pdata_seg if ($win64);
+map { process($_) } @xdata_seg if ($win64);
+
+# platform-specific epilogue
+if ($masm) {
+    print "\n$current_segment\tENDS\n"	if ($current_segment);
+    print "END\n";
+} elsif ($elf) {
+    # -fcf-protection segment, snatched from compiler -S output
+    my $align = ($flavour =~ /elf32/) ? 4 : 8;
+    print <<___;
+
+.section	.note.GNU-stack,"",\@progbits
+.section	.note.gnu.property,"a",\@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	$align
+2:
+___
+}
+
+close STDOUT;
+
+#################################################
+# Cross-reference x86_64 ABI "card"
+#
+# 		Unix		Win64
+# %rax		*		*
+# %rbx		-		-
+# %rcx		#4		#1
+# %rdx		#3		#2
+# %rsi		#2		-
+# %rdi		#1		-
+# %rbp		-		-
+# %rsp		-		-
+# %r8		#5		#3
+# %r9		#6		#4
+# %r10		*		*
+# %r11		*		*
+# %r12		-		-
+# %r13		-		-
+# %r14		-		-
+# %r15		-		-
+#
+# (*)	volatile register
+# (-)	preserved by callee
+# (#)	Nth argument, volatile
+#
+# In Unix terms top of stack is argument transfer area for arguments
+# which could not be accommodated in registers. Or in other words 7th
+# [integer] argument resides at 8(%rsp) upon function entry point.
+# The 128 bytes beyond %rsp (at negative offsets from it) constitute a
+# "red zone" which is not touched by signal handlers and can be used as
+# temporary storage without allocating a frame.
+#
+# In Win64 terms N*8 bytes on top of stack is argument transfer area,
+# which belongs to/can be overwritten by callee. N is the number of
+# arguments passed to callee, *but* not less than 4! This means that
+# upon function entry the 5th argument resides at 40(%rsp), and that
+# the 32 bytes from 8(%rsp) can always be used as temporary storage
+# [without allocating a frame]. One can actually argue that a "red
+# zone" beyond the stack pointer can be assumed under Win64 as well:
+# apparently on no occasion would the Windows kernel alter the area
+# beyond the user stack pointer in a truly asynchronous manner...
+#
+# All the above means that if assembler programmer adheres to Unix
+# register and stack layout, but disregards the "red zone" existence,
+# it's possible to use following prologue and epilogue to "gear" from
+# Unix to Win64 ABI in leaf functions with not more than 6 arguments.
+#
+# omnipotent_function:
+# ifdef WIN64
+#	movq	%rdi,8(%rsp)
+#	movq	%rsi,16(%rsp)
+#	movq	%rcx,%rdi	; if 1st argument is actually present
+#	movq	%rdx,%rsi	; if 2nd argument is actually ...
+#	movq	%r8,%rdx	; if 3rd argument is ...
+#	movq	%r9,%rcx	; if 4th argument ...
+#	movq	40(%rsp),%r8	; if 5th ...
+#	movq	48(%rsp),%r9	; if 6th ...
+# endif
+#	...
+# ifdef WIN64
+#	movq	8(%rsp),%rdi
+#	movq	16(%rsp),%rsi
+# endif
+#	ret
+#
+#################################################
+# Win64 SEH, Structured Exception Handling.
+#
+# Unlike on Unix systems(*), lack of Win64 stack unwinding information
+# has an undesired side-effect at run-time: if an exception is raised
+# in an assembler subroutine such as those in question (basically we're
+# referring to segmentation violations caused by malformed input
+# parameters), the application is briskly terminated without invoking
+# any exception handlers, most notably without generating a memory dump
+# or any user notification whatsoever. This poses a problem. It's
+# possible to address it by registering a custom language-specific
+# handler that would restore the processor context to the state at the
+# subroutine entry point and return the "exception is not handled, keep
+# unwinding" code. Writing such a handler can be a challenge... But
+# it's doable, though it requires a certain coding convention. Consider
+# the following snippet:
+#
+# .type	function,@function
+# function:
+#	movq	%rsp,%rax	# copy rsp to volatile register
+#	pushq	%r15		# save non-volatile registers
+#	pushq	%rbx
+#	pushq	%rbp
+#	movq	%rsp,%r11
+#	subq	%rdi,%r11	# prepare [variable] stack frame
+#	andq	$-64,%r11
+#	movq	%rax,0(%r11)	# check for exceptions
+#	movq	%r11,%rsp	# allocate [variable] stack frame
+#	movq	%rax,0(%rsp)	# save original rsp value
+# magic_point:
+#	...
+#	movq	0(%rsp),%rcx	# pull original rsp value
+#	movq	-24(%rcx),%rbp	# restore non-volatile registers
+#	movq	-16(%rcx),%rbx
+#	movq	-8(%rcx),%r15
+#	movq	%rcx,%rsp	# restore original rsp
+# magic_epilogue:
+#	ret
+# .size function,.-function
+#
+# The key is that up to magic_point copy of original rsp value remains
+# in chosen volatile register and no non-volatile register, except for
+# rsp, is modified. While past magic_point rsp remains constant till
+# the very end of the function. In this case custom language-specific
+# exception handler would look like this:
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+# {	ULONG64 *rsp = (ULONG64 *)context->Rax;
+#	ULONG64  rip = context->Rip;
+#
+#	if (rip >= magic_point)
+#	{   rsp = (ULONG64 *)context->Rsp;
+#	    if (rip < magic_epilogue)
+#	    {	rsp = (ULONG64 *)rsp[0];
+#		context->Rbp = rsp[-3];
+#		context->Rbx = rsp[-2];
+#		context->R15 = rsp[-1];
+#	    }
+#	}
+#	context->Rsp = (ULONG64)rsp;
+#	context->Rdi = rsp[1];
+#	context->Rsi = rsp[2];
+#
+#	memcpy (disp->ContextRecord,context,sizeof(CONTEXT));
+#	RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase,
+#		disp->ControlPc,disp->FunctionEntry,disp->ContextRecord,
+#		&disp->HandlerData,&disp->EstablisherFrame,NULL);
+#	return ExceptionContinueSearch;
+# }
+#
+# It's appropriate to implement this handler in assembler, directly in
+# function's module. In order to do that one has to know members'
+# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant
+# values. Here they are:
+#
+#	CONTEXT.Rax				120
+#	CONTEXT.Rcx				128
+#	CONTEXT.Rdx				136
+#	CONTEXT.Rbx				144
+#	CONTEXT.Rsp				152
+#	CONTEXT.Rbp				160
+#	CONTEXT.Rsi				168
+#	CONTEXT.Rdi				176
+#	CONTEXT.R8				184
+#	CONTEXT.R9				192
+#	CONTEXT.R10				200
+#	CONTEXT.R11				208
+#	CONTEXT.R12				216
+#	CONTEXT.R13				224
+#	CONTEXT.R14				232
+#	CONTEXT.R15				240
+#	CONTEXT.Rip				248
+#	CONTEXT.Xmm6				512
+#	sizeof(CONTEXT)				1232
+#	DISPATCHER_CONTEXT.ControlPc		0
+#	DISPATCHER_CONTEXT.ImageBase		8
+#	DISPATCHER_CONTEXT.FunctionEntry	16
+#	DISPATCHER_CONTEXT.EstablisherFrame	24
+#	DISPATCHER_CONTEXT.TargetIp		32
+#	DISPATCHER_CONTEXT.ContextRecord	40
+#	DISPATCHER_CONTEXT.LanguageHandler	48
+#	DISPATCHER_CONTEXT.HandlerData		56
+#	UNW_FLAG_NHANDLER			0
+#	ExceptionContinueSearch			1
+#
+# In order to tie the handler to the function one has to compose
+# couple of structures: one for .xdata segment and one for .pdata.
+#
+# UNWIND_INFO structure for .xdata segment would be
+#
+# function_unwind_info:
+#	.byte	9,0,0,0
+#	.rva	handler
+#
+# This structure designates exception handler for a function with
+# zero-length prologue, no stack frame or frame register.
+#
+# To facilitate composing of .pdata structures, auto-generated "gear"
+# prologue copies rsp value to rax and denotes next instruction with
+# .LSEH_begin_{function_name} label. This essentially defines the SEH
+# styling rule mentioned in the beginning. Position of this label is
+# chosen in such manner that possible exceptions raised in the "gear"
+# prologue would be accounted to caller and unwound from latter's frame.
+# End of function is marked with respective .LSEH_end_{function_name}
+# label. To summarize, .pdata segment would contain
+#
+#	.rva	.LSEH_begin_function
+#	.rva	.LSEH_end_function
+#	.rva	function_unwind_info
+#
+# Reference to function_unwind_info from .xdata segment is the anchor.
+# In case you wonder why references are 32-bit .rvas and not 64-bit
+# .quads: references put into these two segments are required to be
+# *relative* to the base address of the current binary module, a.k.a.
+# image base. No Win64 module, be it .exe or .dll, can be larger than
+# 2GB and thus such relative references can be and are accommodated in
+# 32 bits.
+#
+# Having reviewed the example function code, one can argue that "movq
+# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix
+# rax would contain an undefined value. If this "offends" you, use
+# another register and refrain from modifying rax till magic_point is
+# reached, i.e. as if it was a non-volatile register. If more registers
+# are required prior [variable] frame setup is completed, note that
+# nobody says that you can have only one "magic point." You can
+# "liberate" non-volatile registers by denoting last stack off-load
+# instruction and reflecting it in finer grade unwind logic in handler.
+# After all, isn't it why it's called *language-specific* handler...
+#
+# SE handlers are also involved in unwinding stack when executable is
+# profiled or debugged. Profiling implies additional limitations that
+# are too subtle to discuss here. For now it's sufficient to say that
+# in order to simplify handlers one should either a) offload original
+# %rsp to stack (like discussed above); or b) if you have a register to
+# spare for frame pointer, choose volatile one.
+#
+# (*)	Note that we're talking about run-time, not debug-time. Lack of
+#	unwind information makes debugging hard on both Windows and
+#	Unix. "Unlike" refers to the fact that on Unix a signal handler
+#	will always be invoked, a core dumped and an appropriate exit
+#	code returned to the parent (for user notification).
+#
+########################################################################
+# As of May 2020 an alternative approach that works with both exceptions
+# and debugging/profiling was implemented by re-purposing DWARF .cfi
+# annotations even for Win64 unwind tables' generation. Unfortunately,
+# but not really unexpectedly, it imposes additional limitations on
+# coding style. Probably most significant limitation is that frame
+# pointer has to be at 16*n distance from stack pointer at the exit
+# from prologue. But first things first. There are two additional
+# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue,
+# that need to be added to all functions marked with additional .type
+# tag (see example below). There are "do's and don'ts" for prologue
+# and epilogue. It shouldn't come as surprise that in prologue one may
+# not modify non-volatile registers, but one may not modify %r11 either.
+# This is because it's used as temporary frame pointer(*). There is one
+# exception to this rule, and it's setting up frame pointer that is
+# non-volatile or %r11. But it must be last instruction in the prologue.
+# Constraints for epilogue, or rather on its boundary, depend on whether
+# the frame is fixed- or variable-length. In a fixed-frame subroutine
+# the stack pointer has to be restored in the last instruction prior
+# to the .cfi_epilogue directive. If it's a variable-frame subroutine,
+# and a non-volatile register was used as frame pointer, then the last
+# instruction prior to the directive has to restore its original value.
+# This means that the final stack pointer adjustment would have to be
+# pushed past the
+# directive. Normally this would render the epilogue non-unwindable, so
+# special care has to be taken. To resolve the dilemma, copy frame
+# pointer to a volatile register in advance. To give an example:
+#
+# .type	rbp_as_frame_pointer,\@function,3,"unwind"  # mind extra tag!
+# rbp_as_frame_pointer:
+# .cfi_startproc
+#	push	%rbp
+# .cfi_push	%rbp
+#	push	%rbx
+# .cfi_push	%rbx
+# 	mov	%rsp,%rbp	# last instruction in prologue
+# .cfi_def_cfa_register	%rbp	# %rsp-%rbp has to be 16*n, e.g. 16*0
+# .cfi_end_prologue
+#	sub	\$40,%rsp
+#	and	\$-64,%rsp
+#	...
+#	mov	%rbp,%r11
+# .cfi_def_cfa_register	%r11	# copy frame pointer to volatile %r11
+#	mov	0(%rbp),%rbx
+#	mov	8(%rbp),%rbp	# last instruction prior epilogue
+# .cfi_epilogue			# may not change %r11 in epilogue
+#	lea	16(%r11),%rsp
+#	ret
+# .cfi_endproc
+# .size	rbp_as_frame_pointer,.-rbp_as_frame_pointer
+#
+# To give an example of fixed-frame subroutine for reference:
+#
+# .type	fixed_frame,\@function,3,"unwind"           # mind extra tag!
+# fixed_frame:
+# .cfi_startproc
+#	push	%rbp
+# .cfi_push	%rbp
+#	push	%rbx
+# .cfi_push	%rbx
+#	sub	\$40,%rsp
+# .cfi_adjust_cfa_offset 40
+# .cfi_end_prologue
+#	...
+#	mov	40(%rsp),%rbx
+#	mov	48(%rsp),%rbp
+#	lea	56(%rsp),%rsp
+# .cfi_adjust_cfa_offset -56
+# .cfi_epilogue
+#	ret
+# .cfi_endproc
+# .size	fixed_frame,.-fixed_frame
+#
+# As for epilogue itself, one can only work on non-volatile registers.
+# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi.
+#
+# On a final note, mixing old-style and modernized subroutines in the
+# same file takes some trickery. Subroutines of the new kind have to
+# appear after the old-style ones. This has everything to do with the
+# fact that entries in the .pdata segment have to appear in strictly
+# the same order as the corresponding subroutines, and auto-generated
+# RUNTIME_FUNCTION structures get mechanically appended to whatever
+# .pdata already exists.
+#
+# (*)	Just in case, why %r11 and not %rax. This has everything to do
+#	with the way UNWIND_INFO is laid out: a frame-register value of 0
+#	means "no frame pointer," so one just can't designate %rax as the
+#	frame pointer.
diff --git a/blst/assembly.S b/blst/assembly.S
new file mode 100644
index 0000000..a1a7c54
--- /dev/null
+++ b/blst/assembly.S
@@ -0,0 +1,123 @@
+#if defined(__x86_64) || defined(__x86_64__)
+# if defined(__ELF__)
+#  if defined(__BLST_PORTABLE__)
+#   include "elf/sha256-portable-x86_64.s"
+#  else
+#   include "elf/sha256-x86_64.s"
+#  endif
+#  if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+#   include "elf/ctx_inverse_mod_384-x86_64.s"
+#  else
+#   include "elf/ctq_inverse_mod_384-x86_64.s"
+#  endif
+#  include "elf/add_mod_384-x86_64.s"
+#  include "elf/add_mod_384x384-x86_64.s"
+#  define __add_mod_384     __add_mont_384
+#  define __sub_mod_384     __sub_mont_384
+#  define __sub_mod_384x384 __sub_mont_384x384
+#  if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+#   include "elf/mulx_mont_384-x86_64.s"
+#   include "elf/mulx_mont_256-x86_64.s"
+#  else
+#   include "elf/mulq_mont_384-x86_64.s"
+#   include "elf/mulq_mont_256-x86_64.s"
+#  endif
+#  include "elf/add_mod_256-x86_64.s"
+#  include "elf/ct_inverse_mod_256-x86_64.s"
+#  include "elf/div3w-x86_64.s"
+#  include "elf/ct_is_square_mod_384-x86_64.s"
+# elif defined(_WIN64) || defined(__CYGWIN__)
+#  if defined(__BLST_PORTABLE__)
+#   include "coff/sha256-portable-x86_64.s"
+#  else
+#   include "coff/sha256-x86_64.s"
+#  endif
+#  if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+#   include "coff/ctx_inverse_mod_384-x86_64.s"
+#  else
+#   include "coff/ctq_inverse_mod_384-x86_64.s"
+#  endif
+#  include "coff/add_mod_384-x86_64.s"
+#  include "coff/add_mod_384x384-x86_64.s"
+#  define __add_mod_384     __add_mont_384
+#  define __sub_mod_384     __sub_mont_384
+#  define __sub_mod_384x384 __sub_mont_384x384
+#  if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+#   include "coff/mulx_mont_384-x86_64.s"
+#   include "coff/mulx_mont_256-x86_64.s"
+#  else
+#   include "coff/mulq_mont_384-x86_64.s"
+#   include "coff/mulq_mont_256-x86_64.s"
+#  endif
+#  include "coff/add_mod_256-x86_64.s"
+#  include "coff/ct_inverse_mod_256-x86_64.s"
+#  include "coff/div3w-x86_64.s"
+#  include "coff/ct_is_square_mod_384-x86_64.s"
+# elif defined(__APPLE__)
+#  include "mach-o/sha256-x86_64.s"
+#  if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+#   include "mach-o/ctx_inverse_mod_384-x86_64.s"
+#  else
+#   include "mach-o/ctq_inverse_mod_384-x86_64.s"
+#  endif
+#  include "mach-o/add_mod_384-x86_64.s"
+#  include "mach-o/add_mod_384x384-x86_64.s"
+#  define __add_mod_384     __add_mont_384
+#  define __sub_mod_384     __sub_mont_384
+#  define __sub_mod_384x384 __sub_mont_384x384
+#  if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+#   include "mach-o/mulx_mont_384-x86_64.s"
+#   include "mach-o/mulx_mont_256-x86_64.s"
+#  else
+#   include "mach-o/mulq_mont_384-x86_64.s"
+#   include "mach-o/mulq_mont_256-x86_64.s"
+#  endif
+#  include "mach-o/add_mod_256-x86_64.s"
+#  include "mach-o/ct_inverse_mod_256-x86_64.s"
+#  include "mach-o/div3w-x86_64.s"
+#  include "mach-o/ct_is_square_mod_384-x86_64.s"
+# endif
+#elif defined(__aarch64__)
+# if defined(__ELF__)
+#  include "elf/sha256-armv8.S"
+#  include "elf/ct_inverse_mod_384-armv8.S"
+#  include "elf/add_mod_384-armv8.S"
+#  define __add_mod_384     __add_mont_384
+#  define __sub_mod_384     __sub_mont_384
+#  include "elf/mul_mont_384-armv8.S"
+#  include "elf/mul_mont_256-armv8.S"
+#  include "elf/add_mod_256-armv8.S"
+#  include "elf/ct_inverse_mod_256-armv8.S"
+#  include "elf/div3w-armv8.S"
+#  include "elf/ct_is_square_mod_384-armv8.S"
+# elif defined(_WIN64)
+#  include "coff/sha256-armv8.S"
+#  include "coff/ct_inverse_mod_384-armv8.S"
+#  include "coff/add_mod_384-armv8.S"
+#  define __add_mod_384     __add_mont_384
+#  define __sub_mod_384     __sub_mont_384
+#  include "coff/mul_mont_384-armv8.S"
+#  include "coff/mul_mont_256-armv8.S"
+#  include "coff/add_mod_256-armv8.S"
+#  include "coff/ct_inverse_mod_256-armv8.S"
+#  include "coff/div3w-armv8.S"
+#  include "coff/ct_is_square_mod_384-armv8.S"
+# elif defined(__APPLE__)
+#  include "mach-o/sha256-armv8.S"
+#  include "mach-o/ct_inverse_mod_384-armv8.S"
+#  include "mach-o/add_mod_384-armv8.S"
+#  define __add_mod_384     __add_mont_384
+#  define __sub_mod_384     __sub_mont_384
+#  include "mach-o/mul_mont_384-armv8.S"
+#  include "mach-o/mul_mont_256-armv8.S"
+#  include "mach-o/add_mod_256-armv8.S"
+#  include "mach-o/ct_inverse_mod_256-armv8.S"
+#  include "mach-o/div3w-armv8.S"
+#  include "mach-o/ct_is_square_mod_384-armv8.S"
+# endif
+#elif defined(__BLST_NO_ASM__) || \
+      (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4)
+/* inaccurate way to detect a 32-bit processor, but it's close enough */
+#else
+# error "unsupported platform"
+#endif
diff --git a/blst/blst.h b/blst/blst.h
new file mode 100644
index 0000000..aaee107
--- /dev/null
+++ b/blst/blst.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __BLST_H__
+#define __BLST_H__
+
+#ifdef __SIZE_TYPE__
+typedef __SIZE_TYPE__ size_t;
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \
+                            && defined(__UINT64_TYPE__)
+typedef __UINT8_TYPE__  uint8_t;
+typedef __UINT32_TYPE__ uint32_t;
+typedef __UINT64_TYPE__ uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#elif defined(__BLST_CGO__)
+typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901
+# define bool _Bool
+#else
+# define bool int
+#endif
+
+#ifdef SWIG
+# define DEFNULL =NULL
+#elif defined __cplusplus
+# define DEFNULL =0
+#else
+# define DEFNULL
+#endif
+
+typedef enum {
+    BLST_SUCCESS = 0,
+    BLST_BAD_ENCODING,
+    BLST_POINT_NOT_ON_CURVE,
+    BLST_POINT_NOT_IN_GROUP,
+    BLST_AGGR_TYPE_MISMATCH,
+    BLST_VERIFY_FAIL,
+    BLST_PK_IS_INFINITY,
+    BLST_BAD_SCALAR,
+} BLST_ERROR;
+
+typedef uint8_t byte;
+typedef uint64_t limb_t;
+
+typedef struct { byte b[256/8]; } blst_scalar;
+typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr;
+typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp;
+/* 0 is "real" part, 1 is "imaginary" */
+typedef struct { blst_fp fp[2]; } blst_fp2;
+typedef struct { blst_fp2 fp2[3]; } blst_fp6;
+typedef struct { blst_fp6 fp6[2]; } blst_fp12;
+
+void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]);
+void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a);
+void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]);
+void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a);
+void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]);
+void blst_bendian_from_scalar(byte out[32], const blst_scalar *a);
+void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]);
+void blst_lendian_from_scalar(byte out[32], const blst_scalar *a);
+bool blst_scalar_fr_check(const blst_scalar *a);
+bool blst_sk_check(const blst_scalar *a);
+bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a,
+                                           const blst_scalar *b);
+bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a,
+                                           const blst_scalar *b);
+bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a,
+                                           const blst_scalar *b);
+void blst_sk_inverse(blst_scalar *out, const blst_scalar *a);
+bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len);
+bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len);
+
+#ifndef SWIG
+/*
+ * BLS12-381-specific Fr operations.
+ */
+void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b);
+void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b);
+void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a);
+void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count);
+void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count);
+void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b);
+void blst_fr_sqr(blst_fr *ret, const blst_fr *a);
+void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag);
+void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a);
+void blst_fr_inverse(blst_fr *ret, const blst_fr *a);
+
+void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]);
+void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a);
+void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a);
+void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a);
+
+/*
+ * BLS12-381-specific Fp operations.
+ */
+void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b);
+void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b);
+void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a);
+void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a);
+void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count);
+void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b);
+void blst_fp_sqr(blst_fp *ret, const blst_fp *a);
+void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag);
+void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a);
+void blst_fp_inverse(blst_fp *ret, const blst_fp *a);
+bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a);
+
+void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]);
+void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a);
+void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]);
+void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a);
+void blst_fp_from_bendian(blst_fp *ret, const byte a[48]);
+void blst_bendian_from_fp(byte ret[48], const blst_fp *a);
+void blst_fp_from_lendian(blst_fp *ret, const byte a[48]);
+void blst_lendian_from_fp(byte ret[48], const blst_fp *a);
+
+/*
+ * BLS12-381-specific Fp2 operations.
+ */
+void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b);
+void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b);
+void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a);
+void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a);
+void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count);
+void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b);
+void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a);
+void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag);
+void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a);
+void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a);
+bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a);
+
+/*
+ * BLS12-381-specific Fp12 operations.
+ */
+void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a);
+void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a);
+void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b);
+void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a,
+                                             const blst_fp6 *xy00z0);
+void blst_fp12_conjugate(blst_fp12 *a);
+void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a);
+/* caveat lector! |n| has to be non-zero and not more than 3! */
+void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n);
+bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b);
+bool blst_fp12_is_one(const blst_fp12 *a);
+bool blst_fp12_in_group(const blst_fp12 *a);
+const blst_fp12 *blst_fp12_one();
+#endif  // SWIG
+
+/*
+ * BLS12-381-specific point operations.
+ */
+typedef struct { blst_fp x, y, z; } blst_p1;
+typedef struct { blst_fp x, y; } blst_p1_affine;
+
+void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b);
+void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b);
+void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a,
+                                      const blst_p1_affine *b);
+void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a,
+                                                const blst_p1_affine *b);
+void blst_p1_double(blst_p1 *out, const blst_p1 *a);
+void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar,
+                                                  size_t nbits);
+void blst_p1_cneg(blst_p1 *p, bool cbit);
+void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in);
+void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in);
+bool blst_p1_on_curve(const blst_p1 *p);
+bool blst_p1_in_g1(const blst_p1 *p);
+bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b);
+bool blst_p1_is_inf(const blst_p1 *a);
+const blst_p1 *blst_p1_generator();
+
+bool blst_p1_affine_on_curve(const blst_p1_affine *p);
+bool blst_p1_affine_in_g1(const blst_p1_affine *p);
+bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b);
+bool blst_p1_affine_is_inf(const blst_p1_affine *a);
+const blst_p1_affine *blst_p1_affine_generator();
+
+typedef struct { blst_fp2 x, y, z; } blst_p2;
+typedef struct { blst_fp2 x, y; } blst_p2_affine;
+
+void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b);
+void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b);
+void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a,
+                                      const blst_p2_affine *b);
+void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a,
+                                                const blst_p2_affine *b);
+void blst_p2_double(blst_p2 *out, const blst_p2 *a);
+void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar,
+                                                  size_t nbits);
+void blst_p2_cneg(blst_p2 *p, bool cbit);
+void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in);
+void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in);
+bool blst_p2_on_curve(const blst_p2 *p);
+bool blst_p2_in_g2(const blst_p2 *p);
+bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b);
+bool blst_p2_is_inf(const blst_p2 *a);
+const blst_p2 *blst_p2_generator();
+
+bool blst_p2_affine_on_curve(const blst_p2_affine *p);
+bool blst_p2_affine_in_g2(const blst_p2_affine *p);
+bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b);
+bool blst_p2_affine_is_inf(const blst_p2_affine *a);
+const blst_p2_affine *blst_p2_affine_generator();
+
+/*
+ * Multi-scalar multiplications and other multi-point operations.
+ */
+
+void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[],
+                        size_t npoints);
+void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[],
+                                size_t npoints);
+
+size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints);
+void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits,
+                                    const blst_p1_affine *const points[],
+                                    size_t npoints);
+size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints);
+void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[],
+                         size_t wbits, size_t npoints,
+                         const byte *const scalars[], size_t nbits,
+                         limb_t *scratch);
+
+size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints);
+void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[],
+                             size_t npoints, const byte *const scalars[],
+                             size_t nbits, limb_t *scratch);
+void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[],
+                             size_t npoints, const byte *const scalars[],
+                             size_t nbits, limb_t *scratch,
+                             size_t bit0, size_t window);
+
+void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[],
+                        size_t npoints);
+void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[],
+                                size_t npoints);
+
+size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints);
+void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits,
+                                    const blst_p2_affine *const points[],
+                                    size_t npoints);
+size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints);
+void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[],
+                         size_t wbits, size_t npoints,
+                         const byte *const scalars[], size_t nbits,
+                         limb_t *scratch);
+
+size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints);
+void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[],
+                             size_t npoints, const byte *const scalars[],
+                             size_t nbits, limb_t *scratch);
+void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[],
+                             size_t npoints, const byte *const scalars[],
+                             size_t nbits, limb_t *scratch,
+                             size_t bit0, size_t window);
+
+/*
+ * Hash-to-curve operations.
+ */
+#ifndef SWIG
+void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL);
+void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL);
+#endif
+
+void blst_encode_to_g1(blst_p1 *out,
+                       const byte *msg, size_t msg_len,
+                       const byte *DST DEFNULL, size_t DST_len DEFNULL,
+                       const byte *aug DEFNULL, size_t aug_len DEFNULL);
+void blst_hash_to_g1(blst_p1 *out,
+                     const byte *msg, size_t msg_len,
+                     const byte *DST DEFNULL, size_t DST_len DEFNULL,
+                     const byte *aug DEFNULL, size_t aug_len DEFNULL);
+
+void blst_encode_to_g2(blst_p2 *out,
+                       const byte *msg, size_t msg_len,
+                       const byte *DST DEFNULL, size_t DST_len DEFNULL,
+                       const byte *aug DEFNULL, size_t aug_len DEFNULL);
+void blst_hash_to_g2(blst_p2 *out,
+                     const byte *msg, size_t msg_len,
+                     const byte *DST DEFNULL, size_t DST_len DEFNULL,
+                     const byte *aug DEFNULL, size_t aug_len DEFNULL);
+
+/*
+ * Zcash-compatible serialization/deserialization.
+ */
+void blst_p1_serialize(byte out[96], const blst_p1 *in);
+void blst_p1_compress(byte out[48], const blst_p1 *in);
+void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in);
+void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in);
+BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]);
+BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]);
+
+void blst_p2_serialize(byte out[192], const blst_p2 *in);
+void blst_p2_compress(byte out[96], const blst_p2 *in);
+void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in);
+void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in);
+BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]);
+BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]);
+
+/*
+ * Specification defines two variants, 'minimal-signature-size' and
+ * 'minimal-pubkey-size'. To unify appearance we choose to distinguish
+ * them by suffix referring to the public key type, more specifically
+ * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to
+ * 'minimal-signature-size'. It might appear a bit counterintuitive
+ * in sign call, but no matter how you twist it, something is bound to
+ * turn a little odd.
+ */
+/*
+ * Secret-key operations.
+ */
+void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len,
+                 const byte *info DEFNULL, size_t info_len DEFNULL);
+void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK);
+void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash,
+                                          const blst_scalar *SK);
+void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK);
+void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash,
+                                          const blst_scalar *SK);
+
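Given the naming convention spelled out above, the minimal-signature-size flow chains these declarations roughly as follows. This is a hedged sketch only, not part of the header: the helper name and the source of the IKM are mine, and error handling is omitted.

    #include "blst/blst.h"

    /* minimal-signature-size: public key in G2, message hash and signature in G1 */
    void sketch_keygen_and_sign(byte pk_comp[96], byte sig_comp[48],
                                const byte ikm[32],
                                const byte *msg, size_t msg_len,
                                const byte *dst, size_t dst_len)
    {
        blst_scalar sk;
        blst_p2 pk;
        blst_p1 hash, sig;

        blst_keygen(&sk, ikm, 32, NULL, 0);     /* derive the secret key   */
        blst_sk_to_pk_in_g2(&pk, &sk);          /* public key lives in G2  */
        blst_p2_compress(pk_comp, &pk);

        blst_hash_to_g1(&hash, msg, msg_len, dst, dst_len, NULL, 0);
        blst_sign_pk_in_g2(&sig, &hash, &sk);   /* signature lives in G1   */
        blst_p1_compress(sig_comp, &sig);
    }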
+/*
+ * Pairing interface.
+ */
+#ifndef SWIG
+void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q,
+                                      const blst_p1_affine *P);
+void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f);
+void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q);
+void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68],
+                                            const blst_p1_affine *P);
+bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2);
+#endif
+
+#ifdef __BLST_CGO__
+typedef limb_t blst_pairing;
+#elif defined(__BLST_RUST_BINDGEN__)
+typedef struct {} blst_pairing;
+#else
+typedef struct blst_opaque blst_pairing;
+#endif
+
+size_t blst_pairing_sizeof();
+void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode,
+                       const byte *DST DEFNULL, size_t DST_len DEFNULL);
+const byte *blst_pairing_get_dst(const blst_pairing *ctx);
+void blst_pairing_commit(blst_pairing *ctx);
+BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx,
+                                           const blst_p2_affine *PK,
+                                           const blst_p1_affine *signature,
+                                           const byte *msg, size_t msg_len,
+                                           const byte *aug DEFNULL,
+                                           size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx,
+                                            const blst_p2_affine *PK,
+                                            bool pk_grpchk,
+                                            const blst_p1_affine *signature,
+                                            bool sig_grpchk,
+                                            const byte *msg, size_t msg_len,
+                                            const byte *aug DEFNULL,
+                                            size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx,
+                                                 const blst_p2_affine *PK,
+                                                 const blst_p1_affine *sig,
+                                                 const byte *scalar,
+                                                 size_t nbits,
+                                                 const byte *msg,
+                                                 size_t msg_len,
+                                                 const byte *aug DEFNULL,
+                                                 size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx,
+                                                  const blst_p2_affine *PK,
+                                                  bool pk_grpchk,
+                                                  const blst_p1_affine *sig,
+                                                  bool sig_grpchk,
+                                                  const byte *scalar,
+                                                  size_t nbits,
+                                                  const byte *msg,
+                                                  size_t msg_len,
+                                                  const byte *aug DEFNULL,
+                                                  size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx,
+                                           const blst_p1_affine *PK,
+                                           const blst_p2_affine *signature,
+                                           const byte *msg, size_t msg_len,
+                                           const byte *aug DEFNULL,
+                                           size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx,
+                                            const blst_p1_affine *PK,
+                                            bool pk_grpchk,
+                                            const blst_p2_affine *signature,
+                                            bool sig_grpchk,
+                                            const byte *msg, size_t msg_len,
+                                            const byte *aug DEFNULL,
+                                            size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx,
+                                                 const blst_p1_affine *PK,
+                                                 const blst_p2_affine *sig,
+                                                 const byte *scalar,
+                                                 size_t nbits,
+                                                 const byte *msg,
+                                                 size_t msg_len,
+                                                 const byte *aug DEFNULL,
+                                                 size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx,
+                                                  const blst_p1_affine *PK,
+                                                  bool pk_grpchk,
+                                                  const blst_p2_affine *sig,
+                                                  bool sig_grpchk,
+                                                  const byte *scalar,
+                                                  size_t nbits,
+                                                  const byte *msg,
+                                                  size_t msg_len,
+                                                  const byte *aug DEFNULL,
+                                                  size_t aug_len DEFNULL);
+BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1);
+bool blst_pairing_finalverify(const blst_pairing *ctx,
+                              const blst_fp12 *gtsig DEFNULL);
+
+
+/*
+ * Customarily, applications aggregate signatures separately. In that
+ * case the application passes NULL for |signature| to the
+ * blst_pairing_aggregate calls, collects the aggregate with the
+ * blst_aggregate_in_g1/g2 calls below, and feeds it (via
+ * blst_aggregated_in_g1/g2) to blst_pairing_finalverify. Inputs to the
+ * aggregation calls are Zcash-compatible "straight-from-wire" byte
+ * vectors, compressed or not.
+ */
+BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in,
+                                              const byte *zwire);
+BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in,
+                                              const byte *zwire);
+
+void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature);
+void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature);
+
+/*
+ * "One-shot" CoreVerify entry points.
+ */
+BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk,
+                                     const blst_p2_affine *signature,
+                                     bool hash_or_encode,
+                                     const byte *msg, size_t msg_len,
+                                     const byte *DST DEFNULL,
+                                     size_t DST_len DEFNULL,
+                                     const byte *aug DEFNULL,
+                                     size_t aug_len DEFNULL);
+BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk,
+                                     const blst_p1_affine *signature,
+                                     bool hash_or_encode,
+                                     const byte *msg, size_t msg_len,
+                                     const byte *DST DEFNULL,
+                                     size_t DST_len DEFNULL,
+                                     const byte *aug DEFNULL,
+                                     size_t aug_len DEFNULL);
+
+extern const blst_p1_affine BLS12_381_G1;
+extern const blst_p1_affine BLS12_381_NEG_G1;
+extern const blst_p2_affine BLS12_381_G2;
+extern const blst_p2_affine BLS12_381_NEG_G2;
+
+#include "blst_aux.h"
+
+#ifdef __cplusplus
+}
+#endif
+#endif
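The pairing-context declarations above compose into aggregate verification roughly as below. This is an illustrative sketch rather than part of the header: the function name is mine, allocation and error handling are minimal, and it follows one common (here unverified) pattern in which the aggregate signature is fed to the first aggregate call and NULL thereafter.

    #include <stdlib.h>
    #include "blst/blst.h"

    /* Verify n (pk, msg) pairs in the minimal-signature-size setting
     * against a single aggregate signature in G1. */
    int sketch_aggregate_verify(const blst_p2_affine *const pks[],
                                const byte *const msgs[],
                                const size_t msg_lens[], size_t n,
                                const blst_p1_affine *agg_sig,
                                const byte *dst, size_t dst_len)
    {
        blst_pairing *ctx = malloc(blst_pairing_sizeof());
        int ok = 0;

        blst_pairing_init(ctx, 1 /* hash_or_encode: hash */, dst, dst_len);
        for (size_t i = 0; i < n; i++)
            if (blst_pairing_aggregate_pk_in_g2(ctx, pks[i],
                                                i == 0 ? agg_sig : NULL,
                                                msgs[i], msg_lens[i],
                                                NULL, 0) != BLST_SUCCESS)
                goto done;
        blst_pairing_commit(ctx);
        ok = (int)blst_pairing_finalverify(ctx, NULL);
    done:
        free(ctx);
        return ok;
    }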
diff --git a/blst/blst_aux.h b/blst/blst_aux.h
new file mode 100644
index 0000000..41c2901
--- /dev/null
+++ b/blst/blst_aux.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLST_AUX_H__
+#define __BLST_AUX_H__
+/*
+ * This file lists interfaces that might be promoted to blst.h or removed,
+ * depending on their proven/unproven worthiness.
+ */
+
+void blst_fr_to(blst_fr *ret, const blst_fr *a);
+void blst_fr_from(blst_fr *ret, const blst_fr *a);
+
+void blst_fp_to(blst_fp *ret, const blst_fp *a);
+void blst_fp_from(blst_fp *ret, const blst_fp *a);
+
+bool blst_fp_is_square(const blst_fp *a);
+bool blst_fp2_is_square(const blst_fp2 *a);
+
+void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in);
+void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in);
+
+/*
+ * The functions below produce both the point and the serialized outcome
+ * of SkToPk and Sign. However, the serialized outputs are pre-decorated
+ * with sign and infinity bits. This means you have to bring the output
+ * into compliance before returning it to the application. If you want
+ * the compressed point value, then do [the equivalent of]
+ *
+ *  byte temp[96];
+ *  blst_sk_to_pk2_in_g1(temp, out_pk, SK);
+ *  temp[0] |= 0x80;
+ *  memcpy(out, temp, 48);
+ *
+ * Otherwise do
+ *
+ *  blst_sk_to_pk2_in_g1(out, out_pk, SK);
+ *  out[0] &= ~0x20;
+ *
+ * Either |out| or |out_<point>| can be NULL.
+ */
+void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk,
+                          const blst_scalar *SK);
+void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig,
+                         const blst_p2 *hash, const blst_scalar *SK);
+void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk,
+                          const blst_scalar *SK);
+void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig,
+                         const blst_p1 *hash, const blst_scalar *SK);
+
+typedef struct {} blst_uniq;
+
+size_t blst_uniq_sizeof(size_t n_nodes);
+void blst_uniq_init(blst_uniq *tree);
+bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len);
+
+#ifdef expand_message_xmd
+void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes,
+                        const unsigned char *aug, size_t aug_len,
+                        const unsigned char *msg, size_t msg_len,
+                        const unsigned char *DST, size_t DST_len);
+#else
+void blst_expand_message_xmd(byte *out, size_t out_len,
+                             const byte *msg, size_t msg_len,
+                             const byte *DST, size_t DST_len);
+#endif
+
+void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar,
+                                                            size_t nbits);
+void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar,
+                                                            size_t nbits);
+
+void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q,
+                                                   const blst_p1_affine *p);
+blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx);
+
+#endif
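Completing the pre-decoration comment above into something compilable, here is a hedged sketch of producing a compliant compressed public key with blst_sk_to_pk2_in_g1 (the helper name is mine, not a library API):

    #include <string.h>
    #include "blst/blst.h"

    /* Compressed public key from SK, bringing the pre-decorated
     * serialization into compliance as the comment above describes. */
    void sketch_compressed_pk_in_g1(byte out[48], const blst_scalar *SK)
    {
        byte temp[96];
        blst_p1_affine pk;

        blst_sk_to_pk2_in_g1(temp, &pk, SK);
        temp[0] |= 0x80;            /* set the compressed-form bit */
        memcpy(out, temp, 48);      /* keep only the X coordinate  */
    }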
diff --git a/blst/bulk_addition.c b/blst/bulk_addition.c
new file mode 100644
index 0000000..81afc53
--- /dev/null
+++ b/blst/bulk_addition.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "fields.h"
+#include "point.h"
+
+/*
+ * This implementation uses explicit addition formula:
+ *
+ * λ = (Y₂-Y₁)/(X₂-X₁)
+ * X₃ = λ²-(X₁+X₂)
+ * Y₃ = λ⋅(X₁-X₃)-Y₁
+ *
+ * But since we don't know in advance whether we'll have to add a point
+ * to itself, we eventually need to resort to the corresponding doubling
+ * formula:
+ *
+ * λ = 3X₁²/2Y₁
+ * X₃ = λ²-2X₁
+ * Y₃ = λ⋅(X₁-X₃)-Y₁
+ *
+ * The formulae use prohibitively expensive inversion, but whenever we
+ * have a lot of affine points to accumulate, we can amortize the cost
+ * by applying Montgomery's batch inversion approach. As a result,
+ * asymptotic[!] per-point cost for addition is as small as 5M+1S. For
+ * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things
+ * considered, the improvement coefficient varies from 60% to 85%
+ * depending on platform and curve.
+ *
+ * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an
+ * application that requires constant time-ness, speak up!]
+ */
+
+/*
+ * Calculate λ's numerator and denominator.
+ *
+ * input:	A	x1	y1	-
+ *		B	x2	y2	-
+ * output:
+ * if A!=B:	A	x1	y1	(x2-x1)*mul_acc
+ *		B	x2+x1	y2-y1	(x2-x1)
+ *
+ * if A==B:	A	x	y	2y*mul_acc
+ *		B	2x	3*x^2	2y
+ *
+ * if A==-B:	A	0	0	1*mul_acc
+ *		B	0	3*x^2	0
+ */
+#define HEAD(ptype, bits, field, one) \
+static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \
+{ \
+    ptype *A = AB, *B = AB+1; \
+    limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \
+                 vec_is_zero(B, sizeof(ptype##_affine));  \
+    static const vec##bits zero = { 0 }; \
+\
+    sub_##field(B->Z, B->X, A->X);		/* X2-X1  */ \
+    add_##field(B->X, B->X, A->X);		/* X2+X1  */ \
+    add_##field(A->Z, B->Y, A->Y);		/* Y2+Y1  */ \
+    sub_##field(B->Y, B->Y, A->Y);		/* Y2-Y1  */ \
+    if (vec_is_zero(B->Z, sizeof(B->Z))) {	/* X2==X1 */ \
+        inf = vec_is_zero(A->Z, sizeof(A->Z));	\
+        vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \
+        sqr_##field(B->Y, A->X);		\
+        mul_by_3_##field(B->Y, B->Y);		/* 3*X1^2 */ \
+        vec_copy(B->Z, A->Z, sizeof(B->Z));	/* 2*Y1   */ \
+    }						/* B->Y is numerator    */ \
+						/* B->Z is denominator  */ \
+    vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \
+    vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \
+    vec_select(A->Z, one,  B->Z, sizeof(A->Z), inf); \
+    vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \
+    if (mul_acc != NULL) \
+        mul_##field(A->Z, A->Z, mul_acc);	/* chain multiplication */\
+}
+
+/*
+ * Calculate λ and resulting coordinates.
+ *
+ * input:	A		x1			y1		-
+ *		B		x2+x1			numerator	-
+ * 		lambda		1/denominator
+ * output:	D		x3=(nom/den)^2-(x2+x1)	y3=(nom/den)(x1-x3)-y1
+ */
+#define TAIL(ptype, bits, field, one) \
+static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \
+{ \
+    ptype *A = AB, *B = AB+1; \
+    vec##bits llambda; \
+    limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \
+\
+    mul_##field(lambda, lambda, B->Y);		/* λ = (Y2-Y1)/(X2-X1)  */ \
+						/* alt. 3*X1^2/2*Y1     */ \
+    sqr_##field(llambda, lambda); \
+    sub_##field(D->X, llambda, B->X);		/* X3 = λ^2-X1-X2       */ \
+\
+    sub_##field(D->Y, A->X, D->X);   \
+    mul_##field(D->Y, D->Y, lambda); \
+    sub_##field(D->Y, D->Y, A->Y);		/* Y3 = λ*(X1-X3)-Y1    */ \
+\
+    vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \
+    vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \
+}
+
+/*
+ * |points[]| is a volatile buffer with |X|s and |Y|s initially holding
+ * the input affine coordinates, and with |Z|s being used as additional
+ * temporary storage [unrelated to Jacobian coordinates]. |sum| is
+ * in-/output; initialize it to infinity accordingly.
+ */
+#define ADDITION_BTREE(prefix, ptype, bits, field, one) \
+HEAD(ptype, bits, field, one) \
+TAIL(ptype, bits, field, one) \
+static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \
+{ \
+    ptype *dst; \
+    void *mul_acc; \
+    size_t i; \
+\
+    while (n >= 16) { \
+        if (n & 1) \
+            ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \
+        n /= 2; \
+        for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \
+            ptype##_head(points, mul_acc); \
+\
+        reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \
+\
+        for (dst = points, i = n; --i;) { \
+            dst--; points -= 2; \
+            mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \
+            ptype##_tail(dst, points, points[-2].Z); \
+            mul_##field(points[-2].Z, points[0].Z, points[1].Z); \
+        } \
+        dst--; points -= 2; \
+        ptype##_tail(dst, points, points[0].Z); \
+        points = dst; \
+    } \
+    while (n--) \
+        ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \
+} \
+\
+void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \
+                               size_t npoints) \
+{ \
+    /* Performance with 288K scratch is within 1-2-3% from optimal */ \
+    const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \
+    ptype *scratch = alloca((npoints > stride ? stride : npoints) * \
+                            sizeof(ptype)); \
+    const ptype##_affine *point = NULL; \
+\
+    vec_zero(sum, sizeof(*sum)); \
+    while (npoints) { \
+        size_t i, j = npoints > stride ? stride : npoints; \
+        for (i=0; i<j; i++) { \
+            point = *points ? *points++ : point+1; \
+            vec_copy(&scratch[i], point, sizeof(*point)); \
+        } \
+        ptype##s_accumulate(sum, scratch, j); \
+        npoints -= j; \
+    } \
+}
+
+ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2)
+
+ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
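The Montgomery batch-inversion idea referred to in the header comment of this file is easy to demonstrate outside the library. A self-contained sketch over a toy 61-bit prime (entirely unrelated to BLS12-381, purely illustrative, and assuming a compiler with unsigned __int128 such as GCC or Clang):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define TOY_P 2305843009213693951ULL            /* 2^61 - 1, a prime */

    static uint64_t mulmod(uint64_t a, uint64_t b)
    {   return (uint64_t)(((unsigned __int128)a * b) % TOY_P);   }

    static uint64_t powmod(uint64_t a, uint64_t e)
    {
        uint64_t r = 1;
        for (; e; e >>= 1, a = mulmod(a, a))
            if (e & 1) r = mulmod(r, a);
        return r;
    }

    /* n inverses for the price of one inversion plus 3(n-1) multiplications */
    static void batch_inverse(uint64_t inv[], const uint64_t a[], size_t n)
    {
        uint64_t acc = 1;
        for (size_t i = 0; i < n; i++) {
            inv[i] = acc;                   /* prefix product a[0]*...*a[i-1]     */
            acc = mulmod(acc, a[i]);
        }
        acc = powmod(acc, TOY_P - 2);       /* the single inversion (Fermat)      */
        for (size_t i = n; i--;) {
            inv[i] = mulmod(inv[i], acc);   /* = 1/a[i]                           */
            acc = mulmod(acc, a[i]);        /* drop a[i] from the running product */
        }
    }

    int main(void)
    {
        uint64_t a[4] = { 2, 3, 12345, 678910 }, inv[4];
        batch_inverse(inv, a, 4);
        for (int i = 0; i < 4; i++)
            assert(mulmod(a[i], inv[i]) == 1);
        return 0;
    }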
diff --git a/blst/client_min_pk.c b/blst/client_min_pk.c
new file mode 100644
index 0000000..0fcf563
--- /dev/null
+++ b/blst/client_min_pk.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "keygen.c"
+#include "e2.c"
+#include "hash_to_field.c"
+#include "map_to_g2.c"
+#include "e1.c"
+#include "exp.c"
+#include "sqrt.c"
+#include "recip.c"
+#include "consts.c"
+#include "vect.c"
+#include "exports.c"
diff --git a/blst/client_min_sig.c b/blst/client_min_sig.c
new file mode 100644
index 0000000..8e4663d
--- /dev/null
+++ b/blst/client_min_sig.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "keygen.c"
+#include "e1.c"
+#include "hash_to_field.c"
+#include "map_to_g1.c"
+#include "e2.c"
+#include "exp.c"
+#include "sqrt.c"
+#include "recip.c"
+#include "consts.c"
+#include "vect.c"
+#include "exports.c"
diff --git a/blst/consts.c b/blst/consts.c
new file mode 100644
index 0000000..021c878
--- /dev/null
+++ b/blst/consts.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "consts.h"
+
+/* z = -0xd201000000010000 */
+const vec384 BLS12_381_P = {    /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */
+    TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff),
+    TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf),
+    TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a)
+};
+const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd;  /* -1/P */
+
+const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */
+  { { ONE_MONT_P },
+    { 0 } }
+};
+
+const vec384 BLS12_381_RR = {   /* (1<<768)%P, "radix"^2, to-Montgomery */
+    TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1),
+    TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0),
+    TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa)
+};
+
+const vec256 BLS12_381_r = {    /* z^4 - z^2 + 1, group order */
+    TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe),
+    TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48)
+};
+
+const vec256 BLS12_381_rRR = {  /* (1<<512)%r, "radix"^2, to-Montgomery */
+    TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23),
+    TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11)
+};
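As a sanity check, the -1/P constant above can be re-derived from the lowest limb of BLS12_381_P alone, since the inverse of an odd number modulo 2^64 depends only on that limb. A hedged sketch assuming 64-bit limbs; the printed value should match BLS12_381_p0:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t p_lo = 0xb9feffffffffaaabULL;    /* lowest limb of P */
        uint64_t inv = p_lo;        /* a*a == 1 (mod 8), so ~3 bits correct */

        /* each Newton step doubles the number of correct low bits,
         * so five steps comfortably cover 64 bits */
        for (int i = 0; i < 5; i++)
            inv *= 2 - p_lo * inv;

        printf("-1/P mod 2^64 = 0x%016" PRIx64 "\n", (uint64_t)(0 - inv));
        return 0;
    }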
diff --git a/blst/consts.h b/blst/consts.h
new file mode 100644
index 0000000..cb391b8
--- /dev/null
+++ b/blst/consts.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_CONST_H__
+#define __BLS12_381_ASM_CONST_H__
+#include "vect.h"
+
+extern const vec384 BLS12_381_P;
+extern const limb_t BLS12_381_p0;
+static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd;  /* -1/P */
+typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384;
+extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */
+extern const vec384 BLS12_381_RR;   /* (1<<768)%P, "radix"^2, to-Montgomery   */
+
+#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \
+                   TO_LIMB_T(0xebf4000bc40c0002), \
+                   TO_LIMB_T(0x5f48985753c758ba), \
+                   TO_LIMB_T(0x77ce585370525745), \
+                   TO_LIMB_T(0x5c071a97a256ec6d), \
+                   TO_LIMB_T(0x15f65ec3fa80e493)
+
+#define ZERO_384 (BLS12_381_Rx.p2[1])
+
+extern const vec256 BLS12_381_r;    /* order */
+static const limb_t r0 = (limb_t)0xfffffffeffffffff;  /* -1/r */
+extern const vec256 BLS12_381_rRR;  /* (1<<512)%r, "radix"^2, to-Montgomery   */
+
+#endif
diff --git a/blst/e1.c b/blst/e1.c
new file mode 100644
index 0000000..47fca14
--- /dev/null
+++ b/blst/e1.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "point.h"
+#include "fields.h"
+#include "errors.h"
+
+/*
+ * y^2 = x^3 + B
+ */
+static const vec384 B_E1 = {        /* (4 << 384) % P */
+    TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a),
+    TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7),
+    TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e)
+};
+
+const POINTonE1 BLS12_381_G1 = {    /* generator point [in Montgomery] */
+  /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905
+   *    a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */
+  { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5),
+    TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747),
+    TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) },
+  /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6
+   *    00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */
+  { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e),
+    TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194),
+    TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) },
+  { ONE_MONT_P }
+};
+
+const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */
+  /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905
+   *    a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */
+  { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5),
+    TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747),
+    TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) },
+  /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9
+   *    6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */
+  { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270),
+    TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a),
+    TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) },
+  { ONE_MONT_P }
+};
+
+static inline void mul_by_b_onE1(vec384 out, const vec384 in)
+{   lshift_fp(out, in, 2);   }
+
+static inline void mul_by_4b_onE1(vec384 out, const vec384 in)
+{   lshift_fp(out, in, 4);   }
+
+static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit)
+{   cneg_fp(p->Y, p->Y, cbit);   }
+
+void blst_p1_cneg(POINTonE1 *a, int cbit)
+{   POINTonE1_cneg(a, is_zero(cbit) ^ 1);   }
+
+static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in)
+{
+    vec384 Z, ZZ;
+    limb_t inf = vec_is_zero(in->Z, sizeof(in->Z));
+
+    reciprocal_fp(Z, in->Z);                            /* 1/Z   */
+
+    sqr_fp(ZZ, Z);
+    mul_fp(out->X, in->X, ZZ);                          /* X = X/Z^2 */
+
+    mul_fp(ZZ, ZZ, Z);
+    mul_fp(out->Y, in->Y, ZZ);                          /* Y = Y/Z^3 */
+
+    vec_select(out->Z, in->Z, BLS12_381_G1.Z,
+                       sizeof(BLS12_381_G1.Z), inf);    /* Z = inf ? 0 : 1 */
+}
+
+void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a)
+{   POINTonE1_from_Jacobian(out, a);   }
+
+static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in)
+{
+    POINTonE1 p;
+
+    if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) {
+        POINTonE1_from_Jacobian(&p, in);
+        in = &p;
+    }
+    vec_copy(out, in, sizeof(*out));
+}
+
+void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a)
+{   POINTonE1_to_affine(out, a);   }
+
+void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a)
+{
+    vec_copy(out, a, sizeof(*a));
+    vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z),
+                       vec_is_zero(a, sizeof(*a)));
+}
+
+static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p)
+{
+    vec384 XXX, YY;
+
+    sqr_fp(XXX, p->X);
+    mul_fp(XXX, XXX, p->X);                             /* X^3 */
+    add_fp(XXX, XXX, B_E1);                             /* X^3 + B */
+
+    sqr_fp(YY, p->Y);                                   /* Y^2 */
+
+    return vec_is_equal(XXX, YY, sizeof(XXX));
+}
+
+int blst_p1_affine_on_curve(const POINTonE1_affine *p)
+{   return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p)));   }
+
+static bool_t POINTonE1_on_curve(const POINTonE1 *p)
+{
+    vec384 XXX, YY, BZ6;
+    limb_t inf = vec_is_zero(p->Z, sizeof(p->Z));
+
+    sqr_fp(BZ6, p->Z);
+    mul_fp(BZ6, BZ6, p->Z);
+    sqr_fp(BZ6, BZ6);                                   /* Z^6 */
+    mul_by_b_onE1(BZ6, BZ6);                            /* B*Z^6 */
+
+    sqr_fp(XXX, p->X);
+    mul_fp(XXX, XXX, p->X);                             /* X^3 */
+    add_fp(XXX, XXX, BZ6);                              /* X^3 + B*Z^6 */
+
+    sqr_fp(YY, p->Y);                                   /* Y^2 */
+
+    return vec_is_equal(XXX, YY, sizeof(XXX)) | inf;
+}
+
+int blst_p1_on_curve(const POINTonE1 *p)
+{   return (int)POINTonE1_on_curve(p);   }
+
+static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96],
+                                            const POINTonE1_affine *in)
+{
+    vec384 temp;
+
+    from_fp(temp, in->X);
+    be_bytes_from_limbs(out, temp, sizeof(temp));
+
+    from_fp(temp, in->Y);
+    be_bytes_from_limbs(out + 48, temp, sizeof(temp));
+
+    return sgn0_pty_mod_384(temp, BLS12_381_P);
+}
+
+void blst_p1_affine_serialize(unsigned char out[96],
+                              const POINTonE1_affine *in)
+{
+    if (vec_is_zero(in->X, 2*sizeof(in->X))) {
+        bytes_zero(out, 96);
+        out[0] = 0x40;    /* infinity bit */
+    } else {
+        (void)POINTonE1_affine_Serialize_BE(out, in);
+    }
+}
+
+static limb_t POINTonE1_Serialize_BE(unsigned char out[96],
+                                     const POINTonE1 *in)
+{
+    POINTonE1 p;
+
+    if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) {
+        POINTonE1_from_Jacobian(&p, in);
+        in = &p;
+    }
+
+    return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in);
+}
+
+static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in)
+{
+    if (vec_is_zero(in->Z, sizeof(in->Z))) {
+        bytes_zero(out, 96);
+        out[0] = 0x40;    /* infinity bit */
+    } else {
+        (void)POINTonE1_Serialize_BE(out, in);
+    }
+}
+
+void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in)
+{   POINTonE1_Serialize(out, in);   }
+
+static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48],
+                                           const POINTonE1_affine *in)
+{
+    vec384 temp;
+
+    from_fp(temp, in->X);
+    be_bytes_from_limbs(out, temp, sizeof(temp));
+
+    return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0);
+}
+
+void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in)
+{
+    if (vec_is_zero(in->X, 2*sizeof(in->X))) {
+        bytes_zero(out, 48);
+        out[0] = 0xc0;    /* compressed and infinity bits */
+    } else {
+        limb_t sign = POINTonE1_affine_Compress_BE(out, in);
+        out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4));
+    }
+}
+
+static limb_t POINTonE1_Compress_BE(unsigned char out[48],
+                                    const POINTonE1 *in)
+{
+    POINTonE1 p;
+
+    if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) {
+        POINTonE1_from_Jacobian(&p, in);
+        in = &p;
+    }
+
+    return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in);
+}
+
+void blst_p1_compress(unsigned char out[48], const POINTonE1 *in)
+{
+    if (vec_is_zero(in->Z, sizeof(in->Z))) {
+        bytes_zero(out, 48);
+        out[0] = 0xc0;    /* compressed and infinity bits */
+    } else {
+        limb_t sign = POINTonE1_Compress_BE(out, in);
+        out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4));
+    }
+}
+
+static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out,
+                                      const unsigned char in[48])
+{
+    POINTonE1_affine ret;
+    vec384 temp;
+
+    limbs_from_be_bytes(ret.X, in, sizeof(ret.X));
+    /* clear top 3 bits in case caller was conveying some information there */
+    ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3;
+    add_fp(temp, ret.X, ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.X, sizeof(temp)))
+        return (limb_t)0 - BLST_BAD_ENCODING;
+    mul_fp(ret.X, ret.X, BLS12_381_RR);
+
+    sqr_fp(ret.Y, ret.X);
+    mul_fp(ret.Y, ret.Y, ret.X);
+    add_fp(ret.Y, ret.Y, B_E1);                         /* X^3 + B */
+    if (!sqrt_fp(ret.Y, ret.Y))
+        return (limb_t)0 - BLST_POINT_NOT_ON_CURVE;
+
+    vec_copy(out, &ret, sizeof(ret));
+
+    return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0);
+}
+
+static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out,
+                                         const unsigned char in[48])
+{
+    unsigned char in0 = in[0];
+    limb_t sgn0_pty;
+
+    if ((in0 & 0x80) == 0)      /* compressed bit */
+        return BLST_BAD_ENCODING;
+
+    if (in0 & 0x40) {           /* infinity bit */
+        if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) {
+            vec_zero(out, sizeof(*out));
+            return BLST_SUCCESS;
+        } else {
+            return BLST_BAD_ENCODING;
+        }
+    }
+
+    sgn0_pty = POINTonE1_Uncompress_BE(out, in);
+
+    if (sgn0_pty > 3)
+        return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */
+
+    sgn0_pty >>= 1; /* skip over parity bit */
+    sgn0_pty ^= (in0 & 0x20) >> 5;
+    cneg_fp(out->Y, out->Y, sgn0_pty);
+
+    /* (0,±2) is not in the group, but the application might want to ignore? */
+    return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP
+                                               : BLST_SUCCESS;
+}
+
+BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48])
+{   return POINTonE1_Uncompress_Z(out, in);   }
+
+static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out,
+                                           const unsigned char in[96])
+{
+    POINTonE1_affine ret;
+    vec384 temp;
+
+    limbs_from_be_bytes(ret.X, in, sizeof(ret.X));
+    limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y));
+
+    /* clear top 3 bits in case caller was conveying some information there */
+    ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3;
+    add_fp(temp, ret.X, ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.X, sizeof(temp)))
+        return BLST_BAD_ENCODING;
+
+    add_fp(temp, ret.Y, ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.Y, sizeof(temp)))
+        return BLST_BAD_ENCODING;
+
+    mul_fp(ret.X, ret.X, BLS12_381_RR);
+    mul_fp(ret.Y, ret.Y, BLS12_381_RR);
+
+    if (!POINTonE1_affine_on_curve(&ret))
+        return BLST_POINT_NOT_ON_CURVE;
+
+    vec_copy(out, &ret, sizeof(ret));
+
+    /* (0,±2) is not in the group, but the application might want to ignore? */
+    return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP
+                                               : BLST_SUCCESS;
+}
+
+static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out,
+                                          const unsigned char in[96])
+{
+    unsigned char in0 = in[0];
+
+    if ((in0 & 0xe0) == 0)
+        return POINTonE1_Deserialize_BE(out, in);
+
+    if (in0 & 0x80)             /* compressed bit */
+        return POINTonE1_Uncompress_Z(out, in);
+
+    if (in0 & 0x40) {           /* infinity bit */
+        if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) {
+            vec_zero(out, sizeof(*out));
+            return BLST_SUCCESS;
+        }
+    }
+
+    return BLST_BAD_ENCODING;
+}
+
+BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out,
+                               const unsigned char in[96])
+{   return POINTonE1_Deserialize_Z(out, in);   }
+
+#include "ec_ops.h"
+POINT_DADD_IMPL(POINTonE1, 384, fp)
+POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p)
+POINT_ADD_IMPL(POINTonE1, 384, fp)
+POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p)
+POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp)
+POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp)
+
+void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b)
+{   POINTonE1_add(out, a, b);   }
+
+void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a,
+                                           const POINTonE1 *b)
+{   POINTonE1_dadd(out, a, b, NULL);   }
+
+void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a,
+                                        const POINTonE1_affine *b)
+{   POINTonE1_add_affine(out, a, b);   }
+
+void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a,
+                                                  const POINTonE1_affine *b)
+{   POINTonE1_dadd_affine(out, a, b);   }
+
+void blst_p1_double(POINTonE1 *out, const POINTonE1 *a)
+{   POINTonE1_double(out, a);   }
+
+int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b)
+{   return (int)POINTonE1_is_equal(a, b);   }
+
+#include "ec_mult.h"
+POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4)
+POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5)
+
+#ifdef __BLST_PRIVATE_TESTMODE__
+POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1)
+
+DECLARE_PRIVATE_POINTXZ(POINTonE1, 384)
+POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp)
+POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1)
+POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1)
+POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1)
+#endif
+
+static const vec384 beta = {            /* such that beta^3 - 1 = 0  */
+    /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */
+    /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4
+          897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */
+    TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2),
+    TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e),
+    TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741)
+};
+
+static void sigma(POINTonE1 *out, const POINTonE1 *in)
+{
+    vec_copy(out->X, in->X, 2*sizeof(out->X));
+    mul_fp(out->Z, in->Z, beta);
+}
+
+/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */
+static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in,
+                               const pow256 SK)
+{
+    union { vec256 l; pow256 s; } val;
+
+    /* SK/z^2 [in constant time] */
+
+    limbs_from_le_bytes(val.l, SK, 32);
+    div_by_zz(val.l);
+    le_bytes_from_limbs(val.s, val.l, 32);
+
+    {
+        const byte *scalars[2] = { val.s+16, val.s };
+        POINTonE1 table[2][1<<(5-1)];   /* 4.5KB */
+        size_t i;
+
+        POINTonE1_precompute_w5(table[0], in);
+        for (i = 0; i < 1<<(5-1); i++) {
+            mul_fp(table[1][i].X, table[0][i].X, beta);
+            cneg_fp(table[1][i].Y, table[0][i].Y, 1);
+            vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z));
+        }
+
+        POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table);
+        POINTonE1_cneg(out, 1);
+        mul_fp(out->Z, out->Z, beta);
+        mul_fp(out->Z, out->Z, beta);
+    }
+
+    vec_zero(val.l, sizeof(val));   /* scrub the copy of SK */
+}
+
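In affine terms, sigma() above is the endomorphism $\sigma(x, y) = (\beta x, y)$: scaling the Jacobian Z by $\beta$ rescales the affine $x$ by $\beta^{-2} = \beta$ (because $\beta^3 = 1$) and leaves $y$ untouched. On the order-$r$ subgroup it acts as multiplication by a scalar $\lambda$ satisfying

    \lambda^2 + \lambda + 1 \equiv 0 \pmod{r},

so a full-width secret scalar can be written as $k \equiv k_0 + k_1\lambda \pmod{r}$ with $k_0, k_1$ roughly half-size, which is what the base-z^2 digit split performed by div_by_zz() and the two 128-bit scalar halves above exploit. This gloss is mine, hedged, not upstream documentation.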
+static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK)
+{
+    vec384 Z, ZZ;
+    limb_t inf;
+
+    POINTonE1_mult_glv(out, in, SK);
+
+    /* convert to affine to remove possible bias in out->Z */
+    inf = vec_is_zero(out->Z, sizeof(out->Z));
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    flt_reciprocal_fp(Z, out->Z);                       /* 1/Z   */
+#else
+    reciprocal_fp(Z, out->Z);                           /* 1/Z   */
+#endif
+
+    sqr_fp(ZZ, Z);
+    mul_fp(out->X, out->X, ZZ);                         /* X = X/Z^2 */
+
+    mul_fp(ZZ, ZZ, Z);
+    mul_fp(out->Y, out->Y, ZZ);                         /* Y = Y/Z^3 */
+
+    vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z),
+                       inf);                            /* Z = inf ? 0 : 1 */
+}
+
+void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK)
+{   POINTonE1_sign(out, &BLS12_381_G1, SK);   }
+
+void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK)
+{   POINTonE1_sign(out, msg, SK);   }
+
+void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK,
+                          const pow256 SK)
+{
+    POINTonE1 P[1];
+
+    POINTonE1_sign(P, &BLS12_381_G1, SK);
+    if (PK != NULL)
+        vec_copy(PK, P, sizeof(*PK));
+    if (out != NULL) {
+        limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P);
+        out[0] |= (sgn0_pty & 2) << 4;      /* pre-decorate */
+        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
+    }
+}
+
+void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig,
+                         const POINTonE1 *hash, const pow256 SK)
+{
+    POINTonE1 P[1];
+
+    POINTonE1_sign(P, hash, SK);
+    if (sig != NULL)
+        vec_copy(sig, P, sizeof(*sig));
+    if (out != NULL) {
+        limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P);
+        out[0] |= (sgn0_pty & 2) << 4;      /* pre-decorate */
+        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
+    }
+}
+
+void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a,
+                                  const byte *scalar, size_t nbits)
+{
+    if (nbits < 176) {
+        if (nbits)
+            POINTonE1_mult_w4(out, a, scalar, nbits);
+        else
+            vec_zero(out, sizeof(*out));
+    } else if (nbits <= 256) {
+        union { vec256 l; pow256 s; } val;
+        size_t i, j, top, mask = (size_t)0 - 1;
+
+        /* this is not about constant-time-ness, but branch optimization */
+        for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) {
+            val.s[i++] = scalar[j] & mask;
+            mask = 0 - ((i - top) >> (8*sizeof(top)-1));
+            j += 1 & mask;
+        }
+
+        if (check_mod_256(val.s, BLS12_381_r))  /* z^4 is the formal limit */
+            POINTonE1_mult_glv(out, a, val.s);
+        else    /* should never be the case, added for formal completeness */
+            POINTonE1_mult_w5(out, a, scalar, nbits);
+
+        vec_zero(val.l, sizeof(val));
+    } else {    /* should never be the case, added for formal completeness */
+        POINTonE1_mult_w5(out, a, scalar, nbits);
+    }
+}
+
+void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a,
+                                            const byte *scalar, size_t nbits)
+{
+    if (nbits)
+        POINTonE1_mult_w4(out, a, scalar, nbits);
+    else
+        vec_zero(out, sizeof(*out));
+}
+
+int blst_p1_affine_is_equal(const POINTonE1_affine *a,
+                            const POINTonE1_affine *b)
+{   return (int)vec_is_equal(a, b, sizeof(*a));   }
+
+int blst_p1_is_inf(const POINTonE1 *p)
+{   return (int)vec_is_zero(p->Z, sizeof(p->Z));   }
+
+const POINTonE1 *blst_p1_generator(void)
+{   return &BLS12_381_G1;   }
+
+int blst_p1_affine_is_inf(const POINTonE1_affine *p)
+{   return (int)vec_is_zero(p, sizeof(*p));   }
+
+const POINTonE1_affine *blst_p1_affine_generator(void)
+{   return (const POINTonE1_affine *)&BLS12_381_G1;   }
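All of the serialization routines above share the same Zcash-style header bits in the first byte: 0x80 marks the compressed form, 0x40 the point at infinity, and 0x20 selects which of the two Y roots is meant in the compressed form. A throwaway sketch of just the flag handling (names are mine, not library API):

    #include <stdio.h>

    static void describe_g1_header(unsigned char b0)
    {
        printf("compressed=%d infinity=%d y_flag=%d\n",
               (b0 & 0x80) != 0, (b0 & 0x40) != 0, (b0 & 0x20) != 0);
    }

    int main(void)
    {
        describe_g1_header(0xc0);   /* compressed infinity, as emitted above */
        describe_g1_header(0xa0);   /* compressed point with the Y flag set  */
        return 0;
    }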
diff --git a/blst/e2.c b/blst/e2.c
new file mode 100644
index 0000000..eafc486
--- /dev/null
+++ b/blst/e2.c
@@ -0,0 +1,632 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "point.h"
+#include "fields.h"
+#include "errors.h"
+
+/*
+ * y^2 = x^3 + B
+ */
+static const vec384x B_E2 = {       /* 4 + 4*i */
+  { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a),
+    TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7),
+    TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) },
+  { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a),
+    TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7),
+    TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }
+};
+
+const POINTonE2 BLS12_381_G2 = {    /* generator point [in Montgomery] */
+{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02
+        b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */
+  { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a),
+    TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9),
+    TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) },
+  /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a
+        b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */
+  { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3),
+    TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367),
+    TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) }
+},
+{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7
+        6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */
+  { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a),
+    TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f),
+    TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) },
+  /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af
+        267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */
+  { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc),
+    TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a),
+    TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) },
+},
+{ { ONE_MONT_P }, { 0 } }
+};
+
+const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */
+{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02
+        b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */
+  { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a),
+    TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9),
+    TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) },
+  /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a
+        b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */
+  { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3),
+    TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367),
+    TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) }
+},
+{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17
+        f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */
+  { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5),
+    TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f),
+    TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) },
+  /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10
+        40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */
+  { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23),
+    TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84),
+    TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) }
+},
+{ { ONE_MONT_P }, { 0 } }
+};
+
+static void mul_by_b_onE2(vec384x out, const vec384x in)
+{
+    sub_fp(out[0], in[0], in[1]);
+    add_fp(out[1], in[0], in[1]);
+    lshift_fp(out[0], out[0], 2);
+    lshift_fp(out[1], out[1], 2);
+}
+
+static void mul_by_4b_onE2(vec384x out, const vec384x in)
+{
+    sub_fp(out[0], in[0], in[1]);
+    add_fp(out[1], in[0], in[1]);
+    lshift_fp(out[0], out[0], 4);
+    lshift_fp(out[1], out[1], 4);
+}
+
+static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit)
+{   cneg_fp2(p->Y, p->Y, cbit);   }
+
+void blst_p2_cneg(POINTonE2 *a, int cbit)
+{   POINTonE2_cneg(a, is_zero(cbit) ^ 1);   }
+
+static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in)
+{
+    vec384x Z, ZZ;
+    limb_t inf = vec_is_zero(in->Z, sizeof(in->Z));
+
+    reciprocal_fp2(Z, in->Z);                           /* 1/Z */
+
+    sqr_fp2(ZZ, Z);
+    mul_fp2(out->X, in->X, ZZ);                         /* X = X/Z^2 */
+
+    mul_fp2(ZZ, ZZ, Z);
+    mul_fp2(out->Y, in->Y, ZZ);                         /* Y = Y/Z^3 */
+
+    vec_select(out->Z, in->Z, BLS12_381_G2.Z,
+                       sizeof(BLS12_381_G2.Z), inf);    /* Z = inf ? 0 : 1 */
+}
+
+void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a)
+{   POINTonE2_from_Jacobian(out, a);   }
+
+static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in)
+{
+    POINTonE2 p;
+
+    if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) {
+        POINTonE2_from_Jacobian(&p, in);
+        in = &p;
+    }
+    vec_copy(out, in, sizeof(*out));
+}
+
+void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a)
+{   POINTonE2_to_affine(out, a);   }
+
+void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a)
+{
+    vec_copy(out, a, sizeof(*a));
+    vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z),
+                       vec_is_zero(a, sizeof(*a)));
+}
+
+static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p)
+{
+    vec384x XXX, YY;
+
+    sqr_fp2(XXX, p->X);
+    mul_fp2(XXX, XXX, p->X);                            /* X^3 */
+    add_fp2(XXX, XXX, B_E2);                            /* X^3 + B */
+
+    sqr_fp2(YY, p->Y);                                  /* Y^2 */
+
+    return vec_is_equal(XXX, YY, sizeof(XXX));
+}
+
+int blst_p2_affine_on_curve(const POINTonE2_affine *p)
+{   return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p)));   }
+
+static bool_t POINTonE2_on_curve(const POINTonE2 *p)
+{
+    vec384x XXX, YY, BZ6;
+    limb_t inf = vec_is_zero(p->Z, sizeof(p->Z));
+
+    sqr_fp2(BZ6, p->Z);
+    mul_fp2(BZ6, BZ6, p->Z);
+    sqr_fp2(XXX, BZ6);                                  /* Z^6 */
+    mul_by_b_onE2(BZ6, XXX);                            /* B*Z^6 */
+
+    sqr_fp2(XXX, p->X);
+    mul_fp2(XXX, XXX, p->X);                            /* X^3 */
+    add_fp2(XXX, XXX, BZ6);                             /* X^3 + B*Z^6 */
+
+    sqr_fp2(YY, p->Y);                                  /* Y^2 */
+
+    return vec_is_equal(XXX, YY, sizeof(XXX)) | inf;
+}
+
+int blst_p2_on_curve(const POINTonE2 *p)
+{   return (int)POINTonE2_on_curve(p);   }
+
+static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192],
+                                            const POINTonE2_affine *in)
+{
+    vec384x temp;
+
+    from_fp(temp[1], in->X[1]);
+    be_bytes_from_limbs(out, temp[1], sizeof(temp[1]));
+    from_fp(temp[0], in->X[0]);
+    be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0]));
+
+    from_fp(temp[1], in->Y[1]);
+    be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1]));
+    from_fp(temp[0], in->Y[0]);
+    be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0]));
+
+    return sgn0_pty_mod_384x(temp, BLS12_381_P);
+}
+
+void blst_p2_affine_serialize(unsigned char out[192],
+                              const POINTonE2_affine *in)
+{
+    if (vec_is_zero(in->X, 2*sizeof(in->X))) {
+        bytes_zero(out, 192);
+        out[0] = 0x40;    /* infinity bit */
+    } else {
+        (void)POINTonE2_affine_Serialize_BE(out, in);
+    }
+}
+
+static limb_t POINTonE2_Serialize_BE(unsigned char out[192],
+                                     const POINTonE2 *in)
+{
+    POINTonE2 p;
+
+    if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) {
+        POINTonE2_from_Jacobian(&p, in);
+        in = &p;
+    }
+
+    return POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in);
+}
+
+static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in)
+{
+    if (vec_is_zero(in->Z, sizeof(in->Z))) {
+        bytes_zero(out, 192);
+        out[0] = 0x40;    /* infinity bit */
+    } else {
+        (void)POINTonE2_Serialize_BE(out, in);
+    }
+}
+
+void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in)
+{   POINTonE2_Serialize(out, in);   }
+
+static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96],
+                                           const POINTonE2_affine *in)
+{
+    vec384 temp;
+
+    from_fp(temp, in->X[1]);
+    be_bytes_from_limbs(out, temp, sizeof(temp));
+    from_fp(temp, in->X[0]);
+    be_bytes_from_limbs(out + 48, temp, sizeof(temp));
+
+    return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0);
+}
+
+void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in)
+{
+    if (vec_is_zero(in->X, 2*sizeof(in->X))) {
+        bytes_zero(out, 96);
+        out[0] = 0xc0;    /* compressed and infinity bits */
+    } else {
+        limb_t sign = POINTonE2_affine_Compress_BE(out, in);
+        out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4));
+    }
+}
+
+static limb_t POINTonE2_Compress_BE(unsigned char out[96],
+                                    const POINTonE2 *in)
+{
+    POINTonE2 p;
+
+    if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) {
+        POINTonE2_from_Jacobian(&p, in);
+        in = &p;
+    }
+
+    return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in);
+}
+
+void blst_p2_compress(unsigned char out[96], const POINTonE2 *in)
+{
+    if (vec_is_zero(in->Z, sizeof(in->Z))) {
+        bytes_zero(out, 96);
+        out[0] = 0xc0;    /* compressed and infinity bits */
+    } else {
+        limb_t sign = POINTonE2_Compress_BE(out, in);
+        out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4));
+    }
+}
+
+static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out,
+                                      const unsigned char in[96])
+{
+    POINTonE2_affine ret;
+    vec384 temp;
+
+    limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1]));
+    limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0]));
+
+    /* clear top 3 bits in case caller was conveying some information there */
+    ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3;
+    add_fp(temp, ret.X[1], ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.X[1], sizeof(temp)))
+        return (limb_t)0 - BLST_BAD_ENCODING;
+
+    add_fp(temp, ret.X[0], ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.X[0], sizeof(temp)))
+        return (limb_t)0 - BLST_BAD_ENCODING;
+
+    mul_fp(ret.X[0], ret.X[0], BLS12_381_RR);
+    mul_fp(ret.X[1], ret.X[1], BLS12_381_RR);
+
+    sqr_fp2(ret.Y, ret.X);
+    mul_fp2(ret.Y, ret.Y, ret.X);
+    add_fp2(ret.Y, ret.Y, B_E2);                        /* X^3 + B */
+    if (!sqrt_fp2(ret.Y, ret.Y))
+        return (limb_t)0 - BLST_POINT_NOT_ON_CURVE;
+
+    vec_copy(out, &ret, sizeof(ret));
+
+    return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0);
+}
+
+static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out,
+                                         const unsigned char in[96])
+{
+    unsigned char in0 = in[0];
+    limb_t sgn0_pty;
+
+    if ((in0 & 0x80) == 0)      /* compressed bit */
+        return BLST_BAD_ENCODING;
+
+    if (in0 & 0x40) {           /* infinity bit */
+        if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) {
+            vec_zero(out, sizeof(*out));
+            return BLST_SUCCESS;
+        } else {
+            return BLST_BAD_ENCODING;
+        }
+    }
+
+    sgn0_pty = POINTonE2_Uncompress_BE(out, in);
+
+    if (sgn0_pty > 3)
+        return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */
+
+    sgn0_pty >>= 1; /* skip over parity bit */
+    sgn0_pty ^= (in0 & 0x20) >> 5;
+    cneg_fp2(out->Y, out->Y, sgn0_pty);
+
+    return BLST_SUCCESS;
+}
+
+BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96])
+{   return POINTonE2_Uncompress_Z(out, in);   }
+
+static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out,
+                                           const unsigned char in[192])
+{
+    POINTonE2_affine ret;
+    vec384 temp;
+
+    limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1]));
+    limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0]));
+    limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1]));
+    limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0]));
+
+    /* clear top 3 bits in case caller was conveying some information there */
+    ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3;
+    add_fp(temp, ret.X[1], ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.X[1], sizeof(temp)))
+        return BLST_BAD_ENCODING;
+
+    add_fp(temp, ret.X[0], ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.X[0], sizeof(temp)))
+        return BLST_BAD_ENCODING;
+
+    add_fp(temp, ret.Y[1], ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.Y[1], sizeof(temp)))
+        return BLST_BAD_ENCODING;
+
+    add_fp(temp, ret.Y[0], ZERO_384);  /* less than modulus? */
+    if (!vec_is_equal(temp, ret.Y[0], sizeof(temp)))
+        return BLST_BAD_ENCODING;
+
+    mul_fp(ret.X[0], ret.X[0], BLS12_381_RR);
+    mul_fp(ret.X[1], ret.X[1], BLS12_381_RR);
+    mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR);
+    mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR);
+
+    if (!POINTonE2_affine_on_curve(&ret))
+        return BLST_POINT_NOT_ON_CURVE;
+
+    vec_copy(out, &ret, sizeof(ret));
+
+    return BLST_SUCCESS;
+}
+
+static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out,
+                                          const unsigned char in[192])
+{
+    unsigned char in0 = in[0];
+
+    if ((in0 & 0xe0) == 0)
+        return POINTonE2_Deserialize_BE(out, in);
+
+    if (in0 & 0x80)             /* compressed bit */
+        return POINTonE2_Uncompress_Z(out, in);
+
+    if (in0 & 0x40) {           /* infinity bit */
+        if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) {
+            vec_zero(out, sizeof(*out));
+            return BLST_SUCCESS;
+        }
+    }
+
+    return BLST_BAD_ENCODING;
+}
+
+BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out,
+                               const unsigned char in[192])
+{   return POINTonE2_Deserialize_Z(out, in);   }
+
+#include "ec_ops.h"
+POINT_DADD_IMPL(POINTonE2, 384x, fp2)
+POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
+POINT_ADD_IMPL(POINTonE2, 384x, fp2)
+POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
+POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2)
+POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2)
+
+void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b)
+{   POINTonE2_add(out, a, b);   }
+
+void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a,
+                                           const POINTonE2 *b)
+{   POINTonE2_dadd(out, a, b, NULL);   }
+
+void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a,
+                                        const POINTonE2_affine *b)
+{   POINTonE2_add_affine(out, a, b);   }
+
+void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a,
+                                                  const POINTonE2_affine *b)
+{   POINTonE2_dadd_affine(out, a, b);   }
+
+void blst_p2_double(POINTonE2 *out, const POINTonE2 *a)
+{   POINTonE2_double(out, a);   }
+
+int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b)
+{   return (int)POINTonE2_is_equal(a, b);   }
+
+#include "ec_mult.h"
+POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4)
+POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5)
+
+#ifdef __BLST_PRIVATE_TESTMODE__
+POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2)
+
+DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x)
+POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2)
+POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2)
+POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2)
+POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2)
+#endif
+
+static void psi(POINTonE2 *out, const POINTonE2 *in)
+{
+    static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */
+      { 0 },
+      { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4
+              897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */
+        TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5),
+        TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024),
+        TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }
+    };
+    static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */
+      { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60
+              ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */
+        TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183),
+        TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18),
+        TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) },
+      { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e
+              77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */
+        TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
+        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
+        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) },
+    };
+
+    vec_copy(out, in, sizeof(*out));
+    cneg_fp(out->X[1], out->X[1], 1);   mul_fp2(out->X, out->X, frobenius_x);
+    cneg_fp(out->Y[1], out->Y[1], 1);   mul_fp2(out->Y, out->Y, frobenius_y);
+    cneg_fp(out->Z[1], out->Z[1], 1);
+}
+
+/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mult_w5 */
+static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in,
+                               const pow256 SK)
+{
+    union { vec256 l; pow256 s; } val;
+
+    /* break down SK to "digits" with |z| as radix [in constant time] */
+
+    limbs_from_le_bytes(val.l, SK, 32);
+    div_by_zz(val.l);
+    div_by_z(val.l);
+    div_by_z(val.l + NLIMBS(256)/2);
+    le_bytes_from_limbs(val.s, val.l, 32);
+
+    {
+        const byte *scalars[2] = { val.s, NULL };
+        POINTonE2 table[4][1<<(5-1)];   /* 18KB */
+        size_t i;
+
+        POINTonE2_precompute_w5(table[0], in);
+        for (i = 0; i < 1<<(5-1); i++) {
+            psi(&table[1][i], &table[0][i]);
+            psi(&table[2][i], &table[1][i]);
+            psi(&table[3][i], &table[2][i]);
+            POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */
+            POINTonE2_cneg(&table[3][i], 1);
+        }
+
+        POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table);
+    }
+
+    vec_zero(val.l, sizeof(val));   /* scrub the copy of SK */
+}
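+
+/*
+ * A sketch of what the decomposition above achieves, with |z| denoting the
+ * absolute value of the (negative) BLS12-381 parameter, just below 2^64:
+ * SK is rewritten in radix |z| as
+ *
+ *     SK = d0 + d1*|z| + d2*|z|^2 + d3*|z|^3,
+ *
+ * the di being the remainders/quotients of the divisions by z^2 and |z|,
+ * each fitting in 64 bits. Since the psi endomorphism acts on the G2
+ * subgroup as multiplication by z, table[k] ends up holding the w5 window
+ * for |z|^k * P (the cneg calls compensate for z being negative), and the
+ * single 4-point pass over the four 64-bit scalars in val.s yields SK*P.
+ */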
+
+static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK)
+{
+    vec384x Z, ZZ;
+    limb_t inf;
+
+    POINTonE2_mult_gls(out, in, SK);
+
+    /* convert to affine to remove possible bias in out->Z */
+    inf = vec_is_zero(out->Z, sizeof(out->Z));
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    flt_reciprocal_fp2(Z, out->Z);                      /* 1/Z   */
+#else
+    reciprocal_fp2(Z, out->Z);                          /* 1/Z   */
+#endif
+
+    sqr_fp2(ZZ, Z);
+    mul_fp2(out->X, out->X, ZZ);                        /* X = X/Z^2 */
+
+    mul_fp2(ZZ, ZZ, Z);
+    mul_fp2(out->Y, out->Y, ZZ);                        /* Y = Y/Z^3 */
+
+    vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z),
+                       inf);                            /* Z = inf ? 0 : 1 */
+}
+
+void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK)
+{   POINTonE2_sign(out, &BLS12_381_G2, SK);   }
+
+void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK)
+{   POINTonE2_sign(out, msg, SK);   }
+
+void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK,
+                          const pow256 SK)
+{
+    POINTonE2 P[1];
+
+    POINTonE2_sign(P, &BLS12_381_G2, SK);
+    if (PK != NULL)
+        vec_copy(PK, P, sizeof(*PK));
+    if (out != NULL) {
+        limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P);
+        out[0] |= (sgn0_pty & 2) << 4;      /* pre-decorate */
+        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
+    }
+}
+
+void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig,
+                         const POINTonE2 *hash, const pow256 SK)
+{
+    POINTonE2 P[1];
+
+    POINTonE2_sign(P, hash, SK);
+    if (sig != NULL)
+        vec_copy(sig, P, sizeof(*sig));
+    if (out != NULL) {
+        limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P);
+        out[0] |= (sgn0_pty & 2) << 4;      /* pre-decorate */
+        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
+    }
+}
+
+void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a,
+                                  const byte *scalar, size_t nbits)
+{
+    if (nbits < 144) {
+        if (nbits)
+            POINTonE2_mult_w4(out, a, scalar, nbits);
+        else
+            vec_zero(out, sizeof(*out));
+    } else if (nbits <= 256) {
+        union { vec256 l; pow256 s; } val;
+        size_t i, j, top, mask = (size_t)0 - 1;
+
+        /* this is not about constant-time-ness, but branch optimization */
+        for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) {
+            val.s[i++] = scalar[j] & mask;
+            mask = 0 - ((i - top) >> (8*sizeof(top)-1));
+            j += 1 & mask;
+        }
+
+        if (check_mod_256(val.s, BLS12_381_r))  /* z^4 is the formal limit */
+            POINTonE2_mult_gls(out, a, val.s);
+        else    /* should never be the case, added for formal completeness */
+            POINTonE2_mult_w5(out, a, scalar, nbits);
+
+        vec_zero(val.l, sizeof(val));
+    } else {    /* should never be the case, added for formal completeness */
+        POINTonE2_mult_w5(out, a, scalar, nbits);
+    }
+}
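+
+/*
+ * Two remarks on the mid-range path above. The branch-free loop copies
+ * the (nbits+7)/8 meaningful scalar bytes into val.s and zero-pads the
+ * rest: once |i| reaches |top| the mask drops to zero, so subsequent
+ * bytes are cleared and |j| stops advancing. And check_mod_256 ensures
+ * the scalar is non-zero and below the group order, which is itself
+ * below z^4, the largest value the radix-|z| GLS decomposition can
+ * accommodate; anything else falls through to the generic w5 path.
+ */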
+
+void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a,
+                                            const byte *scalar, size_t nbits)
+{
+    if (nbits)
+        POINTonE2_mult_w4(out, a, scalar, nbits);
+    else
+        vec_zero(out, sizeof(*out));
+}
+
+int blst_p2_affine_is_equal(const POINTonE2_affine *a,
+                            const POINTonE2_affine *b)
+{   return (int)vec_is_equal(a, b, sizeof(*a));   }
+
+int blst_p2_is_inf(const POINTonE2 *p)
+{   return (int)vec_is_zero(p->Z, sizeof(p->Z));   }
+
+const POINTonE2 *blst_p2_generator(void)
+{   return &BLS12_381_G2;   }
+
+int blst_p2_affine_is_inf(const POINTonE2_affine *p)
+{   return (int)vec_is_zero(p, sizeof(*p));   }
+
+const POINTonE2_affine *blst_p2_affine_generator(void)
+{   return (const POINTonE2_affine *)&BLS12_381_G2;   }
diff --git a/blst/ec_mult.h b/blst/ec_mult.h
new file mode 100644
index 0000000..192f733
--- /dev/null
+++ b/blst/ec_mult.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_EC_MULT_H__
+#define __BLS12_381_ASM_EC_MULT_H__
+
+#include "point.h"
+
+/* Works up to 9 bits */
+static limb_t get_wval(const byte *d, size_t off, size_t bits)
+{
+    size_t top = off + bits - 1;
+    limb_t ret;
+
+    ret = ((limb_t)d[top / 8] << 8) | d[off / 8];
+
+    return ret >> (off%8);
+}
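+
+/*
+ * The 9-bit limit follows from fetching only two bytes: the value starts
+ * at bit off%8 of d[off/8], so off%8 + bits may not exceed 16, i.e.
+ * bits <= 9 in the worst case. get_wval_limb below extends the same idea
+ * to four bytes and hence 25 bits.
+ */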
+
+/* Works up to 25 bits. */
+static limb_t get_wval_limb(const byte *d, size_t off, size_t bits)
+{
+    size_t i, top = (off + bits - 1)/8;
+    limb_t ret, mask = (limb_t)0 - 1;
+
+    d   += off/8;
+    top -= off/8-1;
+
+    /* this is not about constant-time-ness, but branch optimization */
+    for (ret=0, i=0; i<4;) {
+        ret |= (*d & mask) << (8*i);
+        mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1));
+        d += 1 & mask;
+    }
+
+    return ret >> (off%8);
+}
+
+/*
+ * Window value encoding that utilizes the fact that -P is trivially
+ * calculated, which allows halving the size of the pre-computed table,
+ * is attributed to A. D. Booth, hence the name of the subroutines...
+ */
+static limb_t booth_encode(limb_t wval, size_t sz)
+{
+    limb_t mask = 0 - (wval >> sz);     /* "sign" bit -> mask */
+
+    wval = (wval + 1) >> 1;
+    wval = (wval & ~mask) | ((0-wval) & mask);
+
+    /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */
+    return wval;
+}
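+
+/*
+ * Worked example for SZ=5, derived from the code above: a window value
+ * carries SZ+1 bits, the lowest being the top bit of the previous window
+ * and the highest acting as the "sign"/borrow bit.
+ *
+ *   booth_encode(0b001101, 5) -> 7:     digit +7; the low 5 bits select
+ *                                       table[7-1] = 7*P;
+ *   booth_encode(0b110110, 5) -> 0-27:  digit -5; the low 5 bits (5)
+ *                                       select 5*P and bit 5 makes the
+ *                                       gather routines below negate it.
+ *
+ * The 2^5 borrow implied by a negative digit is absorbed by the next
+ * window, which sees this window's top bit as its own carry-in.
+ */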
+
+/*
+ * Key feature of these constant-time subroutines is that they tolerate
+ * zeros in most significant bit positions of the scalar[s], or in other
+ * words, zero-padded scalar values. This means that one can and should
+ * pass order's bit-length, which is customarily publicly known, instead
+ * of the factual scalars' bit-lengths. This is facilitated by point
+ * addition subroutines implemented to handle points at infinity, which
+ * are encoded as Z==0. [Doubling algorithms handle such points at
+ * infinity "naturally," since resulting Z is product of original Z.]
+ */
+#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \
+static void ptype##_gather_booth_w##SZ(ptype *restrict p, \
+                                       const ptype table[1<<(SZ-1)], \
+                                       limb_t booth_idx) \
+{ \
+    size_t i; \
+    bool_t booth_sign = (booth_idx >> SZ) & 1; \
+\
+    booth_idx &= (1<<SZ) - 1; \
+    vec_zero(p, sizeof(ptype)); /* implicit infinity at table[-1] */\
+    /* ~6% with -Os, ~2% with -O3 ... */\
+    for (i = 1; i <= 1<<(SZ-1); i++) \
+        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
+\
+    ptype##_cneg(p, booth_sign); \
+} \
+\
+static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
+{ \
+    size_t i, j; \
+                                      /* row[-1] is implicit infinity */\
+    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
+    ptype##_double(&row[1],  point);                /* row[1]=p*(1+1) */\
+    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
+        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
+        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
+}                                                   /* row[4] ...     */\
+\
+static void ptype##s_mult_w##SZ(ptype *ret, \
+                                const ptype *points[], size_t npoints, \
+                                const byte *scalars[], size_t bits, \
+                                ptype table[][1<<(SZ-1)]) \
+{ \
+    limb_t wmask, wval; \
+    size_t i, j, window, nbytes; \
+    const byte *scalar, **scalar_s = scalars; \
+    ptype temp[1]; \
+\
+    if (table == NULL) \
+        table = (ptype (*)[1<<(SZ-1)])alloca((1<<(SZ-1)) * sizeof(ptype) * \
+                                             npoints); \
+\
+    if (points != NULL) { \
+        const ptype *point = NULL; \
+        for (i = 0; i < npoints; i++) \
+            point = *points ? *points++ : point+1, \
+            ptype##_precompute_w##SZ(table[i], point); \
+    } \
+\
+    nbytes = (bits + 7)/8; /* convert |bits| to bytes */ \
+    scalar = *scalar_s++; \
+\
+    /* top excess bits modulo target window size */ \
+    window = bits % SZ; /* yes, it may be zero */ \
+    wmask = ((limb_t)1 << (window + 1)) - 1; \
+\
+    bits -= window; \
+    if (bits > 0) \
+        wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
+    else \
+        wval = (scalar[0] << 1) & wmask; \
+\
+    wval = booth_encode(wval, SZ); \
+    ptype##_gather_booth_w##SZ(ret, table[0], wval); \
+\
+    i = 1; \
+    while (bits > 0) { \
+        for (; i < npoints; i++) { \
+            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
+            wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
+            wval = booth_encode(wval, SZ); \
+            ptype##_gather_booth_w##SZ(temp, table[i], wval); \
+            ptype##_dadd(ret, ret, temp, NULL); \
+        } \
+\
+        for (j = 0; j < SZ; j++) \
+            ptype##_double(ret, ret); \
+\
+        window = SZ; \
+        wmask = ((limb_t)1 << (window + 1)) - 1; \
+        bits -= window; \
+        i = 0; scalar_s = scalars; \
+    } \
+\
+    for (; i < npoints; i++) { \
+        scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
+        wval = (scalar[0] << 1) & wmask; \
+        wval = booth_encode(wval, SZ); \
+        ptype##_gather_booth_w##SZ(temp, table[i], wval); \
+        ptype##_dadd(ret, ret, temp, NULL); \
+    } \
+} \
+\
+static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \
+                               const byte *scalar, size_t bits) \
+{ \
+    limb_t wmask, wval; \
+    size_t j, window; \
+    ptype temp[1]; \
+    ptype table[1<<(SZ-1)]; \
+\
+    ptype##_precompute_w##SZ(table, point); \
+\
+    /* top excess bits modulo target window size */ \
+    window = bits % SZ;  /* yes, it may be zero */ \
+    wmask = ((limb_t)1 << (window + 1)) - 1; \
+\
+    bits -= window; \
+    wval = bits ? get_wval(scalar, bits - 1, window + 1) \
+                : (limb_t)scalar[0] << 1; \
+    wval &= wmask; \
+    wval = booth_encode(wval, SZ); \
+    ptype##_gather_booth_w##SZ(ret, table, wval); \
+\
+    while (bits > 0) { \
+        for (j = 0; j < SZ; j++) \
+            ptype##_double(ret, ret); \
+\
+        window = SZ; \
+        wmask = ((limb_t)1 << (window + 1)) - 1; \
+        bits -= window; \
+\
+        wval = bits ? get_wval(scalar, bits - 1, window + 1) \
+                    : (limb_t)scalar[0] << 1; \
+        wval &= wmask; \
+        wval = booth_encode(wval, SZ); \
+        ptype##_gather_booth_w##SZ(temp, table, wval); \
+        if (bits > 0) ptype##_add(ret, ret, temp); \
+        else          ptype##_dadd(ret, ret, temp, NULL); \
+    } \
+}
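+
+/*
+ * For reference, POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) expands to
+ * POINTonE2_gather_booth_w5, POINTonE2_precompute_w5, POINTonE2s_mult_w5
+ * and POINTonE2_mult_w5 -- the w5 routines used by the GLS multiplication
+ * and blst_p2_mult earlier in this commit; the w4 instantiation provides
+ * the short-scalar path.
+ */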
+
+#if 0
+/* ~50%, or ~2x[!] slower than w5... */
+#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
+static void ptype##_mult_ladder(ptype *ret, const ptype *p, \
+                                const byte *scalar, size_t bits) \
+{ \
+    ptype sum[1]; \
+    bool_t bit, pbit = 0; \
+\
+    vec_copy(sum, p, sizeof(ptype)); \
+    vec_zero(ret, sizeof(ptype));   /* infinity */ \
+\
+    while (bits--) { \
+        bit = is_bit_set(scalar, bits); \
+        bit ^= pbit; \
+        ptype##_cswap(ret, sum, bit); \
+        ptype##_add(sum, sum, ret); \
+        ptype##_double(ret, ret); \
+        pbit ^= bit; \
+    } \
+    ptype##_cswap(ret, sum, pbit); \
+}
+#else
+/* >40% better performance than above, [and ~30% slower than w5]... */
+#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
+static void ptype##_mult_ladder(ptype *out, const ptype *p, \
+                                const byte *scalar, size_t bits) \
+{ \
+    ptype##xz sum[1]; \
+    ptype##xz pxz[1]; \
+    ptype##xz ret[1]; \
+    bool_t bit, pbit = 0; \
+\
+    ptype##xz_ladder_pre(pxz, p); \
+    vec_copy(sum, pxz, sizeof(ptype##xz)); \
+    vec_zero(ret, sizeof(ptype##xz));   /* infinity */ \
+\
+    while (bits--) { \
+        bit = is_bit_set(scalar, bits); \
+        bit ^= pbit; \
+        ptype##xz_cswap(ret, sum, bit); \
+        ptype##xz_ladder_step(ret, sum, pxz); \
+        pbit ^= bit; \
+    } \
+    ptype##xz_cswap(ret, sum, pbit); \
+    ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \
+}
+#endif
+
+/*
+ * The sole reason for this implementation's existence is that addition
+ * with an affine point renders a share of the multiplications redundant
+ * by virtue of Z==1. And since the pre-defined generator point can be,
+ * and customarily is, instantiated affine, it would hardly be appropriate
+ * to pass on this opportunity. While it's faster than the generic ladder
+ * implementation by ~25%, it's not faster than the XZ one above, being
+ * <15% slower. Still, it's faster than the generic ladder even if one
+ * accounts for the prior conversion to affine coordinates, so the choice
+ * [for the resource-constrained case] is actually between this plus said
+ * conversion and the XZ ladder...
+ *
+ * To summarize, if ptype##_mult_w5 executes in one unit of time, then
+ * - naive ptype##_mult_ladder would execute in ~2;
+ * - XZ version above - in ~1.4;
+ * - ptype##_affine_mult_ladder below - in ~1.65;
+ * - [small-footprint ptype##_to_affine would run in ~0.18].
+ *
+ * Caveat lector: |p_affine|*(order+2) produces a wrong result, because
+ * the addition doesn't handle doubling. Indeed, P*(order+1) is P, and it
+ * fails to add with itself, producing infinity in the last addition. But
+ * as long as |scalar| is reduced modulo order, as it should be, it's
+ * not a problem...
+ */
+#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \
+static void ptype##_affine_mult_ladder(ptype *ret, \
+                                       const ptype##_affine *p_affine, \
+                                       const byte *scalar, size_t bits) \
+{ \
+    ptype sum[1]; \
+    bool_t bit; \
+\
+    vec_zero(ret, sizeof(ptype));   /* infinity */ \
+\
+    while (bits--) { \
+        ptype##_double(ret, ret); \
+        ptype##_add_affine(sum, ret, p_affine); \
+        bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \
+        ptype##_ccopy(ret, sum, bit); \
+    } \
+}
+#endif
diff --git a/blst/ec_ops.h b/blst/ec_ops.h
new file mode 100644
index 0000000..0d531f8
--- /dev/null
+++ b/blst/ec_ops.h
@@ -0,0 +1,787 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_384_ASM_EC_OPS_H__
+#define __BLS12_384_ASM_EC_OPS_H__
+/*
+ * Addition that can handle doubling [as well as points at infinity,
+ * which are encoded as Z==0] in constant time. It naturally comes at a
+ * cost, but this subroutine should be called only when independent
+ * points are processed, which is considered a reasonable compromise.
+ * For example, ptype##s_mult_w5 calls it, but since the *major* gain
+ * comes from the pure doublings being effectively divided by the number
+ * of points, a slightly slower addition can be tolerated. But what is the
+ * additional cost more specifically? Best addition result is 11M+5S,
+ * while this routine takes 13M+5S (+1M+1S if a4!=0), as per
+ *
+ * -------------+-------------
+ * addition     | doubling
+ * -------------+-------------
+ * U1 = X1*Z2^2 | U1 = X1
+ * U2 = X2*Z1^2 |
+ * S1 = Y1*Z2^3 | S1 = Y1
+ * S2 = Y2*Z1^3 |
+ * zz = Z1*Z2   | zz = Z1
+ * H = U2-U1    | H' = 2*Y1
+ * R = S2-S1    | R' = 3*X1^2[+a*Z1^4]
+ * sx = U1+U2   | sx = X1+X1
+ * -------------+-------------
+ * H!=0 || R!=0 | H==0 && R==0
+ *
+ *      X3 = R^2-H^2*sx
+ *      Y3 = R*(H^2*U1-X3)-H^3*S1
+ *      Z3 = H*zz
+ *
+ * As for the R!=0 condition in the context of H==0, a.k.a. P-P: the result
+ * is infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0.
+ */
+#define POINT_DADD_IMPL(ptype, bits, field) \
+static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \
+                         const vec##bits a4) \
+{ \
+    ptype p3; /* starts as (U1, S1, zz) from addition side */\
+    struct { vec##bits H, R, sx; } add, dbl; \
+    bool_t p1inf, p2inf, is_dbl; \
+\
+    add_##field(dbl.sx, p1->X, p1->X);  /* sx = X1+X1 */\
+    sqr_##field(dbl.R, p1->X);          /* X1^2 */\
+    mul_by_3_##field(dbl.R, dbl.R);     /* R = 3*X1^2 */\
+    add_##field(dbl.H, p1->Y, p1->Y);   /* H = 2*Y1 */\
+\
+    p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \
+    sqr_##field(p3.X, p2->Z);           /* Z2^2 */\
+    mul_##field(p3.Z, p1->Z, p2->Z);    /* Z1*Z2 */\
+    p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \
+    sqr_##field(add.H, p1->Z);          /* Z1^2 */\
+\
+    if (a4 != NULL) { \
+        sqr_##field(p3.Y, add.H);       /* Z1^4, [borrow p3.Y] */\
+        mul_##field(p3.Y, p3.Y, a4);    \
+        add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\
+    } \
+\
+    mul_##field(p3.Y, p1->Y, p2->Z);    \
+    mul_##field(p3.Y, p3.Y, p3.X);      /* S1 = Y1*Z2^3 */\
+    mul_##field(add.R, p2->Y, p1->Z);   \
+    mul_##field(add.R, add.R, add.H);   /* S2 = Y2*Z1^3 */\
+    sub_##field(add.R, add.R, p3.Y);    /* R = S2-S1 */\
+\
+    mul_##field(p3.X, p3.X, p1->X);     /* U1 = X1*Z2^2 */\
+    mul_##field(add.H, add.H, p2->X);   /* U2 = X2*Z1^2 */\
+\
+    add_##field(add.sx, add.H, p3.X);   /* sx = U1+U2 */\
+    sub_##field(add.H, add.H, p3.X);    /* H = U2-U1 */\
+\
+    /* make the choice between addition and doubling */\
+    is_dbl = vec_is_zero(add.H, 2*sizeof(add.H));      \
+    vec_select(&p3, p1, &p3, sizeof(p3), is_dbl);      \
+    vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \
+    /* |p3| and |add| hold all inputs now, |p3| will hold output */\
+\
+    mul_##field(p3.Z, p3.Z, add.H);     /* Z3 = H*Z1*Z2 */\
+\
+    sqr_##field(dbl.H, add.H);          /* H^2 */\
+    mul_##field(dbl.R, dbl.H, add.H);   /* H^3 */\
+    mul_##field(dbl.R, dbl.R, p3.Y);    /* H^3*S1 */\
+    mul_##field(p3.Y, dbl.H, p3.X);     /* H^2*U1 */\
+\
+    mul_##field(dbl.H, dbl.H, add.sx);  /* H^2*sx */\
+    sqr_##field(p3.X, add.R);           /* R^2 */\
+    sub_##field(p3.X, p3.X, dbl.H);     /* X3 = R^2-H^2*sx */\
+\
+    sub_##field(p3.Y, p3.Y, p3.X);      /* H^2*U1-X3 */\
+    mul_##field(p3.Y, p3.Y, add.R);     /* R*(H^2*U1-X3) */\
+    sub_##field(p3.Y, p3.Y, dbl.R);     /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\
+\
+    vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \
+    vec_select(out, p2, &p3, sizeof(ptype), p1inf); \
+}
+
+/*
+ * Addition with affine point that can handle doubling [as well as
+ * points at infinity, with |p1| being encoded as Z==0 and |p2| as
+ * X,Y==0] in constant time. But at what additional cost? Best
+ * addition result is 7M+4S, while this routine takes 8M+5S, as per
+ *
+ * -------------+-------------
+ * addition     | doubling
+ * -------------+-------------
+ * U1 = X1      | U1 = X2
+ * U2 = X2*Z1^2 |
+ * S1 = Y1      | S1 = Y2
+ * S2 = Y2*Z1^3 |
+ * H = U2-X1    | H' = 2*Y2
+ * R = S2-Y1    | R' = 3*X2^2[+a]
+ * sx = X1+U2   | sx = X2+X2
+ * zz = H*Z1    | zz = H'
+ * -------------+-------------
+ * H!=0 || R!=0 | H==0 && R==0
+ *
+ *      X3 = R^2-H^2*sx
+ *      Y3 = R*(H^2*U1-X3)-H^3*S1
+ *      Z3 = zz
+ *
+ * As for the R!=0 condition in the context of H==0, a.k.a. P-P: the result
+ * is infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0.
+ */
+#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \
+static void ptype##_dadd_affine(ptype *out, const ptype *p1, \
+                                            const ptype##_affine *p2) \
+{ \
+    ptype p3; /* starts as (,, H*Z1) from addition side */\
+    struct { vec##bits H, R, sx; } add, dbl; \
+    bool_t p1inf, p2inf, is_dbl; \
+\
+    p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \
+    add_##field(dbl.sx, p2->X, p2->X);  /* sx = X2+X2 */\
+    sqr_##field(dbl.R, p2->X);          /* X2^2 */\
+    mul_by_3_##field(dbl.R, dbl.R);     /* R = 3*X2^2 */\
+    add_##field(dbl.H, p2->Y, p2->Y);   /* H = 2*Y2 */\
+\
+    p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \
+    sqr_##field(add.H, p1->Z);          /* Z1^2 */\
+    mul_##field(add.R, add.H, p1->Z);   /* Z1^3 */\
+    mul_##field(add.R, add.R, p2->Y);   /* S2 = Y2*Z1^3 */\
+    sub_##field(add.R, add.R, p1->Y);   /* R = S2-Y1 */\
+\
+    mul_##field(add.H, add.H, p2->X);   /* U2 = X2*Z1^2 */\
+\
+    add_##field(add.sx, add.H, p1->X);  /* sx = X1+U2 */\
+    sub_##field(add.H, add.H, p1->X);   /* H = U2-X1 */\
+\
+    mul_##field(p3.Z, add.H, p1->Z);    /* Z3 = H*Z1 */\
+\
+    /* make the choice between addition and doubling */ \
+    is_dbl = vec_is_zero(add.H, 2*sizeof(add.H));       \
+    vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl);   \
+    vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\
+    vec_select(&add, &dbl, &add, sizeof(add), is_dbl);  \
+    /* |p3| and |add| hold all inputs now, |p3| will hold output */\
+\
+    sqr_##field(dbl.H, add.H);          /* H^2 */\
+    mul_##field(dbl.R, dbl.H, add.H);   /* H^3 */\
+    mul_##field(dbl.R, dbl.R, p3.Y);    /* H^3*S1 */\
+    mul_##field(p3.Y, dbl.H, p3.X);     /* H^2*U1 */\
+\
+    mul_##field(dbl.H, dbl.H, add.sx);  /* H^2*sx */\
+    sqr_##field(p3.X, add.R);           /* R^2 */\
+    sub_##field(p3.X, p3.X, dbl.H);     /* X3 = R^2-H^2*sx */\
+\
+    sub_##field(p3.Y, p3.Y, p3.X);      /* H^2*U1-X3 */\
+    mul_##field(p3.Y, p3.Y, add.R);     /* R*(H^2*U1-X3) */\
+    sub_##field(p3.Y, p3.Y, dbl.R);     /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\
+\
+    vec_select(p3.X, p2,  p3.X, 2*sizeof(p3.X), p1inf); \
+    vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \
+    vec_select(out, p1, &p3, sizeof(ptype), p2inf); \
+}
+
+/*
+ * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
+ * with twist to handle either input at infinity, which are encoded as Z==0.
+ */
+#define POINT_ADD_IMPL(ptype, bits, field) \
+static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \
+{ \
+    ptype p3; \
+    vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \
+    bool_t p1inf, p2inf; \
+\
+    p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \
+    sqr_##field(Z1Z1, p1->Z);           /* Z1Z1 = Z1^2 */\
+\
+    mul_##field(p3.Z, Z1Z1, p1->Z);     /* Z1*Z1Z1 */\
+    mul_##field(p3.Z, p3.Z, p2->Y);     /* S2 = Y2*Z1*Z1Z1 */\
+\
+    p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \
+    sqr_##field(Z2Z2, p2->Z);           /* Z2Z2 = Z2^2 */\
+\
+    mul_##field(S1, Z2Z2, p2->Z);       /* Z2*Z2Z2 */\
+    mul_##field(S1, S1, p1->Y);         /* S1 = Y1*Z2*Z2Z2 */\
+\
+    sub_##field(p3.Z, p3.Z, S1);        /* S2-S1 */\
+    add_##field(p3.Z, p3.Z, p3.Z);      /* r = 2*(S2-S1) */\
+\
+    mul_##field(U1, p1->X, Z2Z2);       /* U1 = X1*Z2Z2 */\
+    mul_##field(H,  p2->X, Z1Z1);       /* U2 = X2*Z1Z1 */\
+\
+    sub_##field(H, H, U1);              /* H = U2-U1 */\
+\
+    add_##field(I, H, H);               /* 2*H */\
+    sqr_##field(I, I);                  /* I = (2*H)^2 */\
+\
+    mul_##field(J, H, I);               /* J = H*I */\
+    mul_##field(S1, S1, J);             /* S1*J */\
+\
+    mul_##field(p3.Y, U1, I);           /* V = U1*I */\
+\
+    sqr_##field(p3.X, p3.Z);            /* r^2 */\
+    sub_##field(p3.X, p3.X, J);         /* r^2-J */\
+    sub_##field(p3.X, p3.X, p3.Y);      \
+    sub_##field(p3.X, p3.X, p3.Y);      /* X3 = r^2-J-2*V */\
+\
+    sub_##field(p3.Y, p3.Y, p3.X);      /* V-X3 */\
+    mul_##field(p3.Y, p3.Y, p3.Z);      /* r*(V-X3) */\
+    sub_##field(p3.Y, p3.Y, S1);        \
+    sub_##field(p3.Y, p3.Y, S1);        /* Y3 = r*(V-X3)-2*S1*J */\
+\
+    add_##field(p3.Z, p1->Z, p2->Z);    /* Z1+Z2 */\
+    sqr_##field(p3.Z, p3.Z);            /* (Z1+Z2)^2 */\
+    sub_##field(p3.Z, p3.Z, Z1Z1);      /* (Z1+Z2)^2-Z1Z1 */\
+    sub_##field(p3.Z, p3.Z, Z2Z2);      /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\
+    mul_##field(p3.Z, p3.Z, H);         /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\
+\
+    vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \
+    vec_select(out, p2, &p3, sizeof(ptype), p1inf); \
+}
+
+/*
+ * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
+ * with twist to handle either input at infinity, with |p1| encoded as Z==0,
+ * and |p2| as X==Y==0.
+ */
+#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \
+static void ptype##_add_affine(ptype *out, const ptype *p1, \
+                                           const ptype##_affine *p2) \
+{ \
+    ptype p3; \
+    vec##bits Z1Z1, H, HH, I, J; \
+    bool_t p1inf, p2inf; \
+\
+    p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \
+\
+    sqr_##field(Z1Z1, p1->Z);           /* Z1Z1 = Z1^2 */\
+\
+    mul_##field(p3.Z, Z1Z1, p1->Z);     /* Z1*Z1Z1 */\
+    mul_##field(p3.Z, p3.Z, p2->Y);     /* S2 = Y2*Z1*Z1Z1 */\
+\
+    p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \
+\
+    mul_##field(H, p2->X, Z1Z1);        /* U2 = X2*Z1Z1 */\
+    sub_##field(H, H, p1->X);           /* H = U2-X1 */\
+\
+    sqr_##field(HH, H);                 /* HH = H^2 */\
+    add_##field(I, HH, HH);             \
+    add_##field(I, I, I);               /* I = 4*HH */\
+\
+    mul_##field(p3.Y, p1->X, I);        /* V = X1*I */\
+    mul_##field(J, H, I);               /* J = H*I */\
+    mul_##field(I, J, p1->Y);           /* Y1*J */\
+\
+    sub_##field(p3.Z, p3.Z, p1->Y);     /* S2-Y1 */\
+    add_##field(p3.Z, p3.Z, p3.Z);      /* r = 2*(S2-Y1) */\
+\
+    sqr_##field(p3.X, p3.Z);            /* r^2 */\
+    sub_##field(p3.X, p3.X, J);         /* r^2-J */\
+    sub_##field(p3.X, p3.X, p3.Y);      \
+    sub_##field(p3.X, p3.X, p3.Y);      /* X3 = r^2-J-2*V */\
+\
+    sub_##field(p3.Y, p3.Y, p3.X);      /* V-X3 */\
+    mul_##field(p3.Y, p3.Y, p3.Z);      /* r*(V-X3) */\
+    sub_##field(p3.Y, p3.Y, I);         \
+    sub_##field(p3.Y, p3.Y, I);         /* Y3 = r*(V-X3)-2*Y1*J */\
+\
+    add_##field(p3.Z, p1->Z, H);        /* Z1+H */\
+    sqr_##field(p3.Z, p3.Z);            /* (Z1+H)^2 */\
+    sub_##field(p3.Z, p3.Z, Z1Z1);      /* (Z1+H)^2-Z1Z1 */\
+    sub_##field(p3.Z, p3.Z, HH);        /* Z3 = (Z1+H)^2-Z1Z1-HH */\
+\
+    vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \
+    vec_select(p3.X, p2,  p3.X, 2*sizeof(p3.X), p1inf); \
+    vec_select(out, p1, &p3, sizeof(ptype), p2inf); \
+}
+
+/*
+ * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
+ */
+#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \
+static void ptype##_double(ptype *p3, const ptype *p1) \
+{ \
+    vec##bits A, B, C; \
+\
+    sqr_##field(A, p1->X);              /* A = X1^2 */\
+    sqr_##field(B, p1->Y);              /* B = Y1^2 */\
+    sqr_##field(C, B);                  /* C = B^2 */\
+\
+    add_##field(B, B, p1->X);           /* X1+B */\
+    sqr_##field(B, B);                  /* (X1+B)^2 */\
+    sub_##field(B, B, A);               /* (X1+B)^2-A */\
+    sub_##field(B, B, C);               /* (X1+B)^2-A-C */\
+    add_##field(B, B, B);               /* D = 2*((X1+B)^2-A-C) */\
+\
+    mul_by_3_##field(A, A);             /* E = 3*A */\
+\
+    sqr_##field(p3->X, A);              /* F = E^2 */\
+    sub_##field(p3->X, p3->X, B);       \
+    sub_##field(p3->X, p3->X, B);       /* X3 = F-2*D */\
+\
+    add_##field(p3->Z, p1->Z, p1->Z);   /* 2*Z1 */\
+    mul_##field(p3->Z, p3->Z, p1->Y);   /* Z3 = 2*Z1*Y1 */\
+\
+    mul_by_8_##field(C, C);             /* 8*C */\
+    sub_##field(p3->Y, B, p3->X);       /* D-X3 */\
+    mul_##field(p3->Y, p3->Y, A);       /* E*(D-X3) */\
+    sub_##field(p3->Y, p3->Y, C);       /* Y3 = E*(D-X3)-8*C */\
+}
+
+#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \
+static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \
+{ \
+    mul_##field(pxz->X, p->X, p->Z);    /* X2 = X1*Z1 */\
+    sqr_##field(pxz->Z, p->Z);          \
+    mul_##field(pxz->Z, pxz->Z, p->Z);  /* Z2 = Z1^3 */\
+}
+
+/*
+ * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3
+ * with twist to handle either input at infinity, which are encoded as Z==0.
+ * Just in case, the order of doubling and addition is reversed in comparison
+ * to the hyperelliptic.org entry. This was done to minimize temporary storage.
+ *
+ * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|.
+ */
+#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \
+static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \
+                                  const ptype##xz *p) \
+{ \
+    ptype##xz p5; \
+    vec##bits A, B, C, D, XX, ZZ; \
+    bool_t r_inf, s_inf; \
+                                        /* s += r */\
+    mul_##field(A, r->X, s->X);         /* A = X2*X3 */\
+    mul_##field(B, r->Z, s->Z);         /* B = Z2*Z3 */\
+    mul_##field(C, r->X, s->Z);         /* C = X2*Z3 */\
+    mul_##field(D, r->Z, s->X);         /* D = X3*Z2 */\
+\
+    sqr_##field(A, A);                  /* (A[-a*B])^2 */\
+    add_##field(p5.X, C, D);            /* C+D */\
+    mul_##field(p5.X, p5.X, B);         /* B*(C+D) */\
+    mul_by_4b_##suffix4b(B, p5.X);      /* b4*B*(C+D) */\
+    sub_##field(p5.X, A, B);            /* (A[-a*B])^2-b4*B*(C+D) */\
+    mul_##field(p5.X, p5.X, p->Z);      /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\
+\
+    sub_##field(p5.Z, C, D);            /* C-D */\
+    sqr_##field(p5.Z, p5.Z);            /* (C-D)^2 */\
+    mul_##field(p5.Z, p5.Z, p->X);      /* Z5 = X1*(C-D)^2 */\
+\
+    r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \
+    s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \
+\
+    vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \
+    vec_select(s,   s, &p5, sizeof(ptype##xz), r_inf); \
+                                        /* r *= 2 */\
+    sqr_##field(XX, r->X);              /* XX = X2^2 */\
+    sqr_##field(ZZ, r->Z);              /* ZZ = Z2^2 */\
+\
+    add_##field(r->Z, r->X, r->Z);      /* X2+Z2 */\
+    sqr_##field(r->Z, r->Z);            /* (X2+Z2)^2 */\
+    sub_##field(r->Z, r->Z, XX);        /* (X2+Z2)^2-XX */\
+    sub_##field(r->Z, r->Z, ZZ);        /* E = (X2+Z2)^2-XX-ZZ */\
+\
+    sqr_##field(A, XX);                 /* (XX[-a*ZZ])^2 */\
+    mul_##field(B, r->Z, ZZ);           /* E*ZZ */\
+    mul_by_4b_##suffix4b(C, B);         /* b4*E*ZZ */\
+    sub_##field(r->X, A, C);            /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\
+\
+    sqr_##field(ZZ, ZZ);                /* ZZ^2 */\
+    mul_by_4b_##suffix4b(B, ZZ);        /* b4*ZZ^2 */\
+    mul_##field(r->Z, r->Z, XX);        /* E*(XX[+a*ZZ]) */\
+    add_##field(r->Z, r->Z, r->Z);      /* 2*E*(XX[+a*ZZ]) */\
+    add_##field(r->Z, r->Z, B);         /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\
+}
+
+/*
+ * Recover |r|'s y-coordinate using Eq. (8) from Brier-Joye,
+ * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist
+ * and conversion to Jacobian coordinates from <openssl>/.../ecp_smpl.c,
+ * and with twist to recover from |s| at infinity [which occurs when
+ * multiplying by (order-1)].
+ *
+ * X4 = 2*Y1*X2*Z3*Z1*Z2
+ * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2
+ * Z4 = 2*Y1*Z3*Z2^2*Z1
+ *
+ * Z3x2 = 2*Z3
+ * Y1Z3x2 = Y1*Z3x2
+ * Z1Z2 = Z1*Z2
+ * X1Z2 = X1*Z2
+ * X2Z1 = X2*Z1
+ * X4 = Y1Z3x2*X2*Z1Z2
+ * A = b*Z3x2*(Z1Z2)^2
+ * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1)
+ * C = X3*(X1Z2-X2Z1)^2
+ * Y4 = A+B-C
+ * Z4 = Y1Z3x2*Z1Z2*Z2
+ *
+ * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0.
+ */
+#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \
+static void ptype##xz_ladder_post(ptype *p4, \
+                                  const ptype##xz *r, const ptype##xz *s, \
+                                  const ptype##xz *p, const vec##bits Y1) \
+{ \
+    vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \
+    bool_t s_inf; \
+\
+    add_##field(Z3x2, s->Z, s->Z);      /* Z3x2 = 2*Z3 */\
+    mul_##field(Y1Z3x2, Y1, Z3x2);      /* Y1Z3x2 = Y1*Z3x2 */\
+    mul_##field(Z1Z2, p->Z, r->Z);      /* Z1Z2 = Z1*Z2 */\
+    mul_##field(X1Z2, p->X, r->Z);      /* X1Z2 = X1*Z2 */\
+    mul_##field(X2Z1, r->X, p->Z);      /* X2Z1 = X2*Z1 */\
+\
+    mul_##field(p4->X, Y1Z3x2, r->X);   /* Y1Z3x2*X2 */\
+    mul_##field(p4->X, p4->X, Z1Z2);    /* X4 = Y1Z3x2*X2*Z1Z2 */\
+\
+    sqr_##field(A, Z1Z2);               /* (Z1Z2)^2 */\
+    mul_##field(B, A, Z3x2);            /* Z3x2*(Z1Z2)^2 */\
+    mul_by_b_##suffixb(A, B);           /* A = b*Z3x2*(Z1Z2)^2 */\
+\
+    mul_##field(B, p->X, r->X);         /* [a*Z1Z2+]X1*X2 */\
+    mul_##field(B, B, s->Z);            /* Z3*([a*Z1Z2+]X1*X2) */\
+    add_##field(C, X1Z2, X2Z1);         /* X1Z2+X2Z1 */\
+    mul_##field(B, B, C);               /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\
+\
+    sub_##field(C, X1Z2, X2Z1);         /* X1Z2-X2Z1 */\
+    sqr_##field(C, C);                  /* (X1Z2-X2Z1)^2 */\
+    mul_##field(C, C, s->X);            /* C = X3*(X1Z2-X2Z1)^2 */\
+\
+    add_##field(A, A, B);               /* A+B */\
+    sub_##field(A, A, C);               /* Y4 = A+B-C */\
+\
+    mul_##field(p4->Z, Z1Z2, r->Z);     /* Z1Z2*Z2 */\
+    mul_##field(p4->Z, p4->Z, Y1Z3x2);  /* Y1Z3x2*Z1Z2*Z2 */\
+\
+    s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \
+    vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \
+    vec_select(p4->Y, Y1,   A,     sizeof(p4->Y), s_inf); \
+    vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \
+    ptype##_cneg(p4, s_inf); \
+                                        /* to Jacobian */\
+    mul_##field(p4->X, p4->X, p4->Z);   /* X4 = X4*Z4 */\
+    sqr_##field(B, p4->Z);              \
+    mul_##field(p4->Y, p4->Y, B);       /* Y4 = Y4*Z4^2 */\
+}
+
+#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \
+static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \
+{ \
+    vec##bits Z1Z1, Z2Z2; \
+    ptype##_affine a1, a2; \
+    bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \
+    bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \
+\
+    sqr_##field(Z1Z1, p1->Z);           /* Z1Z1 = Z1^2 */\
+    sqr_##field(Z2Z2, p2->Z);           /* Z2Z2 = Z2^2 */\
+\
+    mul_##field(a1.X, p1->X, Z2Z2);     /* U1 = X1*Z2Z2 */\
+    mul_##field(a2.X, p2->X, Z1Z1);     /* U2 = X2*Z1Z1 */\
+\
+    mul_##field(a1.Y, p1->Y, p2->Z);    /* Y1*Z2 */\
+    mul_##field(a2.Y, p2->Y, p1->Z);    /* Y2*Z1 */\
+\
+    mul_##field(a1.Y, a1.Y, Z2Z2);      /* S1 = Y1*Z2*Z2Z2 */\
+    mul_##field(a2.Y, a2.Y, Z1Z1);      /* S2 = Y2*Z1*Z1Z1 */\
+\
+    return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \
+}
+
+/*
+ * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle
+ * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5|
+ * and replacing few first references to |X3| in the formula, up to step
+ * 21, with it. 12M[+27A], doubling and infinity are handled by the
+ * formula itself. Infinity is to be encoded as [0, !0, 0].
+ */
+#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \
+static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \
+                                              const ptype##proj *p2) \
+{ \
+    vec##bits t0, t1, t2, t3, t4, t5; \
+\
+    mul_##field(t0, p1->X, p2->X);      /* 1.     t0 = X1*X2 */\
+    mul_##field(t1, p1->Y, p2->Y);      /* 2.     t1 = Y1*Y2 */\
+    mul_##field(t2, p1->Z, p2->Z);      /* 3.     t2 = Z1*Z2 */\
+    add_##field(t3, p1->X, p1->Y);      /* 4.     t3 = X1+Y1 */\
+    add_##field(t4, p2->X, p2->Y);      /* 5.     t4 = X2+Y2 */\
+    mul_##field(t3, t3, t4);            /* 6.     t3 = t3*t4 */\
+    add_##field(t4, t0, t1);            /* 7.     t4 = t0+t1 */\
+    sub_##field(t3, t3, t4);            /* 8.     t3 = t3-t4 */\
+    add_##field(t4, p1->Y, p1->Z);      /* 9.     t4 = Y1+Z1 */\
+    add_##field(t5, p2->Y, p2->Z);      /* 10.    t5 = Y2+Z2 */\
+    mul_##field(t4, t4, t5);            /* 11.    t4 = t4*t5 */\
+    add_##field(t5, t1, t2);            /* 12.    t5 = t1+t2 */\
+    sub_##field(t4, t4, t5);            /* 13.    t4 = t4-t5 */\
+    add_##field(t5, p1->X, p1->Z);      /* 14.    t5 = X1+Z1 */\
+    add_##field(p3->Y, p2->X, p2->Z);   /* 15.    Y3 = X2+Z2 */\
+    mul_##field(t5, t5, p3->Y);         /* 16.    t5 = t5*Y3 */\
+    add_##field(p3->Y, t0, t2);         /* 17.    Y3 = t0+t2 */\
+    sub_##field(p3->Y, t5, p3->Y);      /* 18.    Y3 = t5-Y3 */\
+    mul_by_3_##field(t0, t0);           /* 19-20. t0 = 3*t0  */\
+    mul_by_3_##field(t5, t2);           /* 21.    t5 = 3*t2  */\
+    mul_by_b_##suffixb(t2, t5);         /* 21.    t2 = b*t5  */\
+    add_##field(p3->Z, t1, t2);         /* 22.    Z3 = t1+t2 */\
+    sub_##field(t1, t1, t2);            /* 23.    t1 = t1-t2 */\
+    mul_by_3_##field(t5, p3->Y);        /* 24.    t5 = 3*Y3  */\
+    mul_by_b_##suffixb(p3->Y, t5);      /* 24.    Y3 = b*t5  */\
+    mul_##field(p3->X, t4, p3->Y);      /* 25.    X3 = t4*Y3 */\
+    mul_##field(t2, t3, t1);            /* 26.    t2 = t3*t1 */\
+    sub_##field(p3->X, t2, p3->X);      /* 27.    X3 = t2-X3 */\
+    mul_##field(p3->Y, p3->Y, t0);      /* 28.    Y3 = Y3*t0 */\
+    mul_##field(t1, t1, p3->Z);         /* 29.    t1 = t1*Z3 */\
+    add_##field(p3->Y, t1, p3->Y);      /* 30.    Y3 = t1+Y3 */\
+    mul_##field(t0, t0, t3);            /* 31.    t0 = t0*t3 */\
+    mul_##field(p3->Z, p3->Z, t4);      /* 32.    Z3 = Z3*t4 */\
+    add_##field(p3->Z, p3->Z, t0);      /* 33.    Z3 = Z3+t0 */\
+}
+
+/*
+ * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle
+ * |p2| being infinity encoded as [0, 0]. 11M[+21A].
+ */
+#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \
+static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \
+                                                      const ptype##_affine *p2) \
+{ \
+    ptype##proj p3[1]; \
+    vec##bits t0, t1, t2, t3, t4; \
+    limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \
+\
+    mul_##field(t0, p1->X, p2->X);      /* 1.     t0 = X1*X2 */\
+    mul_##field(t1, p1->Y, p2->Y);      /* 2.     t1 = Y1*Y2 */\
+    add_##field(t3, p1->X, p1->Y);      /* 3.     t3 = X1+Y1 */\
+    add_##field(t4, p2->X, p2->Y);      /* 4.     t4 = X2+Y2 */\
+    mul_##field(t3, t3, t4);            /* 5.     t3 = t3*t4 */\
+    add_##field(t4, t0, t1);            /* 6.     t4 = t0+t1 */\
+    sub_##field(t3, t3, t4);            /* 7.     t3 = t3-t4 */\
+    mul_##field(t4, p2->Y, p1->Z);      /* 8.     t4 = Y2*Z1 */\
+    add_##field(t4, t4, p1->Y);         /* 9.     t4 = t4+Y1 */\
+    mul_##field(p3->Y, p2->X, p1->Z);   /* 10.    Y3 = X2*Z1 */\
+    add_##field(p3->Y, p3->Y, p1->X);   /* 11.    Y3 = Y3+X1 */\
+    mul_by_3_##field(t0, t0);           /* 12-13. t0 = 3*t0  */\
+    mul_by_b_##suffixb(t2, p1->Z);      /* 14.    t2 = b*Z1  */\
+    mul_by_3_##field(t2, t2);           /* 14.    t2 = 3*t2  */\
+    add_##field(p3->Z, t1, t2);         /* 15.    Z3 = t1+t2 */\
+    sub_##field(t1, t1, t2);            /* 16.    t1 = t1-t2 */\
+    mul_by_b_##suffixb(t2, p3->Y);      /* 17.    t2 = b*Y3  */\
+    mul_by_3_##field(p3->Y, t2);        /* 17.    Y3 = 3*t2  */\
+    mul_##field(p3->X, t4, p3->Y);      /* 18.    X3 = t4*Y3 */\
+    mul_##field(t2, t3, t1);            /* 19.    t2 = t3*t1 */\
+    sub_##field(p3->X, t2, p3->X);      /* 20.    X3 = t2-X3 */\
+    mul_##field(p3->Y, p3->Y, t0);      /* 21.    Y3 = Y3*t0 */\
+    mul_##field(t1, t1, p3->Z);         /* 22.    t1 = t1*Z3 */\
+    add_##field(p3->Y, t1, p3->Y);      /* 23.    Y3 = t1+Y3 */\
+    mul_##field(t0, t0, t3);            /* 24.    t0 = t0*t3 */\
+    mul_##field(p3->Z, p3->Z, t4);      /* 25.    Z3 = Z3*t4 */\
+    add_##field(p3->Z, p3->Z, t0);      /* 26.    Z3 = Z3+t0 */\
+\
+    vec_select(out, p1, p3, sizeof(*out), p2inf); \
+}
+
+/*
+ * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle
+ * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y
+ * and reordering operations to bring references to |p1| forward.
+ * 6M+2S[+13A].
+ */
+#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \
+static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \
+{ \
+    vec##bits t0, t1, t2, t3; \
+\
+    sqr_##field(t0, p1->Y);             /* 1.     t0 = Y*Y   */\
+    mul_##field(t1, p1->Y, p1->Z);      /* 5.     t1 = Y*Z   */\
+    sqr_##field(t2, p1->Z);             /* 6.     t2 = Z*Z   */\
+    mul_##field(t3, p1->X, p1->Y);      /* 16.    t3 = X*Y   */\
+    lshift_##field(p3->Z, t0, 3);       /* 2-4.   Z3 = 8*t0  */\
+    mul_by_b_##suffixb(p3->X, t2);      /* 7.     t2 = b*t2  */\
+    mul_by_3_##field(t2, p3->X);        /* 7.     t2 = 3*t2  */\
+    mul_##field(p3->X, t2, p3->Z);      /* 8.     X3 = t2*Z3 */\
+    add_##field(p3->Y, t0, t2);         /* 9.     Y3 = t0+t2 */\
+    mul_##field(p3->Z, t1, p3->Z);      /* 10.    Z3 = t1*Z3 */\
+    mul_by_3_##field(t2, t2);           /* 11-12. t2 = 3*t2  */\
+    sub_##field(t0, t0, t2);            /* 13.    t0 = t0-t2 */\
+    mul_##field(p3->Y, t0, p3->Y);      /* 14.    Y3 = t0*Y3 */\
+    add_##field(p3->Y, p3->X, p3->Y);   /* 15.    Y3 = X3+Y3 */\
+    mul_##field(p3->X, t0, t3);         /* 17.    X3 = t0*t3 */\
+    add_##field(p3->X, p3->X, p3->X);   /* 18.    X3 = X3+X3 */\
+}
+
+#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \
+static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \
+{ \
+    vec##bits ZZ; \
+\
+    sqr_##field(ZZ, in->Z); \
+    mul_##field(out->X, in->X, in->Z); \
+    mul_##field(out->Y, in->Y, ZZ); \
+    vec_copy(out->Z, in->Z, sizeof(out->Z)); \
+}
+
+#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \
+static void ptype##_to_projective(ptype##proj *out, const ptype *in) \
+{ \
+    vec##bits ZZ; \
+    limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \
+\
+    sqr_##field(ZZ, in->Z); \
+    mul_##field(out->X, in->X, in->Z); \
+    vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \
+    mul_##field(out->Z, ZZ, in->Z); \
+}
+
+/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/
+
+/*
+ * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s
+ * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+ * with twist to handle either input at infinity. Addition costs 12M+2S,
+ * while conditional doubling costs 4M+6M+3S.
+ */
+#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \
+static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \
+                                              const ptype##xyzz *p2) \
+{ \
+    vec##bits U, S, P, R; \
+\
+    if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \
+        vec_copy(p3, p1, sizeof(*p3));  \
+        return; \
+    } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \
+        vec_copy(p3, p2, sizeof(*p3));  \
+        return; \
+    } \
+\
+    mul_##field(U, p1->X, p2->ZZ);              /* U1 = X1*ZZ2 */\
+    mul_##field(S, p1->Y, p2->ZZZ);             /* S1 = Y1*ZZZ2 */\
+    mul_##field(P, p2->X, p1->ZZ);              /* U2 = X2*ZZ1 */\
+    mul_##field(R, p2->Y, p1->ZZZ);             /* S2 = Y2*ZZZ1 */\
+    sub_##field(P, P, U);                       /* P = U2-U1 */\
+    sub_##field(R, R, S);                       /* R = S2-S1 */\
+\
+    if (!vec_is_zero(P, sizeof(P))) {           /* X1!=X2 */\
+        vec##bits PP, PPP, Q;                   /* add |p1| and |p2| */\
+\
+        sqr_##field(PP, P);                     /* PP = P^2 */\
+        mul_##field(PPP, PP, P);                /* PPP = P*PP */\
+        mul_##field(Q, U, PP);                  /* Q = U1*PP */\
+        sqr_##field(p3->X, R);                  /* R^2 */\
+        add_##field(P, Q, Q); \
+        sub_##field(p3->X, p3->X, PPP);         /* R^2-PPP */\
+        sub_##field(p3->X, p3->X, P);           /* X3 = R^2-PPP-2*Q */\
+        sub_##field(Q, Q, p3->X); \
+        mul_##field(Q, Q, R);                   /* R*(Q-X3) */\
+        mul_##field(p3->Y, S, PPP);             /* S1*PPP */\
+        sub_##field(p3->Y, Q, p3->Y);           /* Y3 = R*(Q-X3)-S1*PPP */\
+        mul_##field(p3->ZZ, p1->ZZ, p2->ZZ);    /* ZZ1*ZZ2 */\
+        mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\
+        mul_##field(p3->ZZ, p3->ZZ, PP);        /* ZZ3 = ZZ1*ZZ2*PP */\
+        mul_##field(p3->ZZZ, p3->ZZZ, PPP);     /* ZZZ3 = ZZZ1*ZZZ2*PPP */\
+    } else if (vec_is_zero(R, sizeof(R))) {     /* X1==X2 && Y1==Y2 */\
+        vec##bits V, W, M;                      /* double |p1| */\
+\
+        add_##field(U, p1->Y, p1->Y);           /* U = 2*Y1 */\
+        sqr_##field(V, U);                      /* V = U^2 */\
+        mul_##field(W, V, U);                   /* W = U*V */\
+        mul_##field(S, p1->X, V);               /* S = X1*V */\
+        sqr_##field(M, p1->X); \
+        mul_by_3_##field(M, M);                 /* M = 3*X1^2[+a*ZZ1^2] */\
+        sqr_##field(p3->X, M); \
+        add_##field(U, S, S);                   /* 2*S */\
+        sub_##field(p3->X, p3->X, U);           /* X3 = M^2-2*S */\
+        mul_##field(p3->Y, W, p1->Y);           /* W*Y1 */\
+        sub_##field(S, S, p3->X); \
+        mul_##field(S, S, M);                   /* M*(S-X3) */\
+        sub_##field(p3->Y, S, p3->Y);           /* Y3 = M*(S-X3)-W*Y1 */\
+        mul_##field(p3->ZZ, p1->ZZ, V);         /* ZZ3 = V*ZZ1 */\
+        mul_##field(p3->ZZZ, p1->ZZZ, W);       /* ZZ3 = W*ZZZ1 */\
+    } else {                                    /* X1==X2 && Y1==-Y2 */\
+        vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ));   /* set |p3| to infinity */\
+    } \
+}
+
+/*
+ * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s
+ * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1
+ * with twists to handle even subtractions and either input at infinity.
+ * Addition costs 8M+2S, while conditional doubling costs 2M+4M+3S.
+ */
+#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \
+static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \
+                                                     const ptype##_affine *p2, \
+                                                     bool_t subtract) \
+{ \
+    vec##bits P, R; \
+\
+    if (vec_is_zero(p2, sizeof(*p2))) { \
+        vec_copy(p3, p1, sizeof(*p3));  \
+        return; \
+    } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \
+        vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\
+        cneg_##field(p3->ZZZ, one, subtract);   \
+        vec_copy(p3->ZZ, one, sizeof(p3->ZZ));  \
+        return; \
+    } \
+\
+    mul_##field(P, p2->X, p1->ZZ);              /* U2 = X2*ZZ1 */\
+    mul_##field(R, p2->Y, p1->ZZZ);             /* S2 = Y2*ZZZ1 */\
+    cneg_##field(R, R, subtract); \
+    sub_##field(P, P, p1->X);                   /* P = U2-X1 */\
+    sub_##field(R, R, p1->Y);                   /* R = S2-Y1 */\
+\
+    if (!vec_is_zero(P, sizeof(P))) {           /* X1!=X2 */\
+        vec##bits PP, PPP, Q;                   /* add |p2| to |p1| */\
+\
+        sqr_##field(PP, P);                     /* PP = P^2 */\
+        mul_##field(PPP, PP, P);                /* PPP = P*PP */\
+        mul_##field(Q, p1->X, PP);              /* Q = X1*PP */\
+        sqr_##field(p3->X, R);                  /* R^2 */\
+        add_##field(P, Q, Q); \
+        sub_##field(p3->X, p3->X, PPP);         /* R^2-PPP */\
+        sub_##field(p3->X, p3->X, P);           /* X3 = R^2-PPP-2*Q */\
+        sub_##field(Q, Q, p3->X); \
+        mul_##field(Q, Q, R);                   /* R*(Q-X3) */\
+        mul_##field(p3->Y, p1->Y, PPP);         /* Y1*PPP */\
+        sub_##field(p3->Y, Q, p3->Y);           /* Y3 = R*(Q-X3)-Y1*PPP */\
+        mul_##field(p3->ZZ, p1->ZZ, PP);        /* ZZ3 = ZZ1*PP */\
+        mul_##field(p3->ZZZ, p1->ZZZ, PPP);     /* ZZZ3 = ZZZ1*PPP */\
+    } else if (vec_is_zero(R, sizeof(R))) {     /* X1==X2 && Y1==Y2 */\
+        vec##bits U, S, M;                      /* double |p2| */\
+\
+        add_##field(U, p2->Y, p2->Y);           /* U = 2*Y1 */\
+        sqr_##field(p3->ZZ, U);                 /* [ZZ3 =] V = U^2 */\
+        mul_##field(p3->ZZZ, p3->ZZ, U);        /* [ZZZ3 =] W = U*V */\
+        mul_##field(S, p2->X, p3->ZZ);          /* S = X1*V */\
+        sqr_##field(M, p2->X); \
+        mul_by_3_##field(M, M);                 /* M = 3*X1^2[+a] */\
+        sqr_##field(p3->X, M); \
+        add_##field(U, S, S);                   /* 2*S */\
+        sub_##field(p3->X, p3->X, U);           /* X3 = M^2-2*S */\
+        mul_##field(p3->Y, p3->ZZZ, p2->Y);     /* W*Y1 */\
+        sub_##field(S, S, p3->X); \
+        mul_##field(S, S, M);                   /* M*(S-X3) */\
+        sub_##field(p3->Y, S, p3->Y);           /* Y3 = M*(S-X3)-W*Y1 */\
+        cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \
+    } else {                                    /* X1==X2 && Y1==-Y2 */\
+        vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ));   /* set |p3| to infinity */\
+    } \
+}
+
+#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \
+static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \
+{ \
+    mul_##field(out->X, in->X, in->ZZ); \
+    mul_##field(out->Y, in->Y, in->ZZZ); \
+    vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \
+}
+
+#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \
+static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \
+{ \
+    vec_copy(out->X, in->X, 2*sizeof(out->X)); \
+    sqr_##field(out->ZZ, in->Z); \
+    mul_##field(out->ZZZ, out->ZZ, in->Z); \
+}
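+
+/*
+ * The xyzz routines above use extended Jacobian ("XYZZ") coordinates: a
+ * point is stored as (X, Y, ZZ, ZZZ) with affine x = X/ZZ, y = Y/ZZZ and
+ * the invariant ZZ^3 == ZZZ^2, which is what makes the two conversions
+ * above (X*ZZ, Y*ZZZ, Z=ZZ one way; ZZ=Z^2, ZZZ=Z^3 the other) agree
+ * with the Jacobian convention x = X/Z^2, y = Y/Z^3.
+ */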
+
+#endif
diff --git a/blst/elf/add_mod_256-armv8.S b/blst/elf/add_mod_256-armv8.S
new file mode 100644
index 0000000..57476aa
--- /dev/null
+++ b/blst/elf/add_mod_256-armv8.S
@@ -0,0 +1,379 @@
+.text
+
+.globl	add_mod_256
+.hidden	add_mod_256
+.type	add_mod_256,%function
+.align	5
+add_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x12,x13,[x2]
+
+	ldp	x10,x11,[x1,#16]
+	adds	x8,x8,x12
+	ldp	x14,x15,[x2,#16]
+	adcs	x9,x9,x13
+	ldp	x4,x5,[x3]
+	adcs	x10,x10,x14
+	ldp	x6,x7,[x3,#16]
+	adcs	x11,x11,x15
+	adc	x3,xzr,xzr
+
+	subs	x16,x8,x4
+	sbcs	x17,x9,x5
+	sbcs	x1,x10,x6
+	sbcs	x2,x11,x7
+	sbcs	xzr,x3,xzr
+
+	csel	x8,x8,x16,lo
+	csel	x9,x9,x17,lo
+	csel	x10,x10,x1,lo
+	stp	x8,x9,[x0]
+	csel	x11,x11,x2,lo
+	stp	x10,x11,[x0,#16]
+
+	ret
+.size	add_mod_256,.-add_mod_256
+
+.globl	mul_by_3_mod_256
+.hidden	mul_by_3_mod_256
+.type	mul_by_3_mod_256,%function
+.align	5
+mul_by_3_mod_256:
+	ldp	x12,x13,[x1]
+	ldp	x14,x15,[x1,#16]
+
+	adds	x8,x12,x12
+	ldp	x4,x5,[x2]
+	adcs	x9,x13,x13
+	ldp	x6,x7,[x2,#16]
+	adcs	x10,x14,x14
+	adcs	x11,x15,x15
+	adc	x3,xzr,xzr
+
+	subs	x16,x8,x4
+	sbcs	x17,x9,x5
+	sbcs	x1,x10,x6
+	sbcs	x2,x11,x7
+	sbcs	xzr,x3,xzr
+
+	csel	x8,x8,x16,lo
+	csel	x9,x9,x17,lo
+	csel	x10,x10,x1,lo
+	csel	x11,x11,x2,lo
+
+	adds	x8,x8,x12
+	adcs	x9,x9,x13
+	adcs	x10,x10,x14
+	adcs	x11,x11,x15
+	adc	x3,xzr,xzr
+
+	subs	x16,x8,x4
+	sbcs	x17,x9,x5
+	sbcs	x1,x10,x6
+	sbcs	x2,x11,x7
+	sbcs	xzr,x3,xzr
+
+	csel	x8,x8,x16,lo
+	csel	x9,x9,x17,lo
+	csel	x10,x10,x1,lo
+	stp	x8,x9,[x0]
+	csel	x11,x11,x2,lo
+	stp	x10,x11,[x0,#16]
+
+	ret
+.size	mul_by_3_mod_256,.-mul_by_3_mod_256
+
+.globl	lshift_mod_256
+.hidden	lshift_mod_256
+.type	lshift_mod_256,%function
+.align	5
+lshift_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x10,x11,[x1,#16]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+
+.Loop_lshift_mod_256:
+	adds	x8,x8,x8
+	sub	x2,x2,#1
+	adcs	x9,x9,x9
+	adcs	x10,x10,x10
+	adcs	x11,x11,x11
+	adc	x3,xzr,xzr
+
+	subs	x12,x8,x4
+	sbcs	x13,x9,x5
+	sbcs	x14,x10,x6
+	sbcs	x15,x11,x7
+	sbcs	xzr,x3,xzr
+
+	csel	x8,x8,x12,lo
+	csel	x9,x9,x13,lo
+	csel	x10,x10,x14,lo
+	csel	x11,x11,x15,lo
+
+	cbnz	x2,.Loop_lshift_mod_256
+
+	stp	x8,x9,[x0]
+	stp	x10,x11,[x0,#16]
+
+	ret
+.size	lshift_mod_256,.-lshift_mod_256
+
+.globl	rshift_mod_256
+.hidden	rshift_mod_256
+.type	rshift_mod_256,%function
+.align	5
+rshift_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x10,x11,[x1,#16]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+
+.Loop_rshift:
+	adds	x12,x8,x4
+	sub	x2,x2,#1
+	adcs	x13,x9,x5
+	adcs	x14,x10,x6
+	adcs	x15,x11,x7
+	adc	x3,xzr,xzr
+	tst	x8,#1
+
+	csel	x12,x12,x8,ne
+	csel	x13,x13,x9,ne
+	csel	x14,x14,x10,ne
+	csel	x15,x15,x11,ne
+	csel	x3,x3,xzr,ne
+
+	extr	x8,x13,x12,#1
+	extr	x9,x14,x13,#1
+	extr	x10,x15,x14,#1
+	extr	x11,x3,x15,#1
+
+	cbnz	x2,.Loop_rshift
+
+	stp	x8,x9,[x0]
+	stp	x10,x11,[x0,#16]
+
+	ret
+.size	rshift_mod_256,.-rshift_mod_256
+
+.globl	cneg_mod_256
+.hidden	cneg_mod_256
+.type	cneg_mod_256,%function
+.align	5
+cneg_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x4,x5,[x3]
+
+	ldp	x10,x11,[x1,#16]
+	subs	x12,x4,x8
+	ldp	x6,x7,[x3,#16]
+	orr	x4,x8,x9
+	sbcs	x13,x5,x9
+	orr	x5,x10,x11
+	sbcs	x14,x6,x10
+	orr	x3,x4,x5
+	sbc	x15,x7,x11
+
+	cmp	x3,#0
+	csetm	x3,ne
+	ands	x2,x2,x3
+
+	csel	x8,x8,x12,eq
+	csel	x9,x9,x13,eq
+	csel	x10,x10,x14,eq
+	stp	x8,x9,[x0]
+	csel	x11,x11,x15,eq
+	stp	x10,x11,[x0,#16]
+
+	ret
+.size	cneg_mod_256,.-cneg_mod_256
+
+.globl	sub_mod_256
+.hidden	sub_mod_256
+.type	sub_mod_256,%function
+.align	5
+sub_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x12,x13,[x2]
+
+	ldp	x10,x11,[x1,#16]
+	subs	x8,x8,x12
+	ldp	x14,x15,[x2,#16]
+	sbcs	x9,x9,x13
+	ldp	x4,x5,[x3]
+	sbcs	x10,x10,x14
+	ldp	x6,x7,[x3,#16]
+	sbcs	x11,x11,x15
+	sbc	x3,xzr,xzr
+
+	and	x4,x4,x3
+	and	x5,x5,x3
+	adds	x8,x8,x4
+	and	x6,x6,x3
+	adcs	x9,x9,x5
+	and	x7,x7,x3
+	adcs	x10,x10,x6
+	stp	x8,x9,[x0]
+	adc	x11,x11,x7
+	stp	x10,x11,[x0,#16]
+
+	ret
+.size	sub_mod_256,.-sub_mod_256
+
+.globl	check_mod_256
+.hidden	check_mod_256
+.type	check_mod_256,%function
+.align	5
+check_mod_256:
+	ldp	x8,x9,[x0]
+	ldp	x10,x11,[x0,#16]
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x9,x9
+	rev	x10,x10
+	rev	x11,x11
+#endif
+
+	subs	xzr,x8,x4
+	sbcs	xzr,x9,x5
+	orr	x8,x8,x9
+	sbcs	xzr,x10,x6
+	orr	x8,x8,x10
+	sbcs	xzr,x11,x7
+	orr	x8,x8,x11
+	sbc	x1,xzr,xzr
+
+	cmp	x8,#0
+	mov	x0,#1
+	csel	x0,x0,xzr,ne
+	and	x0,x0,x1
+
+	ret
+.size	check_mod_256,.-check_mod_256
+
+.globl	add_n_check_mod_256
+.hidden	add_n_check_mod_256
+.type	add_n_check_mod_256,%function
+.align	5
+add_n_check_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x12,x13,[x2]
+	ldp	x10,x11,[x1,#16]
+	ldp	x14,x15,[x2,#16]
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+	rev	x10,x10
+	rev	x14,x14
+	rev	x11,x11
+	rev	x15,x15
+#endif
+
+	adds	x8,x8,x12
+	ldp	x4,x5,[x3]
+	adcs	x9,x9,x13
+	ldp	x6,x7,[x3,#16]
+	adcs	x10,x10,x14
+	adcs	x11,x11,x15
+	adc	x3,xzr,xzr
+
+	subs	x16,x8,x4
+	sbcs	x17,x9,x5
+	sbcs	x1,x10,x6
+	sbcs	x2,x11,x7
+	sbcs	xzr,x3,xzr
+
+	csel	x8,x8,x16,lo
+	csel	x9,x9,x17,lo
+	csel	x10,x10,x1,lo
+	csel	x11,x11,x2,lo
+
+	orr	x16, x8, x9
+	orr	x17, x10, x11
+	orr	x16, x16, x17
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x9,x9
+	rev	x10,x10
+	rev	x11,x11
+#endif
+
+	stp	x8,x9,[x0]
+	stp	x10,x11,[x0,#16]
+
+	mov	x17, #1
+	cmp	x16, #0
+	csel	x0, x17, xzr, ne
+
+	ret
+.size	add_n_check_mod_256,.-add_n_check_mod_256
+
+.globl	sub_n_check_mod_256
+.hidden	sub_n_check_mod_256
+.type	sub_n_check_mod_256,%function
+.align	5
+sub_n_check_mod_256:
+	ldp	x8,x9,[x1]
+	ldp	x12,x13,[x2]
+	ldp	x10,x11,[x1,#16]
+	ldp	x14,x15,[x2,#16]
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+	rev	x10,x10
+	rev	x14,x14
+	rev	x11,x11
+	rev	x15,x15
+#endif
+
+	subs	x8,x8,x12
+	sbcs	x9,x9,x13
+	ldp	x4,x5,[x3]
+	sbcs	x10,x10,x14
+	ldp	x6,x7,[x3,#16]
+	sbcs	x11,x11,x15
+	sbc	x3,xzr,xzr
+
+	and	x4,x4,x3
+	and	x5,x5,x3
+	adds	x8,x8,x4
+	and	x6,x6,x3
+	adcs	x9,x9,x5
+	and	x7,x7,x3
+	adcs	x10,x10,x6
+	adc	x11,x11,x7
+
+	orr	x16, x8, x9
+	orr	x17, x10, x11
+	orr	x16, x16, x17
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x9,x9
+	rev	x10,x10
+	rev	x11,x11
+#endif
+
+	stp	x8,x9,[x0]
+	stp	x10,x11,[x0,#16]
+
+	mov	x17, #1
+	cmp	x16, #0
+	csel	x0, x17, xzr, ne
+
+	ret
+.size	sub_n_check_mod_256,.-sub_n_check_mod_256
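
The 256-bit routines above reduce with one branch-free pattern: form the
full-width result, unconditionally subtract the modulus, and pick between
the two candidates with conditional selects driven by the carry/borrow
bits. A minimal portable C sketch of that pattern for the addition case
(type and function names are illustrative, it assumes a compiler with
unsigned __int128 and inputs already reduced below p, and a C compiler is
not obliged to keep it constant-time, which is why the library does this
in assembly):

#include <stdint.h>

typedef uint64_t limb_t;

/* ret = (a + b) mod p, 256 bits as four 64-bit limbs, least significant
 * first.  The subtraction of p is always performed and the final copy is
 * a mask-based select, mirroring add_mod_256 above. */
static void add_mod_256_sketch(limb_t ret[4], const limb_t a[4],
                               const limb_t b[4], const limb_t p[4])
{
    limb_t sum[4], red[4], carry = 0, borrow = 0;

    for (int i = 0; i < 4; i++) {              /* sum = a + b            */
        unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
        sum[i] = (limb_t)t;
        carry  = (limb_t)(t >> 64);
    }
    for (int i = 0; i < 4; i++) {              /* red = sum - p          */
        unsigned __int128 t = (unsigned __int128)sum[i] - p[i] - borrow;
        red[i] = (limb_t)t;
        borrow = (limb_t)(t >> 64) & 1;
    }
    /* keep the unreduced sum only if a + b was still below p, i.e. the
     * subtraction borrowed beyond the carry out of the addition */
    limb_t keep_sum = (limb_t)0 - (limb_t)(borrow > carry);
    for (int i = 0; i < 4; i++)
        ret[i] = (sum[i] & keep_sum) | (red[i] & ~keep_sum);
}
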
diff --git a/blst/elf/add_mod_256-x86_64.s b/blst/elf/add_mod_256-x86_64.s
new file mode 100644
index 0000000..2f41781
--- /dev/null
+++ b/blst/elf/add_mod_256-x86_64.s
@@ -0,0 +1,572 @@
+.text	
+
+.globl	add_mod_256
+.hidden	add_mod_256
+.type	add_mod_256,@function
+.align	32
+add_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+.Loaded_a_add_mod_256:
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	movq	%r8,%rax
+	adcq	16(%rdx),%r10
+	movq	%r9,%rsi
+	adcq	24(%rdx),%r11
+	sbbq	%rdx,%rdx
+
+	movq	%r10,%rbx
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	sbbq	16(%rcx),%r10
+	movq	%r11,%rbp
+	sbbq	24(%rcx),%r11
+	sbbq	$0,%rdx
+
+	cmovcq	%rax,%r8
+	cmovcq	%rsi,%r9
+	movq	%r8,0(%rdi)
+	cmovcq	%rbx,%r10
+	movq	%r9,8(%rdi)
+	cmovcq	%rbp,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	add_mod_256,.-add_mod_256
+
+
+.globl	mul_by_3_mod_256
+.hidden	mul_by_3_mod_256
+.type	mul_by_3_mod_256,@function
+.align	32
+mul_by_3_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+
+
+	movq	%rdx,%rcx
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	%rsi,%rdx
+	movq	24(%rsi),%r11
+
+	call	__lshift_mod_256
+	movq	0(%rsp),%r12
+.cfi_restore	%r12
+	jmp	.Loaded_a_add_mod_256
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_by_3_mod_256,.-mul_by_3_mod_256
+
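+# __lshift_mod_256: double the 256-bit value in %r8..%r11 modulo the
+# modulus pointed to by %rcx, leaving the result in %r8..%r11; shared by
+# lshift_mod_256 and mul_by_3_mod_256.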
+.type	__lshift_mod_256,@function
+.align	32
+__lshift_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	addq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r8,%rax
+	adcq	%r10,%r10
+	movq	%r9,%rsi
+	adcq	%r11,%r11
+	sbbq	%r12,%r12
+
+	movq	%r10,%rbx
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	sbbq	16(%rcx),%r10
+	movq	%r11,%rbp
+	sbbq	24(%rcx),%r11
+	sbbq	$0,%r12
+
+	cmovcq	%rax,%r8
+	cmovcq	%rsi,%r9
+	cmovcq	%rbx,%r10
+	cmovcq	%rbp,%r11
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__lshift_mod_256,.-__lshift_mod_256
+
+
+.globl	lshift_mod_256
+.hidden	lshift_mod_256
+.type	lshift_mod_256,@function
+.align	32
+lshift_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+.Loop_lshift_mod_256:
+	call	__lshift_mod_256
+	decl	%edx
+	jnz	.Loop_lshift_mod_256
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	0(%rsp),%r12
+.cfi_restore	%r12
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	lshift_mod_256,.-lshift_mod_256
+
+
+.globl	rshift_mod_256
+.hidden	rshift_mod_256
+.type	rshift_mod_256,@function
+.align	32
+rshift_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%rbp
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+.Loop_rshift_mod_256:
+	movq	%rbp,%r8
+	andq	$1,%rbp
+	movq	0(%rcx),%rax
+	negq	%rbp
+	movq	8(%rcx),%rsi
+	movq	16(%rcx),%rbx
+
+	andq	%rbp,%rax
+	andq	%rbp,%rsi
+	andq	%rbp,%rbx
+	andq	24(%rcx),%rbp
+
+	addq	%rax,%r8
+	adcq	%rsi,%r9
+	adcq	%rbx,%r10
+	adcq	%rbp,%r11
+	sbbq	%rax,%rax
+
+	shrq	$1,%r8
+	movq	%r9,%rbp
+	shrq	$1,%r9
+	movq	%r10,%rbx
+	shrq	$1,%r10
+	movq	%r11,%rsi
+	shrq	$1,%r11
+
+	shlq	$63,%rbp
+	shlq	$63,%rbx
+	orq	%r8,%rbp
+	shlq	$63,%rsi
+	orq	%rbx,%r9
+	shlq	$63,%rax
+	orq	%rsi,%r10
+	orq	%rax,%r11
+
+	decl	%edx
+	jnz	.Loop_rshift_mod_256
+
+	movq	%rbp,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rshift_mod_256,.-rshift_mod_256
+
+
+.globl	cneg_mod_256
+.hidden	cneg_mod_256
+.type	cneg_mod_256,@function
+.align	32
+cneg_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+
+
+	movq	0(%rsi),%r12
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	%r12,%r8
+	movq	24(%rsi),%r11
+	orq	%r9,%r12
+	orq	%r10,%r12
+	orq	%r11,%r12
+	movq	$-1,%rbp
+
+	movq	0(%rcx),%rax
+	cmovnzq	%rbp,%r12
+	movq	8(%rcx),%rsi
+	movq	16(%rcx),%rbx
+	andq	%r12,%rax
+	movq	24(%rcx),%rbp
+	andq	%r12,%rsi
+	andq	%r12,%rbx
+	andq	%r12,%rbp
+
+	subq	%r8,%rax
+	sbbq	%r9,%rsi
+	sbbq	%r10,%rbx
+	sbbq	%r11,%rbp
+
+	orq	%rdx,%rdx
+
+	cmovzq	%r8,%rax
+	cmovzq	%r9,%rsi
+	movq	%rax,0(%rdi)
+	cmovzq	%r10,%rbx
+	movq	%rsi,8(%rdi)
+	cmovzq	%r11,%rbp
+	movq	%rbx,16(%rdi)
+	movq	%rbp,24(%rdi)
+
+	movq	0(%rsp),%r12
+.cfi_restore	%r12
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	cneg_mod_256,.-cneg_mod_256
+
+
+.globl	sub_mod_256
+.hidden	sub_mod_256
+.type	sub_mod_256,@function
+.align	32
+sub_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+	subq	0(%rdx),%r8
+	movq	0(%rcx),%rax
+	sbbq	8(%rdx),%r9
+	movq	8(%rcx),%rsi
+	sbbq	16(%rdx),%r10
+	movq	16(%rcx),%rbx
+	sbbq	24(%rdx),%r11
+	movq	24(%rcx),%rbp
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%rax
+	andq	%rdx,%rsi
+	andq	%rdx,%rbx
+	andq	%rdx,%rbp
+
+	addq	%rax,%r8
+	adcq	%rsi,%r9
+	movq	%r8,0(%rdi)
+	adcq	%rbx,%r10
+	movq	%r9,8(%rdi)
+	adcq	%rbp,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sub_mod_256,.-sub_mod_256
+
+
+.globl	check_mod_256
+.hidden	check_mod_256
+.type	check_mod_256,@function
+.align	32
+check_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+
+	movq	%rax,%r8
+	orq	%r9,%rax
+	orq	%r10,%rax
+	orq	%r11,%rax
+
+	subq	0(%rsi),%r8
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	sbbq	24(%rsi),%r11
+	sbbq	%rsi,%rsi
+
+	movq	$1,%rdx
+	cmpq	$0,%rax
+	cmovneq	%rdx,%rax
+	andq	%rsi,%rax
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	check_mod_256,.-check_mod_256
+
+
+.globl	add_n_check_mod_256
+.hidden	add_n_check_mod_256
+.type	add_n_check_mod_256,@function
+.align	32
+add_n_check_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	movq	%r8,%rax
+	adcq	16(%rdx),%r10
+	movq	%r9,%rsi
+	adcq	24(%rdx),%r11
+	sbbq	%rdx,%rdx
+
+	movq	%r10,%rbx
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	sbbq	16(%rcx),%r10
+	movq	%r11,%rbp
+	sbbq	24(%rcx),%r11
+	sbbq	$0,%rdx
+
+	cmovcq	%rax,%r8
+	cmovcq	%rsi,%r9
+	movq	%r8,0(%rdi)
+	cmovcq	%rbx,%r10
+	movq	%r9,8(%rdi)
+	cmovcq	%rbp,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	orq	%r9,%r8
+	orq	%r11,%r10
+	orq	%r10,%r8
+	movq	$1,%rax
+	cmovzq	%r8,%rax
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	add_n_check_mod_256,.-add_n_check_mod_256
+
+
+.globl	sub_n_check_mod_256
+.hidden	sub_n_check_mod_256
+.type	sub_n_check_mod_256,@function
+.align	32
+sub_n_check_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+	subq	0(%rdx),%r8
+	movq	0(%rcx),%rax
+	sbbq	8(%rdx),%r9
+	movq	8(%rcx),%rsi
+	sbbq	16(%rdx),%r10
+	movq	16(%rcx),%rbx
+	sbbq	24(%rdx),%r11
+	movq	24(%rcx),%rbp
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%rax
+	andq	%rdx,%rsi
+	andq	%rdx,%rbx
+	andq	%rdx,%rbp
+
+	addq	%rax,%r8
+	adcq	%rsi,%r9
+	movq	%r8,0(%rdi)
+	adcq	%rbx,%r10
+	movq	%r9,8(%rdi)
+	adcq	%rbp,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	orq	%r9,%r8
+	orq	%r11,%r10
+	orq	%r10,%r8
+	movq	$1,%rax
+	cmovzq	%r8,%rax
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sub_n_check_mod_256,.-sub_n_check_mod_256
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
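
rshift_mod_256 above (and the 384-bit rshift/div_by_2 routines in the
following files) halve a residue by exploiting that an odd value can be
made even by adding the modulus first; the at-most-257-bit sum is then
shifted right by one bit. A C sketch of a single such step (names are
illustrative, it assumes unsigned __int128 and an odd modulus, and the C
compiler gives no constant-time guarantee):

#include <stdint.h>

typedef uint64_t limb_t;

/* ret = a/2 mod p for a 256-bit residue a < p, p odd, limbs least
 * significant first. */
static void div_by_2_mod_256_sketch(limb_t ret[4], const limb_t a[4],
                                    const limb_t p[4])
{
    limb_t mask = (limb_t)0 - (a[0] & 1);      /* all-ones iff a is odd   */
    limb_t t[4], carry = 0;

    for (int i = 0; i < 4; i++) {              /* t = a + (a odd ? p : 0) */
        unsigned __int128 s = (unsigned __int128)a[i] + (p[i] & mask) + carry;
        t[i]  = (limb_t)s;
        carry = (limb_t)(s >> 64);
    }
    for (int i = 0; i < 3; i++)                /* shift right by one bit  */
        ret[i] = (t[i] >> 1) | (t[i + 1] << 63);
    ret[3] = (t[3] >> 1) | (carry << 63);      /* carry is the 257th bit  */
}
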
diff --git a/blst/elf/add_mod_384-armv8.S b/blst/elf/add_mod_384-armv8.S
new file mode 100644
index 0000000..55e0888
--- /dev/null
+++ b/blst/elf/add_mod_384-armv8.S
@@ -0,0 +1,931 @@
+.text
+
+.globl	add_mod_384
+.hidden	add_mod_384
+.type	add_mod_384,%function
+.align	5
+add_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+	ldp	x8,x9,[x3,#32]
+
+	bl	__add_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	add_mod_384,.-add_mod_384
+
+.type	__add_mod_384,%function
+.align	5
+__add_mod_384:
+	ldp	x10,x11,[x1]
+	ldp	x16,x17,[x2]
+	ldp	x12,x13,[x1,#16]
+	ldp	x19,x20,[x2,#16]
+	ldp	x14,x15,[x1,#32]
+	ldp	x21,x22,[x2,#32]
+
+__add_mod_384_ab_are_loaded:
+	adds	x10,x10,x16
+	adcs	x11,x11,x17
+	adcs	x12,x12,x19
+	adcs	x13,x13,x20
+	adcs	x14,x14,x21
+	adcs	x15,x15,x22
+	adc	x3,xzr,xzr
+
+	subs	x16,x10,x4
+	sbcs	x17,x11,x5
+	sbcs	x19,x12,x6
+	sbcs	x20,x13,x7
+	sbcs	x21,x14,x8
+	sbcs	x22,x15,x9
+	sbcs	xzr,x3,xzr
+
+	csel	x10,x10,x16,lo
+	csel	x11,x11,x17,lo
+	csel	x12,x12,x19,lo
+	csel	x13,x13,x20,lo
+	csel	x14,x14,x21,lo
+	csel	x15,x15,x22,lo
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.globl	add_mod_384x
+.hidden	add_mod_384x
+.type	add_mod_384x,%function
+.align	5
+add_mod_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+	ldp	x8,x9,[x3,#32]
+
+	bl	__add_mod_384
+
+	stp	x10,x11,[x0]
+	add	x1,x1,#48
+	stp	x12,x13,[x0,#16]
+	add	x2,x2,#48
+	stp	x14,x15,[x0,#32]
+
+	bl	__add_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0,#48]
+	stp	x12,x13,[x0,#64]
+	stp	x14,x15,[x0,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	add_mod_384x,.-add_mod_384x
+
+.globl	rshift_mod_384
+.hidden	rshift_mod_384
+.type	rshift_mod_384,%function
+.align	5
+rshift_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+	ldp	x8,x9,[x3,#32]
+
+.Loop_rshift_mod_384:
+	sub	x2,x2,#1
+	bl	__rshift_mod_384
+	cbnz	x2,.Loop_rshift_mod_384
+
+	ldr	x30,[sp,#8]
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	rshift_mod_384,.-rshift_mod_384
+
+.type	__rshift_mod_384,%function
+.align	5
+__rshift_mod_384:
+	sbfx	x22,x10,#0,#1
+	and	x16,x22,x4
+	and	x17,x22,x5
+	adds	x10,x10,x16
+	and	x19,x22,x6
+	adcs	x11,x11,x17
+	and	x20,x22,x7
+	adcs	x12,x12,x19
+	and	x21,x22,x8
+	adcs	x13,x13,x20
+	and	x22,x22,x9
+	adcs	x14,x14,x21
+	extr	x10,x11,x10,#1	// a[0:5] >>= 1
+	adcs	x15,x15,x22
+	extr	x11,x12,x11,#1
+	adc	x22,xzr,xzr
+	extr	x12,x13,x12,#1
+	extr	x13,x14,x13,#1
+	extr	x14,x15,x14,#1
+	extr	x15,x22,x15,#1
+	ret
+.size	__rshift_mod_384,.-__rshift_mod_384
+
+.globl	div_by_2_mod_384
+.hidden	div_by_2_mod_384
+.type	div_by_2_mod_384,%function
+.align	5
+div_by_2_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x2]
+	ldp	x6,x7,[x2,#16]
+	ldp	x8,x9,[x2,#32]
+
+	bl	__rshift_mod_384
+
+	ldr	x30,[sp,#8]
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	div_by_2_mod_384,.-div_by_2_mod_384
+
+.globl	lshift_mod_384
+.hidden	lshift_mod_384
+.type	lshift_mod_384,%function
+.align	5
+lshift_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+	ldp	x8,x9,[x3,#32]
+
+.Loop_lshift_mod_384:
+	sub	x2,x2,#1
+	bl	__lshift_mod_384
+	cbnz	x2,.Loop_lshift_mod_384
+
+	ldr	x30,[sp,#8]
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	lshift_mod_384,.-lshift_mod_384
+
+.type	__lshift_mod_384,%function
+.align	5
+__lshift_mod_384:
+	adds	x10,x10,x10
+	adcs	x11,x11,x11
+	adcs	x12,x12,x12
+	adcs	x13,x13,x13
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adc	x3,xzr,xzr
+
+	subs	x16,x10,x4
+	sbcs	x17,x11,x5
+	sbcs	x19,x12,x6
+	sbcs	x20,x13,x7
+	sbcs	x21,x14,x8
+	sbcs	x22,x15,x9
+	sbcs	xzr,x3,xzr
+
+	csel	x10,x10,x16,lo
+	csel	x11,x11,x17,lo
+	csel	x12,x12,x19,lo
+	csel	x13,x13,x20,lo
+	csel	x14,x14,x21,lo
+	csel	x15,x15,x22,lo
+
+	ret
+.size	__lshift_mod_384,.-__lshift_mod_384
+
+.globl	mul_by_3_mod_384
+.hidden	mul_by_3_mod_384
+.type	mul_by_3_mod_384,%function
+.align	5
+mul_by_3_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x2]
+	ldp	x6,x7,[x2,#16]
+	ldp	x8,x9,[x2,#32]
+
+	bl	__lshift_mod_384
+
+	ldp	x16,x17,[x1]
+	ldp	x19,x20,[x1,#16]
+	ldp	x21,x22,[x1,#32]
+
+	bl	__add_mod_384_ab_are_loaded
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	mul_by_3_mod_384,.-mul_by_3_mod_384
+
+.globl	mul_by_8_mod_384
+.hidden	mul_by_8_mod_384
+.type	mul_by_8_mod_384,%function
+.align	5
+mul_by_8_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x2]
+	ldp	x6,x7,[x2,#16]
+	ldp	x8,x9,[x2,#32]
+
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	mul_by_8_mod_384,.-mul_by_8_mod_384
+
+.globl	mul_by_3_mod_384x
+.hidden	mul_by_3_mod_384x
+.type	mul_by_3_mod_384x,%function
+.align	5
+mul_by_3_mod_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x2]
+	ldp	x6,x7,[x2,#16]
+	ldp	x8,x9,[x2,#32]
+
+	bl	__lshift_mod_384
+
+	ldp	x16,x17,[x1]
+	ldp	x19,x20,[x1,#16]
+	ldp	x21,x22,[x1,#32]
+
+	bl	__add_mod_384_ab_are_loaded
+
+	stp	x10,x11,[x0]
+	ldp	x10,x11,[x1,#48]
+	stp	x12,x13,[x0,#16]
+	ldp	x12,x13,[x1,#64]
+	stp	x14,x15,[x0,#32]
+	ldp	x14,x15,[x1,#80]
+
+	bl	__lshift_mod_384
+
+	ldp	x16,x17,[x1,#48]
+	ldp	x19,x20,[x1,#64]
+	ldp	x21,x22,[x1,#80]
+
+	bl	__add_mod_384_ab_are_loaded
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0,#48]
+	stp	x12,x13,[x0,#64]
+	stp	x14,x15,[x0,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	mul_by_3_mod_384x,.-mul_by_3_mod_384x
+
+.globl	mul_by_8_mod_384x
+.hidden	mul_by_8_mod_384x
+.type	mul_by_8_mod_384x,%function
+.align	5
+mul_by_8_mod_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+	ldp	x14,x15,[x1,#32]
+
+	ldp	x4,x5,[x2]
+	ldp	x6,x7,[x2,#16]
+	ldp	x8,x9,[x2,#32]
+
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+
+	stp	x10,x11,[x0]
+	ldp	x10,x11,[x1,#48]
+	stp	x12,x13,[x0,#16]
+	ldp	x12,x13,[x1,#64]
+	stp	x14,x15,[x0,#32]
+	ldp	x14,x15,[x1,#80]
+
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	bl	__lshift_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0,#48]
+	stp	x12,x13,[x0,#64]
+	stp	x14,x15,[x0,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	mul_by_8_mod_384x,.-mul_by_8_mod_384x
+
+.globl	cneg_mod_384
+.hidden	cneg_mod_384
+.type	cneg_mod_384,%function
+.align	5
+cneg_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x10,x11,[x1]
+	ldp	x4,x5,[x3]
+	ldp	x12,x13,[x1,#16]
+	ldp	x6,x7,[x3,#16]
+
+	subs	x16,x4,x10
+	ldp	x14,x15,[x1,#32]
+	ldp	x8,x9,[x3,#32]
+	orr	x3,x10,x11
+	sbcs	x17,x5,x11
+	orr	x3,x3,x12
+	sbcs	x19,x6,x12
+	orr	x3,x3,x13
+	sbcs	x20,x7,x13
+	orr	x3,x3,x14
+	sbcs	x21,x8,x14
+	orr	x3,x3,x15
+	sbc	x22,x9,x15
+
+	cmp	x3,#0
+	csetm	x3,ne
+	ands	x2,x2,x3
+
+	csel	x10,x10,x16,eq
+	csel	x11,x11,x17,eq
+	csel	x12,x12,x19,eq
+	csel	x13,x13,x20,eq
+	stp	x10,x11,[x0]
+	csel	x14,x14,x21,eq
+	stp	x12,x13,[x0,#16]
+	csel	x15,x15,x22,eq
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	cneg_mod_384,.-cneg_mod_384
+
+.globl	sub_mod_384
+.hidden	sub_mod_384
+.type	sub_mod_384,%function
+.align	5
+sub_mod_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+	ldp	x8,x9,[x3,#32]
+
+	bl	__sub_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+	stp	x14,x15,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	sub_mod_384,.-sub_mod_384
+
+.type	__sub_mod_384,%function
+.align	5
+__sub_mod_384:
+	ldp	x10,x11,[x1]
+	ldp	x16,x17,[x2]
+	ldp	x12,x13,[x1,#16]
+	ldp	x19,x20,[x2,#16]
+	ldp	x14,x15,[x1,#32]
+	ldp	x21,x22,[x2,#32]
+
+	subs	x10,x10,x16
+	sbcs	x11,x11,x17
+	sbcs	x12,x12,x19
+	sbcs	x13,x13,x20
+	sbcs	x14,x14,x21
+	sbcs	x15,x15,x22
+	sbc	x3,xzr,xzr
+
+	and	x16,x4,x3
+	and	x17,x5,x3
+	adds	x10,x10,x16
+	and	x19,x6,x3
+	adcs	x11,x11,x17
+	and	x20,x7,x3
+	adcs	x12,x12,x19
+	and	x21,x8,x3
+	adcs	x13,x13,x20
+	and	x22,x9,x3
+	adcs	x14,x14,x21
+	adc	x15,x15,x22
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+
+.globl	sub_mod_384x
+.hidden	sub_mod_384x
+.type	sub_mod_384x,%function
+.align	5
+sub_mod_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x4,x5,[x3]
+	ldp	x6,x7,[x3,#16]
+	ldp	x8,x9,[x3,#32]
+
+	bl	__sub_mod_384
+
+	stp	x10,x11,[x0]
+	add	x1,x1,#48
+	stp	x12,x13,[x0,#16]
+	add	x2,x2,#48
+	stp	x14,x15,[x0,#32]
+
+	bl	__sub_mod_384
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0,#48]
+	stp	x12,x13,[x0,#64]
+	stp	x14,x15,[x0,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	sub_mod_384x,.-sub_mod_384x
+
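+// mul_by_1_plus_i_mod_384x: multiply a complex (re, im) pair by (1 + i);
+// the new real part is re - im and the new imaginary part is re + im,
+// matching the inline comments below.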
+.globl	mul_by_1_plus_i_mod_384x
+.hidden	mul_by_1_plus_i_mod_384x
+.type	mul_by_1_plus_i_mod_384x,%function
+.align	5
+mul_by_1_plus_i_mod_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x4,x5,[x2]
+	ldp	x6,x7,[x2,#16]
+	ldp	x8,x9,[x2,#32]
+	add	x2,x1,#48
+
+	bl	__sub_mod_384			// a->re - a->im
+
+	ldp	x16,x17,[x1]
+	ldp	x19,x20,[x1,#16]
+	ldp	x21,x22,[x1,#32]
+	stp	x10,x11,[x0]
+	ldp	x10,x11,[x1,#48]
+	stp	x12,x13,[x0,#16]
+	ldp	x12,x13,[x1,#64]
+	stp	x14,x15,[x0,#32]
+	ldp	x14,x15,[x1,#80]
+
+	bl	__add_mod_384_ab_are_loaded	// a->re + a->im
+	ldr	x30,[sp,#8]
+
+	stp	x10,x11,[x0,#48]
+	stp	x12,x13,[x0,#64]
+	stp	x14,x15,[x0,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
+
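+// sgn0_pty_mod_384: pack parity and "sign" of the value at x0 relative to
+// the modulus at x1: bit 0 of the result is value & 1, bit 1 is set when
+// 2*value >= modulus.  sgn0_pty_mod_384x below does the same for a
+// (re, im) pair, choosing which half supplies each bit as the inline
+// comments note.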
+.globl	sgn0_pty_mod_384
+.hidden	sgn0_pty_mod_384
+.type	sgn0_pty_mod_384,%function
+.align	5
+sgn0_pty_mod_384:
+	ldp	x10,x11,[x0]
+	ldp	x12,x13,[x0,#16]
+	ldp	x14,x15,[x0,#32]
+
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	ldp	x8,x9,[x1,#32]
+
+	and	x0,x10,#1
+	adds	x10,x10,x10
+	adcs	x11,x11,x11
+	adcs	x12,x12,x12
+	adcs	x13,x13,x13
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adc	x3,xzr,xzr
+
+	subs	x10,x10,x4
+	sbcs	x11,x11,x5
+	sbcs	x12,x12,x6
+	sbcs	x13,x13,x7
+	sbcs	x14,x14,x8
+	sbcs	x15,x15,x9
+	sbc	x3,x3,xzr
+
+	mvn	x3,x3
+	and	x3,x3,#2
+	orr	x0,x0,x3
+
+	ret
+.size	sgn0_pty_mod_384,.-sgn0_pty_mod_384
+
+.globl	sgn0_pty_mod_384x
+.hidden	sgn0_pty_mod_384x
+.type	sgn0_pty_mod_384x,%function
+.align	5
+sgn0_pty_mod_384x:
+	ldp	x10,x11,[x0]
+	ldp	x12,x13,[x0,#16]
+	ldp	x14,x15,[x0,#32]
+
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	ldp	x8,x9,[x1,#32]
+
+	and	x2,x10,#1
+	orr	x3,x10,x11
+	adds	x10,x10,x10
+	orr	x3,x3,x12
+	adcs	x11,x11,x11
+	orr	x3,x3,x13
+	adcs	x12,x12,x12
+	orr	x3,x3,x14
+	adcs	x13,x13,x13
+	orr	x3,x3,x15
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adc	x16,xzr,xzr
+
+	subs	x10,x10,x4
+	sbcs	x11,x11,x5
+	sbcs	x12,x12,x6
+	sbcs	x13,x13,x7
+	sbcs	x14,x14,x8
+	sbcs	x15,x15,x9
+	sbc	x16,x16,xzr
+
+	ldp	x10,x11,[x0,#48]
+	ldp	x12,x13,[x0,#64]
+	ldp	x14,x15,[x0,#80]
+
+	mvn	x16,x16
+	and	x16,x16,#2
+	orr	x2,x2,x16
+
+	and	x0,x10,#1
+	orr	x1,x10,x11
+	adds	x10,x10,x10
+	orr	x1,x1,x12
+	adcs	x11,x11,x11
+	orr	x1,x1,x13
+	adcs	x12,x12,x12
+	orr	x1,x1,x14
+	adcs	x13,x13,x13
+	orr	x1,x1,x15
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adc	x16,xzr,xzr
+
+	subs	x10,x10,x4
+	sbcs	x11,x11,x5
+	sbcs	x12,x12,x6
+	sbcs	x13,x13,x7
+	sbcs	x14,x14,x8
+	sbcs	x15,x15,x9
+	sbc	x16,x16,xzr
+
+	mvn	x16,x16
+	and	x16,x16,#2
+	orr	x0,x0,x16
+
+	cmp	x3,#0
+	csel	x3,x0,x2,eq	// a->re==0? prty(a->im) : prty(a->re)
+
+	cmp	x1,#0
+	csel	x1,x0,x2,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	x3,x3,#1
+	and	x1,x1,#2
+	orr	x0,x1,x3	// pack sign and parity
+
+	ret
+.size	sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
+.globl	vec_select_48
+.hidden	vec_select_48
+.type	vec_select_48,%function
+.align	5
+vec_select_48:
+	dup	v6.2d, x3
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	cmeq	v6.2d, v6.2d, #0
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v0.16b, v3.16b, v6.16b
+	bit	v1.16b, v4.16b, v6.16b
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0]
+	ret
+.size	vec_select_48,.-vec_select_48
+.globl	vec_select_96
+.hidden	vec_select_96
+.type	vec_select_96,%function
+.align	5
+vec_select_96:
+	dup	v6.2d, x3
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	cmeq	v6.2d, v6.2d, #0
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	bit	v17.16b, v20.16b, v6.16b
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0]
+	ret
+.size	vec_select_96,.-vec_select_96
+.globl	vec_select_192
+.hidden	vec_select_192
+.type	vec_select_192,%function
+.align	5
+vec_select_192:
+	dup	v6.2d, x3
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	cmeq	v6.2d, v6.2d, #0
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	bit	v17.16b, v20.16b, v6.16b
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	bit	v17.16b, v20.16b, v6.16b
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0]
+	ret
+.size	vec_select_192,.-vec_select_192
+.globl	vec_select_144
+.hidden	vec_select_144
+.type	vec_select_144,%function
+.align	5
+vec_select_144:
+	dup	v6.2d, x3
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	cmeq	v6.2d, v6.2d, #0
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	bit	v17.16b, v20.16b, v6.16b
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0],#48
+	bit	v0.16b, v3.16b, v6.16b
+	bit	v1.16b, v4.16b, v6.16b
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0]
+	ret
+.size	vec_select_144,.-vec_select_144
+.globl	vec_select_288
+.hidden	vec_select_288
+.type	vec_select_288,%function
+.align	5
+vec_select_288:
+	dup	v6.2d, x3
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	cmeq	v6.2d, v6.2d, #0
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	bit	v17.16b, v20.16b, v6.16b
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	ld1	{v0.2d, v1.2d, v2.2d}, [x1],#48
+	bit	v17.16b, v20.16b, v6.16b
+	ld1	{v3.2d, v4.2d, v5.2d}, [x2],#48
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0],#48
+	bit	v0.16b, v3.16b, v6.16b
+	ld1	{v16.2d, v17.2d, v18.2d}, [x1],#48
+	bit	v1.16b, v4.16b, v6.16b
+	ld1	{v19.2d, v20.2d, v21.2d}, [x2],#48
+	bit	v2.16b, v5.16b, v6.16b
+	st1	{v0.2d, v1.2d, v2.2d}, [x0],#48
+	bit	v16.16b, v19.16b, v6.16b
+	bit	v17.16b, v20.16b, v6.16b
+	bit	v18.16b, v21.16b, v6.16b
+	st1	{v16.2d, v17.2d, v18.2d}, [x0]
+	ret
+.size	vec_select_288,.-vec_select_288
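+// vec_prefetch: issue up to seven PLDL1KEEP prefetches covering the x1
+// bytes starting at x0, stepping one 64-byte line at a time and clamping
+// at the last byte, with no branches.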
+.globl	vec_prefetch
+.hidden	vec_prefetch
+.type	vec_prefetch,%function
+.align	5
+vec_prefetch:
+	add	x1, x1, x0
+	sub	x1, x1, #1
+	mov	x2, #64
+	prfm	pldl1keep, [x0]
+	add	x0, x0, x2
+	cmp	x0, x1
+	csel	x0, x1, x0, hi
+	csel	x2, xzr, x2, hi
+	prfm	pldl1keep, [x0]
+	add	x0, x0, x2
+	cmp	x0, x1
+	csel	x0, x1, x0, hi
+	csel	x2, xzr, x2, hi
+	prfm	pldl1keep, [x0]
+	add	x0, x0, x2
+	cmp	x0, x1
+	csel	x0, x1, x0, hi
+	csel	x2, xzr, x2, hi
+	prfm	pldl1keep, [x0]
+	add	x0, x0, x2
+	cmp	x0, x1
+	csel	x0, x1, x0, hi
+	csel	x2, xzr, x2, hi
+	prfm	pldl1keep, [x0]
+	add	x0, x0, x2
+	cmp	x0, x1
+	csel	x0, x1, x0, hi
+	csel	x2, xzr, x2, hi
+	prfm	pldl1keep, [x0]
+	add	x0, x0, x2
+	cmp	x0, x1
+	csel	x0, x1, x0, hi
+	prfm	pldl1keep, [x0]
+	ret
+.size	vec_prefetch,.-vec_prefetch
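
The vec_select_* routines above choose between two equally sized operands
without branching: the selector is compared against zero to build an
all-ones/all-zeros mask, and that mask gates each byte of the two inputs
(NEON BIT here, PAND/POR in the SSE2 versions that follow). A byte-wise C
sketch of the same idea (parameter names are illustrative, and again the
C compiler gives no constant-time guarantee):

#include <stddef.h>
#include <stdint.h>

/* ret = sel_a ? a : b over num bytes, using a mask instead of a branch
 * on sel_a. */
static void vec_select_sketch(void *ret, const void *a, const void *b,
                              size_t num, uint64_t sel_a)
{
    /* 0xFF when sel_a is non-zero, 0x00 otherwise */
    unsigned char mask = (unsigned char)(0 - (uint64_t)(sel_a != 0));
    const unsigned char *pa = a, *pb = b;
    unsigned char *pr = ret;

    for (size_t i = 0; i < num; i++)
        pr[i] = (pa[i] & mask) | (pb[i] & ~mask);
}
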
diff --git a/blst/elf/add_mod_384-x86_64.s b/blst/elf/add_mod_384-x86_64.s
new file mode 100644
index 0000000..df61986
--- /dev/null
+++ b/blst/elf/add_mod_384-x86_64.s
@@ -0,0 +1,1809 @@
+.text	
+
+.globl	add_mod_384
+.hidden	add_mod_384
+.type	add_mod_384,@function
+.align	32
+add_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	call	__add_mod_384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	add_mod_384,.-add_mod_384
+
+.type	__add_mod_384,@function
+.align	32
+__add_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+__add_mod_384_a_is_loaded:
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	movq	%r8,%r14
+	adcq	24(%rdx),%r11
+	movq	%r9,%r15
+	adcq	32(%rdx),%r12
+	movq	%r10,%rax
+	adcq	40(%rdx),%r13
+	movq	%r11,%rbx
+	sbbq	%rdx,%rdx
+
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	movq	%r12,%rbp
+	sbbq	16(%rcx),%r10
+	sbbq	24(%rcx),%r11
+	sbbq	32(%rcx),%r12
+	movq	%r13,%rsi
+	sbbq	40(%rcx),%r13
+	sbbq	$0,%rdx
+
+	cmovcq	%r14,%r8
+	cmovcq	%r15,%r9
+	cmovcq	%rax,%r10
+	movq	%r8,0(%rdi)
+	cmovcq	%rbx,%r11
+	movq	%r9,8(%rdi)
+	cmovcq	%rbp,%r12
+	movq	%r10,16(%rdi)
+	cmovcq	%rsi,%r13
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__add_mod_384,.-__add_mod_384
+
+.globl	add_mod_384x
+.hidden	add_mod_384x
+.type	add_mod_384x,@function
+.align	32
+add_mod_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$24,%rsp
+.cfi_adjust_cfa_offset	24
+
+
+	movq	%rsi,0(%rsp)
+	movq	%rdx,8(%rsp)
+	leaq	48(%rsi),%rsi
+	leaq	48(%rdx),%rdx
+	leaq	48(%rdi),%rdi
+	call	__add_mod_384
+
+	movq	0(%rsp),%rsi
+	movq	8(%rsp),%rdx
+	leaq	-48(%rdi),%rdi
+	call	__add_mod_384
+
+	movq	24+0(%rsp),%r15
+.cfi_restore	%r15
+	movq	24+8(%rsp),%r14
+.cfi_restore	%r14
+	movq	24+16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24+24(%rsp),%r12
+.cfi_restore	%r12
+	movq	24+32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	24+40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24+48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	add_mod_384x,.-add_mod_384x
+
+
+.globl	rshift_mod_384
+.hidden	rshift_mod_384
+.type	rshift_mod_384,@function
+.align	32
+rshift_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+.Loop_rshift_mod_384:
+	call	__rshift_mod_384
+	decl	%edx
+	jnz	.Loop_rshift_mod_384
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rshift_mod_384,.-rshift_mod_384
+
+.type	__rshift_mod_384,@function
+.align	32
+__rshift_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	$1,%rsi
+	movq	0(%rcx),%r14
+	andq	%r8,%rsi
+	movq	8(%rcx),%r15
+	negq	%rsi
+	movq	16(%rcx),%rax
+	andq	%rsi,%r14
+	movq	24(%rcx),%rbx
+	andq	%rsi,%r15
+	movq	32(%rcx),%rbp
+	andq	%rsi,%rax
+	andq	%rsi,%rbx
+	andq	%rsi,%rbp
+	andq	40(%rcx),%rsi
+
+	addq	%r8,%r14
+	adcq	%r9,%r15
+	adcq	%r10,%rax
+	adcq	%r11,%rbx
+	adcq	%r12,%rbp
+	adcq	%r13,%rsi
+	sbbq	%r13,%r13
+
+	shrq	$1,%r14
+	movq	%r15,%r8
+	shrq	$1,%r15
+	movq	%rax,%r9
+	shrq	$1,%rax
+	movq	%rbx,%r10
+	shrq	$1,%rbx
+	movq	%rbp,%r11
+	shrq	$1,%rbp
+	movq	%rsi,%r12
+	shrq	$1,%rsi
+	shlq	$63,%r8
+	shlq	$63,%r9
+	orq	%r14,%r8
+	shlq	$63,%r10
+	orq	%r15,%r9
+	shlq	$63,%r11
+	orq	%rax,%r10
+	shlq	$63,%r12
+	orq	%rbx,%r11
+	shlq	$63,%r13
+	orq	%rbp,%r12
+	orq	%rsi,%r13
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__rshift_mod_384,.-__rshift_mod_384
+
+.globl	div_by_2_mod_384
+.hidden	div_by_2_mod_384
+.type	div_by_2_mod_384,@function
+.align	32
+div_by_2_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	%rdx,%rcx
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	call	__rshift_mod_384
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	div_by_2_mod_384,.-div_by_2_mod_384
+
+
+.globl	lshift_mod_384
+.hidden	lshift_mod_384
+.type	lshift_mod_384,@function
+.align	32
+lshift_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+.Loop_lshift_mod_384:
+	addq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	movq	%r8,%r14
+	adcq	%r11,%r11
+	movq	%r9,%r15
+	adcq	%r12,%r12
+	movq	%r10,%rax
+	adcq	%r13,%r13
+	movq	%r11,%rbx
+	sbbq	%rdi,%rdi
+
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	movq	%r12,%rbp
+	sbbq	16(%rcx),%r10
+	sbbq	24(%rcx),%r11
+	sbbq	32(%rcx),%r12
+	movq	%r13,%rsi
+	sbbq	40(%rcx),%r13
+	sbbq	$0,%rdi
+
+	movq	(%rsp),%rdi
+	cmovcq	%r14,%r8
+	cmovcq	%r15,%r9
+	cmovcq	%rax,%r10
+	cmovcq	%rbx,%r11
+	cmovcq	%rbp,%r12
+	cmovcq	%rsi,%r13
+
+	decl	%edx
+	jnz	.Loop_lshift_mod_384
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	lshift_mod_384,.-lshift_mod_384
+
+.type	__lshift_mod_384,@function
+.align	32
+__lshift_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	addq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	movq	%r8,%r14
+	adcq	%r11,%r11
+	movq	%r9,%r15
+	adcq	%r12,%r12
+	movq	%r10,%rax
+	adcq	%r13,%r13
+	movq	%r11,%rbx
+	sbbq	%rdx,%rdx
+
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	movq	%r12,%rbp
+	sbbq	16(%rcx),%r10
+	sbbq	24(%rcx),%r11
+	sbbq	32(%rcx),%r12
+	movq	%r13,%rsi
+	sbbq	40(%rcx),%r13
+	sbbq	$0,%rdx
+
+	cmovcq	%r14,%r8
+	cmovcq	%r15,%r9
+	cmovcq	%rax,%r10
+	cmovcq	%rbx,%r11
+	cmovcq	%rbp,%r12
+	cmovcq	%rsi,%r13
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__lshift_mod_384,.-__lshift_mod_384
+
+
+.globl	mul_by_3_mod_384
+.hidden	mul_by_3_mod_384
+.type	mul_by_3_mod_384,@function
+.align	32
+mul_by_3_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	%rdx,%rcx
+
+	call	__lshift_mod_384
+
+	movq	(%rsp),%rdx
+	call	__add_mod_384_a_is_loaded
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_by_3_mod_384,.-mul_by_3_mod_384
+
+.globl	mul_by_8_mod_384
+.hidden	mul_by_8_mod_384
+.type	mul_by_8_mod_384,@function
+.align	32
+mul_by_8_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	%rdx,%rcx
+
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_by_8_mod_384,.-mul_by_8_mod_384
+
+
+.globl	mul_by_3_mod_384x
+.hidden	mul_by_3_mod_384x
+.type	mul_by_3_mod_384x,@function
+.align	32
+mul_by_3_mod_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	%rdx,%rcx
+
+	call	__lshift_mod_384
+
+	movq	(%rsp),%rdx
+	call	__add_mod_384_a_is_loaded
+
+	movq	(%rsp),%rsi
+	leaq	48(%rdi),%rdi
+
+	movq	48(%rsi),%r8
+	movq	56(%rsi),%r9
+	movq	64(%rsi),%r10
+	movq	72(%rsi),%r11
+	movq	80(%rsi),%r12
+	movq	88(%rsi),%r13
+
+	call	__lshift_mod_384
+
+	movq	$48,%rdx
+	addq	(%rsp),%rdx
+	call	__add_mod_384_a_is_loaded
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_by_3_mod_384x,.-mul_by_3_mod_384x
+
+.globl	mul_by_8_mod_384x
+.hidden	mul_by_8_mod_384x
+.type	mul_by_8_mod_384x,@function
+.align	32
+mul_by_8_mod_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	%rdx,%rcx
+
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+
+	movq	(%rsp),%rsi
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	movq	48+0(%rsi),%r8
+	movq	48+8(%rsi),%r9
+	movq	48+16(%rsi),%r10
+	movq	48+24(%rsi),%r11
+	movq	48+32(%rsi),%r12
+	movq	48+40(%rsi),%r13
+
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+	call	__lshift_mod_384
+
+	movq	%r8,48+0(%rdi)
+	movq	%r9,48+8(%rdi)
+	movq	%r10,48+16(%rdi)
+	movq	%r11,48+24(%rdi)
+	movq	%r12,48+32(%rdi)
+	movq	%r13,48+40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_by_8_mod_384x,.-mul_by_8_mod_384x
+
+
+.globl	cneg_mod_384
+.hidden	cneg_mod_384
+.type	cneg_mod_384,@function
+.align	32
+cneg_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdx
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	%rdx,%r8
+	movq	24(%rsi),%r11
+	orq	%r9,%rdx
+	movq	32(%rsi),%r12
+	orq	%r10,%rdx
+	movq	40(%rsi),%r13
+	orq	%r11,%rdx
+	movq	$-1,%rsi
+	orq	%r12,%rdx
+	orq	%r13,%rdx
+
+	movq	0(%rcx),%r14
+	cmovnzq	%rsi,%rdx
+	movq	8(%rcx),%r15
+	movq	16(%rcx),%rax
+	andq	%rdx,%r14
+	movq	24(%rcx),%rbx
+	andq	%rdx,%r15
+	movq	32(%rcx),%rbp
+	andq	%rdx,%rax
+	movq	40(%rcx),%rsi
+	andq	%rdx,%rbx
+	movq	0(%rsp),%rcx
+	andq	%rdx,%rbp
+	andq	%rdx,%rsi
+
+	subq	%r8,%r14
+	sbbq	%r9,%r15
+	sbbq	%r10,%rax
+	sbbq	%r11,%rbx
+	sbbq	%r12,%rbp
+	sbbq	%r13,%rsi
+
+	orq	%rcx,%rcx
+
+	cmovzq	%r8,%r14
+	cmovzq	%r9,%r15
+	cmovzq	%r10,%rax
+	movq	%r14,0(%rdi)
+	cmovzq	%r11,%rbx
+	movq	%r15,8(%rdi)
+	cmovzq	%r12,%rbp
+	movq	%rax,16(%rdi)
+	cmovzq	%r13,%rsi
+	movq	%rbx,24(%rdi)
+	movq	%rbp,32(%rdi)
+	movq	%rsi,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	cneg_mod_384,.-cneg_mod_384
+
+
+.globl	sub_mod_384
+.hidden	sub_mod_384
+.type	sub_mod_384,@function
+.align	32
+sub_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	call	__sub_mod_384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sub_mod_384,.-sub_mod_384
+
+.type	__sub_mod_384,@function
+.align	32
+__sub_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	subq	0(%rdx),%r8
+	movq	0(%rcx),%r14
+	sbbq	8(%rdx),%r9
+	movq	8(%rcx),%r15
+	sbbq	16(%rdx),%r10
+	movq	16(%rcx),%rax
+	sbbq	24(%rdx),%r11
+	movq	24(%rcx),%rbx
+	sbbq	32(%rdx),%r12
+	movq	32(%rcx),%rbp
+	sbbq	40(%rdx),%r13
+	movq	40(%rcx),%rsi
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%r14
+	andq	%rdx,%r15
+	andq	%rdx,%rax
+	andq	%rdx,%rbx
+	andq	%rdx,%rbp
+	andq	%rdx,%rsi
+
+	addq	%r14,%r8
+	adcq	%r15,%r9
+	movq	%r8,0(%rdi)
+	adcq	%rax,%r10
+	movq	%r9,8(%rdi)
+	adcq	%rbx,%r11
+	movq	%r10,16(%rdi)
+	adcq	%rbp,%r12
+	movq	%r11,24(%rdi)
+	adcq	%rsi,%r13
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sub_mod_384,.-__sub_mod_384
+
+.globl	sub_mod_384x
+.hidden	sub_mod_384x
+.type	sub_mod_384x,@function
+.align	32
+sub_mod_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$24,%rsp
+.cfi_adjust_cfa_offset	24
+
+
+	movq	%rsi,0(%rsp)
+	movq	%rdx,8(%rsp)
+	leaq	48(%rsi),%rsi
+	leaq	48(%rdx),%rdx
+	leaq	48(%rdi),%rdi
+	call	__sub_mod_384
+
+	movq	0(%rsp),%rsi
+	movq	8(%rsp),%rdx
+	leaq	-48(%rdi),%rdi
+	call	__sub_mod_384
+
+	movq	24+0(%rsp),%r15
+.cfi_restore	%r15
+	movq	24+8(%rsp),%r14
+.cfi_restore	%r14
+	movq	24+16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24+24(%rsp),%r12
+.cfi_restore	%r12
+	movq	24+32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	24+40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24+48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sub_mod_384x,.-sub_mod_384x
+.globl	mul_by_1_plus_i_mod_384x
+.hidden	mul_by_1_plus_i_mod_384x
+.type	mul_by_1_plus_i_mod_384x,@function
+.align	32
+mul_by_1_plus_i_mod_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$56,%rsp
+.cfi_adjust_cfa_offset	56
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%r8,%r14
+	addq	48(%rsi),%r8
+	movq	%r9,%r15
+	adcq	56(%rsi),%r9
+	movq	%r10,%rax
+	adcq	64(%rsi),%r10
+	movq	%r11,%rbx
+	adcq	72(%rsi),%r11
+	movq	%r12,%rcx
+	adcq	80(%rsi),%r12
+	movq	%r13,%rbp
+	adcq	88(%rsi),%r13
+	movq	%rdi,48(%rsp)
+	sbbq	%rdi,%rdi
+
+	subq	48(%rsi),%r14
+	sbbq	56(%rsi),%r15
+	sbbq	64(%rsi),%rax
+	sbbq	72(%rsi),%rbx
+	sbbq	80(%rsi),%rcx
+	sbbq	88(%rsi),%rbp
+	sbbq	%rsi,%rsi
+
+	movq	%r8,0(%rsp)
+	movq	0(%rdx),%r8
+	movq	%r9,8(%rsp)
+	movq	8(%rdx),%r9
+	movq	%r10,16(%rsp)
+	movq	16(%rdx),%r10
+	movq	%r11,24(%rsp)
+	movq	24(%rdx),%r11
+	movq	%r12,32(%rsp)
+	andq	%rsi,%r8
+	movq	32(%rdx),%r12
+	movq	%r13,40(%rsp)
+	andq	%rsi,%r9
+	movq	40(%rdx),%r13
+	andq	%rsi,%r10
+	andq	%rsi,%r11
+	andq	%rsi,%r12
+	andq	%rsi,%r13
+	movq	48(%rsp),%rsi
+
+	addq	%r8,%r14
+	movq	0(%rsp),%r8
+	adcq	%r9,%r15
+	movq	8(%rsp),%r9
+	adcq	%r10,%rax
+	movq	16(%rsp),%r10
+	adcq	%r11,%rbx
+	movq	24(%rsp),%r11
+	adcq	%r12,%rcx
+	movq	32(%rsp),%r12
+	adcq	%r13,%rbp
+	movq	40(%rsp),%r13
+
+	movq	%r14,0(%rsi)
+	movq	%r8,%r14
+	movq	%r15,8(%rsi)
+	movq	%rax,16(%rsi)
+	movq	%r9,%r15
+	movq	%rbx,24(%rsi)
+	movq	%rcx,32(%rsi)
+	movq	%r10,%rax
+	movq	%rbp,40(%rsi)
+
+	subq	0(%rdx),%r8
+	movq	%r11,%rbx
+	sbbq	8(%rdx),%r9
+	sbbq	16(%rdx),%r10
+	movq	%r12,%rcx
+	sbbq	24(%rdx),%r11
+	sbbq	32(%rdx),%r12
+	movq	%r13,%rbp
+	sbbq	40(%rdx),%r13
+	sbbq	$0,%rdi
+
+	cmovcq	%r14,%r8
+	cmovcq	%r15,%r9
+	cmovcq	%rax,%r10
+	movq	%r8,48(%rsi)
+	cmovcq	%rbx,%r11
+	movq	%r9,56(%rsi)
+	cmovcq	%rcx,%r12
+	movq	%r10,64(%rsi)
+	cmovcq	%rbp,%r13
+	movq	%r11,72(%rsi)
+	movq	%r12,80(%rsi)
+	movq	%r13,88(%rsi)
+
+	movq	56+0(%rsp),%r15
+.cfi_restore	%r15
+	movq	56+8(%rsp),%r14
+.cfi_restore	%r14
+	movq	56+16(%rsp),%r13
+.cfi_restore	%r13
+	movq	56+24(%rsp),%r12
+.cfi_restore	%r12
+	movq	56+32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	56+40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56+48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
+.globl	sgn0_pty_mod_384
+.hidden	sgn0_pty_mod_384
+.type	sgn0_pty_mod_384,@function
+.align	32
+sgn0_pty_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%rcx
+	movq	40(%rdi),%rdx
+
+	xorq	%rax,%rax
+	movq	%r8,%rdi
+	addq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%rcx,%rcx
+	adcq	%rdx,%rdx
+	adcq	$0,%rax
+
+	subq	0(%rsi),%r8
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	sbbq	24(%rsi),%r11
+	sbbq	32(%rsi),%rcx
+	sbbq	40(%rsi),%rdx
+	sbbq	$0,%rax
+
+	notq	%rax
+	andq	$1,%rdi
+	andq	$2,%rax
+	orq	%rdi,%rax
+
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sgn0_pty_mod_384,.-sgn0_pty_mod_384
+
+.globl	sgn0_pty_mod_384x
+.hidden	sgn0_pty_mod_384x
+.type	sgn0_pty_mod_384x,@function
+.align	32
+sgn0_pty_mod_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	48(%rdi),%r8
+	movq	56(%rdi),%r9
+	movq	64(%rdi),%r10
+	movq	72(%rdi),%r11
+	movq	80(%rdi),%rcx
+	movq	88(%rdi),%rdx
+
+	movq	%r8,%rbx
+	orq	%r9,%r8
+	orq	%r10,%r8
+	orq	%r11,%r8
+	orq	%rcx,%r8
+	orq	%rdx,%r8
+
+	leaq	0(%rdi),%rax
+	xorq	%rdi,%rdi
+	movq	%rbx,%rbp
+	addq	%rbx,%rbx
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%rcx,%rcx
+	adcq	%rdx,%rdx
+	adcq	$0,%rdi
+
+	subq	0(%rsi),%rbx
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	sbbq	24(%rsi),%r11
+	sbbq	32(%rsi),%rcx
+	sbbq	40(%rsi),%rdx
+	sbbq	$0,%rdi
+
+	movq	%r8,0(%rsp)
+	notq	%rdi
+	andq	$1,%rbp
+	andq	$2,%rdi
+	orq	%rbp,%rdi
+
+	movq	0(%rax),%r8
+	movq	8(%rax),%r9
+	movq	16(%rax),%r10
+	movq	24(%rax),%r11
+	movq	32(%rax),%rcx
+	movq	40(%rax),%rdx
+
+	movq	%r8,%rbx
+	orq	%r9,%r8
+	orq	%r10,%r8
+	orq	%r11,%r8
+	orq	%rcx,%r8
+	orq	%rdx,%r8
+
+	xorq	%rax,%rax
+	movq	%rbx,%rbp
+	addq	%rbx,%rbx
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%rcx,%rcx
+	adcq	%rdx,%rdx
+	adcq	$0,%rax
+
+	subq	0(%rsi),%rbx
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	sbbq	24(%rsi),%r11
+	sbbq	32(%rsi),%rcx
+	sbbq	40(%rsi),%rdx
+	sbbq	$0,%rax
+
+	movq	0(%rsp),%rbx
+
+	notq	%rax
+
+	testq	%r8,%r8
+	cmovzq	%rdi,%rbp
+
+	testq	%rbx,%rbx
+	cmovnzq	%rdi,%rax
+
+	andq	$1,%rbp
+	andq	$2,%rax
+	orq	%rbp,%rax
+
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
+.globl	vec_select_48
+.hidden	vec_select_48
+.type	vec_select_48,@function
+.align	32
+vec_select_48:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movd	%ecx,%xmm5
+	pxor	%xmm4,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	movdqu	(%rsi),%xmm0
+	leaq	24(%rsi),%rsi
+	pcmpeqd	%xmm4,%xmm5
+	movdqu	(%rdx),%xmm1
+	leaq	24(%rdx),%rdx
+	pcmpeqd	%xmm5,%xmm4
+	leaq	24(%rdi),%rdi
+	pand	%xmm4,%xmm0
+	movdqu	0+16-24(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	0+16-24(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,0-24(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	16+16-24(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	16+16-24(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,16-24(%rdi)
+	pand	%xmm4,%xmm0
+	pand	%xmm5,%xmm1
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,32-24(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	vec_select_48,.-vec_select_48
+.globl	vec_select_96
+.hidden	vec_select_96
+.type	vec_select_96,@function
+.align	32
+vec_select_96:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movd	%ecx,%xmm5
+	pxor	%xmm4,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	movdqu	(%rsi),%xmm0
+	leaq	48(%rsi),%rsi
+	pcmpeqd	%xmm4,%xmm5
+	movdqu	(%rdx),%xmm1
+	leaq	48(%rdx),%rdx
+	pcmpeqd	%xmm5,%xmm4
+	leaq	48(%rdi),%rdi
+	pand	%xmm4,%xmm0
+	movdqu	0+16-48(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	0+16-48(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,0-48(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	16+16-48(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	16+16-48(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,16-48(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	32+16-48(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	32+16-48(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,32-48(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	48+16-48(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	48+16-48(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,48-48(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	64+16-48(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	64+16-48(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,64-48(%rdi)
+	pand	%xmm4,%xmm2
+	pand	%xmm5,%xmm3
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,80-48(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	vec_select_96,.-vec_select_96
+.globl	vec_select_192
+.hidden	vec_select_192
+.type	vec_select_192,@function
+.align	32
+vec_select_192:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movd	%ecx,%xmm5
+	pxor	%xmm4,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	movdqu	(%rsi),%xmm0
+	leaq	96(%rsi),%rsi
+	pcmpeqd	%xmm4,%xmm5
+	movdqu	(%rdx),%xmm1
+	leaq	96(%rdx),%rdx
+	pcmpeqd	%xmm5,%xmm4
+	leaq	96(%rdi),%rdi
+	pand	%xmm4,%xmm0
+	movdqu	0+16-96(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	0+16-96(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,0-96(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	16+16-96(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	16+16-96(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,16-96(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	32+16-96(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	32+16-96(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,32-96(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	48+16-96(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	48+16-96(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,48-96(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	64+16-96(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	64+16-96(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,64-96(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	80+16-96(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	80+16-96(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,80-96(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	96+16-96(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	96+16-96(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,96-96(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	112+16-96(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	112+16-96(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,112-96(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	128+16-96(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	128+16-96(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,128-96(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	144+16-96(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	144+16-96(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,144-96(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	160+16-96(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	160+16-96(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,160-96(%rdi)
+	pand	%xmm4,%xmm2
+	pand	%xmm5,%xmm3
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,176-96(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	vec_select_192,.-vec_select_192
+.globl	vec_select_144
+.hidden	vec_select_144
+.type	vec_select_144,@function
+.align	32
+vec_select_144:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movd	%ecx,%xmm5
+	pxor	%xmm4,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	movdqu	(%rsi),%xmm0
+	leaq	72(%rsi),%rsi
+	pcmpeqd	%xmm4,%xmm5
+	movdqu	(%rdx),%xmm1
+	leaq	72(%rdx),%rdx
+	pcmpeqd	%xmm5,%xmm4
+	leaq	72(%rdi),%rdi
+	pand	%xmm4,%xmm0
+	movdqu	0+16-72(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	0+16-72(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,0-72(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	16+16-72(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	16+16-72(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,16-72(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	32+16-72(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	32+16-72(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,32-72(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	48+16-72(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	48+16-72(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,48-72(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	64+16-72(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	64+16-72(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,64-72(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	80+16-72(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	80+16-72(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,80-72(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	96+16-72(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	96+16-72(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,96-72(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	112+16-72(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	112+16-72(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,112-72(%rdi)
+	pand	%xmm4,%xmm0
+	pand	%xmm5,%xmm1
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,128-72(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	vec_select_144,.-vec_select_144
+.globl	vec_select_288
+.hidden	vec_select_288
+.type	vec_select_288,@function
+.align	32
+vec_select_288:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movd	%ecx,%xmm5
+	pxor	%xmm4,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	movdqu	(%rsi),%xmm0
+	leaq	144(%rsi),%rsi
+	pcmpeqd	%xmm4,%xmm5
+	movdqu	(%rdx),%xmm1
+	leaq	144(%rdx),%rdx
+	pcmpeqd	%xmm5,%xmm4
+	leaq	144(%rdi),%rdi
+	pand	%xmm4,%xmm0
+	movdqu	0+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	0+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,0-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	16+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	16+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,16-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	32+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	32+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,32-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	48+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	48+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,48-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	64+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	64+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,64-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	80+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	80+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,80-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	96+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	96+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,96-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	112+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	112+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,112-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	128+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	128+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,128-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	144+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	144+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,144-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	160+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	160+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,160-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	176+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	176+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,176-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	192+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	192+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,192-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	208+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	208+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,208-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	224+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	224+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,224-144(%rdi)
+	pand	%xmm4,%xmm2
+	movdqu	240+16-144(%rsi),%xmm0
+	pand	%xmm5,%xmm3
+	movdqu	240+16-144(%rdx),%xmm1
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,240-144(%rdi)
+	pand	%xmm4,%xmm0
+	movdqu	256+16-144(%rsi),%xmm2
+	pand	%xmm5,%xmm1
+	movdqu	256+16-144(%rdx),%xmm3
+	por	%xmm1,%xmm0
+	movdqu	%xmm0,256-144(%rdi)
+	pand	%xmm4,%xmm2
+	pand	%xmm5,%xmm3
+	por	%xmm3,%xmm2
+	movdqu	%xmm2,272-144(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	vec_select_288,.-vec_select_288
+.globl	vec_prefetch
+.hidden	vec_prefetch
+.type	vec_prefetch,@function
+.align	32
+vec_prefetch:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	leaq	-1(%rdi,%rsi,1),%rsi
+	movq	$64,%rax
+	xorq	%r8,%r8
+	prefetchnta	(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+	cmpq	%rsi,%rdi
+	cmovaq	%rsi,%rdi
+	cmovaq	%r8,%rax
+	prefetchnta	(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+	cmpq	%rsi,%rdi
+	cmovaq	%rsi,%rdi
+	cmovaq	%r8,%rax
+	prefetchnta	(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+	cmpq	%rsi,%rdi
+	cmovaq	%rsi,%rdi
+	cmovaq	%r8,%rax
+	prefetchnta	(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+	cmpq	%rsi,%rdi
+	cmovaq	%rsi,%rdi
+	cmovaq	%r8,%rax
+	prefetchnta	(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+	cmpq	%rsi,%rdi
+	cmovaq	%rsi,%rdi
+	cmovaq	%r8,%rax
+	prefetchnta	(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+	cmpq	%rsi,%rdi
+	cmovaq	%rsi,%rdi
+	prefetchnta	(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	vec_prefetch,.-vec_prefetch
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/add_mod_384x384-x86_64.s b/blst/elf/add_mod_384x384-x86_64.s
new file mode 100644
index 0000000..084f3d8
--- /dev/null
+++ b/blst/elf/add_mod_384x384-x86_64.s
@@ -0,0 +1,252 @@
+.text	
+
+.type	__add_mod_384x384,@function
+.align	32
+__add_mod_384x384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+
+	addq	0(%rdx),%r8
+	movq	56(%rsi),%r15
+	adcq	8(%rdx),%r9
+	movq	64(%rsi),%rax
+	adcq	16(%rdx),%r10
+	movq	72(%rsi),%rbx
+	adcq	24(%rdx),%r11
+	movq	80(%rsi),%rbp
+	adcq	32(%rdx),%r12
+	movq	88(%rsi),%rsi
+	adcq	40(%rdx),%r13
+	movq	%r8,0(%rdi)
+	adcq	48(%rdx),%r14
+	movq	%r9,8(%rdi)
+	adcq	56(%rdx),%r15
+	movq	%r10,16(%rdi)
+	adcq	64(%rdx),%rax
+	movq	%r12,32(%rdi)
+	movq	%r14,%r8
+	adcq	72(%rdx),%rbx
+	movq	%r11,24(%rdi)
+	movq	%r15,%r9
+	adcq	80(%rdx),%rbp
+	movq	%r13,40(%rdi)
+	movq	%rax,%r10
+	adcq	88(%rdx),%rsi
+	movq	%rbx,%r11
+	sbbq	%rdx,%rdx
+
+	subq	0(%rcx),%r14
+	sbbq	8(%rcx),%r15
+	movq	%rbp,%r12
+	sbbq	16(%rcx),%rax
+	sbbq	24(%rcx),%rbx
+	sbbq	32(%rcx),%rbp
+	movq	%rsi,%r13
+	sbbq	40(%rcx),%rsi
+	sbbq	$0,%rdx
+
+	cmovcq	%r8,%r14
+	cmovcq	%r9,%r15
+	cmovcq	%r10,%rax
+	movq	%r14,48(%rdi)
+	cmovcq	%r11,%rbx
+	movq	%r15,56(%rdi)
+	cmovcq	%r12,%rbp
+	movq	%rax,64(%rdi)
+	cmovcq	%r13,%rsi
+	movq	%rbx,72(%rdi)
+	movq	%rbp,80(%rdi)
+	movq	%rsi,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__add_mod_384x384,.-__add_mod_384x384
+
+.type	__sub_mod_384x384,@function
+.align	32
+__sub_mod_384x384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+
+	subq	0(%rdx),%r8
+	movq	56(%rsi),%r15
+	sbbq	8(%rdx),%r9
+	movq	64(%rsi),%rax
+	sbbq	16(%rdx),%r10
+	movq	72(%rsi),%rbx
+	sbbq	24(%rdx),%r11
+	movq	80(%rsi),%rbp
+	sbbq	32(%rdx),%r12
+	movq	88(%rsi),%rsi
+	sbbq	40(%rdx),%r13
+	movq	%r8,0(%rdi)
+	sbbq	48(%rdx),%r14
+	movq	0(%rcx),%r8
+	movq	%r9,8(%rdi)
+	sbbq	56(%rdx),%r15
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	sbbq	64(%rdx),%rax
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	sbbq	72(%rdx),%rbx
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	sbbq	80(%rdx),%rbp
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	sbbq	88(%rdx),%rsi
+	movq	40(%rcx),%r13
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%r8
+	andq	%rdx,%r9
+	andq	%rdx,%r10
+	andq	%rdx,%r11
+	andq	%rdx,%r12
+	andq	%rdx,%r13
+
+	addq	%r8,%r14
+	adcq	%r9,%r15
+	movq	%r14,48(%rdi)
+	adcq	%r10,%rax
+	movq	%r15,56(%rdi)
+	adcq	%r11,%rbx
+	movq	%rax,64(%rdi)
+	adcq	%r12,%rbp
+	movq	%rbx,72(%rdi)
+	adcq	%r13,%rsi
+	movq	%rbp,80(%rdi)
+	movq	%rsi,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.globl	add_mod_384x384
+.hidden	add_mod_384x384
+.type	add_mod_384x384,@function
+.align	32
+add_mod_384x384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	call	__add_mod_384x384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	add_mod_384x384,.-add_mod_384x384
+
+.globl	sub_mod_384x384
+.hidden	sub_mod_384x384
+.type	sub_mod_384x384,@function
+.align	32
+sub_mod_384x384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	call	__sub_mod_384x384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sub_mod_384x384,.-sub_mod_384x384
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/ct_inverse_mod_256-armv8.S b/blst/elf/ct_inverse_mod_256-armv8.S
new file mode 100644
index 0000000..347eb31
--- /dev/null
+++ b/blst/elf/ct_inverse_mod_256-armv8.S
@@ -0,0 +1,784 @@
+.text
+
+.globl	ct_inverse_mod_256
+.type	ct_inverse_mod_256, %function
+.align	5
+ct_inverse_mod_256:
+	.inst	0xd503233f
+	stp	x29, x30, [sp,#-80]!
+	add	x29, sp, #0
+	stp	x19, x20, [sp,#16]
+	stp	x21, x22, [sp,#32]
+	stp	x23, x24, [sp,#48]
+	stp	x25, x26, [sp,#64]
+	sub	sp, sp, #1040
+
+	ldp	x4, x5, [x1,#8*0]
+	ldp	x6, x7, [x1,#8*2]
+
+	add	x1, sp, #16+511	// find closest 512-byte-aligned spot
+	and	x1, x1, #-512	// in the frame...
+	str	x0, [sp]
+
+	ldp	x8, x9, [x2,#8*0]
+	ldp	x10, x11, [x2,#8*2]
+
+	stp	x4, x5, [x1,#8*0]	// copy input to |a|
+	stp	x6, x7, [x1,#8*2]
+	stp	x8, x9, [x1,#8*4]	// copy modulus to |b|
+	stp	x10, x11, [x1,#8*6]
+
+	////////////////////////////////////////// first iteration
+	bl	.Lab_approximation_31_256_loaded
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	str	x12,[x0,#8*8]		// initialize |u| with |f0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to dst |b|
+	bl	__smul_256_n_shift_by_31
+	str	x12, [x0,#8*9]		// initialize |v| with |f1|
+
+	////////////////////////////////////////// second iteration
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	ldr	x8, [x1,#8*8]		// |u|
+	ldr	x9, [x1,#8*13]	// |v|
+	madd	x4, x16, x8, xzr	// |u|*|f0|
+	madd	x4, x17, x9, x4	// |v|*|g0|
+	str	x4, [x0,#8*4]
+	asr	x5, x4, #63		// sign extension
+	stp	x5, x5, [x0,#8*5]
+	stp	x5, x5, [x0,#8*7]
+
+	madd	x4, x12, x8, xzr	// |u|*|f1|
+	madd	x4, x13, x9, x4	// |v|*|g1|
+	str	x4, [x0,#8*9]
+	asr	x5, x4, #63		// sign extension
+	stp	x5, x5, [x0,#8*10]
+	stp	x5, x5, [x0,#8*12]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	stp	x22, x22, [x0,#8*4]
+	stp	x22, x22, [x0,#8*6]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	stp	x22, x22, [x0,#8*4]
+	stp	x22, x22, [x0,#8*6]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	stp	x22, x22, [x0,#8*4]
+	stp	x22, x22, [x0,#8*6]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	stp	x22, x22, [x0,#8*4]
+	stp	x22, x22, [x0,#8*6]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	stp	x22, x22, [x0,#8*4]
+	stp	x22, x22, [x0,#8*6]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	stp	x22, x22, [x0,#8*4]
+	stp	x22, x22, [x0,#8*6]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	bl	__ab_approximation_31_256
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_256_n_shift_by_31
+	mov	x16, x12			// corrected |f0|
+	mov	x17, x13			// corrected |g0|
+
+	mov	x12, x14			// |f1|
+	mov	x13, x15			// |g1|
+	add	x0, x0, #8*4	// pointer to destination |b|
+	bl	__smul_256_n_shift_by_31
+
+	add	x0, x0, #8*4	// pointer to destination |u|
+	bl	__smul_256x63
+	adc	x22, x22, x23
+	str	x22, [x0,#8*4]
+
+	mov	x16, x12			// corrected |f1|
+	mov	x17, x13			// corrected |g1|
+	add	x0, x0, #8*5	// pointer to destination |v|
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	////////////////////////////////////////// two[!] last iterations
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #47			// 31 + 512 % 31
+	//bl	__ab_approximation_62_256	// |a| and |b| are exact,
+	ldr	x7, [x1,#8*0]		// just load
+	ldr	x11, [x1,#8*4]
+	bl	__inner_loop_62_256
+
+	mov	x16, x14
+	mov	x17, x15
+	ldr	x0, [sp]			// original out_ptr
+	bl	__smul_256x63
+	bl	__smul_512x63_tail
+	ldr	x30, [x29,#8]
+
+	smulh	x20, x7, x17		// figure out top-most limb
+	ldp	x8, x9, [x3,#8*0]
+	adc	x23, x23, x25
+	ldp	x10, x11, [x3,#8*2]
+
+	add	x20, x20, x23		// x20 is 1, 0 or -1
+	asr	x19, x20, #63		// sign as mask
+
+	and	x23,   x8, x19		// add mod<<256 conditionally
+	and	x24,   x9, x19
+	adds	x4, x4, x23
+	and	x25,   x10, x19
+	adcs	x5, x5, x24
+	and	x26,   x11, x19
+	adcs	x6, x6, x25
+	adcs	x7, x22,   x26
+	adc	x20, x20, xzr		// x20 is 1, 0 or -1
+
+	neg	x19, x20
+	orr	x20, x20, x19		// excess bit or sign as mask
+	asr	x19, x19, #63		// excess bit as mask
+
+	and	x8, x8, x20		// mask |mod|
+	and	x9, x9, x20
+	and	x10, x10, x20
+	and	x11, x11, x20
+
+	eor	x8, x8, x19		// conditionally negate |mod|
+	eor	x9, x9, x19
+	adds	x8, x8, x19, lsr#63
+	eor	x10, x10, x19
+	adcs	x9, x9, xzr
+	eor	x11, x11, x19
+	adcs	x10, x10, xzr
+	adc	x11, x11, xzr
+
+	adds	x4, x4, x8	// final adjustment for |mod|<<256
+	adcs	x5, x5, x9
+	adcs	x6, x6, x10
+	stp	x4, x5, [x0,#8*4]
+	adc	x7, x7, x11
+	stp	x6, x7, [x0,#8*6]
+
+	add	sp, sp, #1040
+	ldp	x19, x20, [x29,#16]
+	ldp	x21, x22, [x29,#32]
+	ldp	x23, x24, [x29,#48]
+	ldp	x25, x26, [x29,#64]
+	ldr	x29, [sp],#80
+	.inst	0xd50323bf
+	ret
+.size	ct_inverse_mod_256,.-ct_inverse_mod_256
+
+////////////////////////////////////////////////////////////////////////
+.type	__smul_256x63, %function
+.align	5
+__smul_256x63:
+	ldp	x4, x5, [x1,#8*0+64]	// load |u| (or |v|)
+	asr	x14, x16, #63		// |f_|'s sign as mask (or |g_|'s)
+	ldp	x6, x7, [x1,#8*2+64]
+	eor	x16, x16, x14		// conditionally negate |f_| (or |g_|)
+	ldr	x22, [x1,#8*4+64]
+
+	eor	x4, x4, x14	// conditionally negate |u| (or |v|)
+	sub	x16, x16, x14
+	eor	x5, x5, x14
+	adds	x4, x4, x14, lsr#63
+	eor	x6, x6, x14
+	adcs	x5, x5, xzr
+	eor	x7, x7, x14
+	adcs	x6, x6, xzr
+	eor	x22, x22, x14
+	umulh	x19, x4, x16
+	adcs	x7, x7, xzr
+	umulh	x20, x5, x16
+	adcs	x22, x22, xzr
+	umulh	x21, x6, x16
+	mul	x4, x4, x16
+	cmp	x16, #0
+	mul	x5, x5, x16
+	csel	x22, x22, xzr, ne
+	mul	x6, x6, x16
+	adds	x5, x5, x19
+	mul	x24, x7, x16
+	adcs	x6, x6, x20
+	adcs	x24, x24, x21
+	adc	x26, xzr, xzr
+	ldp	x8, x9, [x1,#8*0+104]	// load |u| (or |v|)
+	asr	x14, x17, #63		// |f_|'s sign as mask (or |g_|'s)
+	ldp	x10, x11, [x1,#8*2+104]
+	eor	x17, x17, x14		// conditionally negate |f_| (or |g_|)
+	ldr	x23, [x1,#8*4+104]
+
+	eor	x8, x8, x14	// conditionally negate |u| (or |v|)
+	sub	x17, x17, x14
+	eor	x9, x9, x14
+	adds	x8, x8, x14, lsr#63
+	eor	x10, x10, x14
+	adcs	x9, x9, xzr
+	eor	x11, x11, x14
+	adcs	x10, x10, xzr
+	eor	x23, x23, x14
+	umulh	x19, x8, x17
+	adcs	x11, x11, xzr
+	umulh	x20, x9, x17
+	adcs	x23, x23, xzr
+	umulh	x21, x10, x17
+	adc	x15, xzr, xzr		// used in __smul_512x63_tail
+	mul	x8, x8, x17
+	cmp	x17, #0
+	mul	x9, x9, x17
+	csel	x23, x23, xzr, ne
+	mul	x10, x10, x17
+	adds	x9, x9, x19
+	mul	x25, x11, x17
+	adcs	x10, x10, x20
+	adcs	x25, x25, x21
+	adc	x26, x26, xzr
+
+	adds	x4, x4, x8
+	adcs	x5, x5, x9
+	adcs	x6, x6, x10
+	stp	x4, x5, [x0,#8*0]
+	adcs	x24,   x24,   x25
+	stp	x6, x24, [x0,#8*2]
+
+	ret
+.size	__smul_256x63,.-__smul_256x63
+
+.type	__smul_512x63_tail, %function
+.align	5
+__smul_512x63_tail:
+	umulh	x24, x7, x16
+	ldp	x5, x6, [x1,#8*18]	// load rest of |v|
+	adc	x26, x26, xzr
+	ldr	x7, [x1,#8*20]
+	and	x22, x22, x16
+
+	umulh	x11, x11, x17	// resume |v|*|g1| chain
+
+	sub	x24, x24, x22	// tie up |u|*|f1| chain
+	asr	x25, x24, #63
+
+	eor	x5, x5, x14	// conditionally negate rest of |v|
+	eor	x6, x6, x14
+	adds	x5, x5, x15
+	eor	x7, x7, x14
+	adcs	x6, x6, xzr
+	umulh	x19, x23,   x17
+	adc	x7, x7, xzr
+	umulh	x20, x5, x17
+	add	x11, x11, x26
+	umulh	x21, x6, x17
+
+	mul	x4, x23,   x17
+	mul	x5, x5, x17
+	adds	x4, x4, x11
+	mul	x6, x6, x17
+	adcs	x5, x5, x19
+	mul	x22,   x7, x17
+	adcs	x6, x6, x20
+	adcs	x22,   x22,   x21
+	adc	x23, xzr, xzr		// used in the final step
+
+	adds	x4, x4, x24
+	adcs	x5, x5, x25
+	adcs	x6, x6, x25
+	stp	x4, x5, [x0,#8*4]
+	adcs	x22,   x22,   x25	// carry is used in the final step
+	stp	x6, x22,   [x0,#8*6]
+
+	ret
+.size	__smul_512x63_tail,.-__smul_512x63_tail
+
+.type	__smul_256_n_shift_by_31, %function
+.align	5
+__smul_256_n_shift_by_31:
+	ldp	x4, x5, [x1,#8*0+0]	// load |a| (or |b|)
+	asr	x24, x12, #63		// |f0|'s sign as mask (or |g0|'s)
+	ldp	x6, x7, [x1,#8*2+0]
+	eor	x25, x12, x24	// conditionally negate |f0| (or |g0|)
+
+	eor	x4, x4, x24	// conditionally negate |a| (or |b|)
+	sub	x25, x25, x24
+	eor	x5, x5, x24
+	adds	x4, x4, x24, lsr#63
+	eor	x6, x6, x24
+	adcs	x5, x5, xzr
+	eor	x7, x7, x24
+	umulh	x19, x4, x25
+	adcs	x6, x6, xzr
+	umulh	x20, x5, x25
+	adc	x7, x7, xzr
+	umulh	x21, x6, x25
+	and	x24, x24, x25
+	umulh	x22, x7, x25
+	neg	x24, x24
+
+	mul	x4, x4, x25
+	mul	x5, x5, x25
+	mul	x6, x6, x25
+	adds	x5, x5, x19
+	mul	x7, x7, x25
+	adcs	x6, x6, x20
+	adcs	x7, x7, x21
+	adc	x22, x22, x24
+	ldp	x8, x9, [x1,#8*0+32]	// load |a| (or |b|)
+	asr	x24, x13, #63		// |f0|'s sign as mask (or |g0|'s)
+	ldp	x10, x11, [x1,#8*2+32]
+	eor	x25, x13, x24	// conditionally negate |f0| (or |g0|)
+
+	eor	x8, x8, x24	// conditionally negate |a| (or |b|)
+	sub	x25, x25, x24
+	eor	x9, x9, x24
+	adds	x8, x8, x24, lsr#63
+	eor	x10, x10, x24
+	adcs	x9, x9, xzr
+	eor	x11, x11, x24
+	umulh	x19, x8, x25
+	adcs	x10, x10, xzr
+	umulh	x20, x9, x25
+	adc	x11, x11, xzr
+	umulh	x21, x10, x25
+	and	x24, x24, x25
+	umulh	x23, x11, x25
+	neg	x24, x24
+
+	mul	x8, x8, x25
+	mul	x9, x9, x25
+	mul	x10, x10, x25
+	adds	x9, x9, x19
+	mul	x11, x11, x25
+	adcs	x10, x10, x20
+	adcs	x11, x11, x21
+	adc	x23, x23, x24
+	adds	x4, x4, x8
+	adcs	x5, x5, x9
+	adcs	x6, x6, x10
+	adcs	x7, x7, x11
+	adc	x8, x22,   x23
+
+	extr	x4, x5, x4, #31
+	extr	x5, x6, x5, #31
+	extr	x6, x7, x6, #31
+	asr	x23, x8, #63	// result's sign as mask
+	extr	x7, x8, x7, #31
+
+	eor	x4, x4, x23	// ensure the result is positive
+	eor	x5, x5, x23
+	adds	x4, x4, x23, lsr#63
+	eor	x6, x6, x23
+	adcs	x5, x5, xzr
+	eor	x7, x7, x23
+	adcs	x6, x6, xzr
+	stp	x4, x5, [x0,#8*0]
+	adc	x7, x7, xzr
+	stp	x6, x7, [x0,#8*2]
+
+	eor	x12, x12, x23		// adjust |f/g| accordingly
+	eor	x13, x13, x23
+	sub	x12, x12, x23
+	sub	x13, x13, x23
+
+	ret
+.size	__smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
+.type	__ab_approximation_31_256, %function
+.align	4
+__ab_approximation_31_256:
+	ldp	x6, x7, [x1,#8*2]
+	ldp	x10, x11, [x1,#8*6]
+	ldp	x4, x5, [x1,#8*0]
+	ldp	x8, x9, [x1,#8*4]
+
+.Lab_approximation_31_256_loaded:
+	orr	x19, x7, x11	// check top-most limbs, ...
+	cmp	x19, #0
+	csel	x7, x7, x6, ne
+	csel	x11, x11, x10, ne
+	csel	x6, x6, x5, ne
+	orr	x19, x7, x11	// and ones before top-most, ...
+	csel	x10, x10, x9, ne
+
+	cmp	x19, #0
+	csel	x7, x7, x6, ne
+	csel	x11, x11, x10, ne
+	csel	x6, x6, x4, ne
+	orr	x19, x7, x11	// and one more, ...
+	csel	x10, x10, x8, ne
+
+	clz	x19, x19
+	cmp	x19, #64
+	csel	x19, x19, xzr, ne
+	csel	x7, x7, x6, ne
+	csel	x11, x11, x10, ne
+	neg	x20, x19
+
+	lslv	x7, x7, x19	// align high limbs to the left
+	lslv	x11, x11, x19
+	lsrv	x6, x6, x20
+	lsrv	x10, x10, x20
+	and	x6, x6, x20, asr#6
+	and	x10, x10, x20, asr#6
+	orr	x7, x7, x6
+	orr	x11, x11, x10
+
+	bfxil	x7, x4, #0, #31
+	bfxil	x11, x8, #0, #31
+
+	b	__inner_loop_31_256
+	ret
+.size	__ab_approximation_31_256,.-__ab_approximation_31_256
+
+.type	__inner_loop_31_256, %function
+.align	4
+__inner_loop_31_256:
+	mov	x2, #31
+	mov	x13, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
+	mov	x15, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
+	mov	x23,#0x7FFFFFFF7FFFFFFF
+
+.Loop_31_256:
+	sbfx	x22, x7, #0, #1	// if |a_| is odd, then we'll be subtracting
+	sub	x2, x2, #1
+	and	x19, x11, x22
+	sub	x20, x11, x7	// |b_|-|a_|
+	subs	x21, x7, x19	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	mov	x19, x15
+	csel	x11, x11, x7, hs	// |b_| = |a_|
+	csel	x7, x21, x20, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	x15, x15, x13,    hs	// exchange |fg0| and |fg1|
+	csel	x13, x13, x19,   hs
+	lsr	x7, x7, #1
+	and	x19, x15, x22
+	and	x20, x23, x22
+	sub	x13, x13, x19	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	add	x15, x15, x15	// |f1|<<=1
+	add	x13, x13, x20
+	sub	x15, x15, x23
+	cbnz	x2, .Loop_31_256
+
+	mov	x23, #0x7FFFFFFF
+	ubfx	x12, x13, #0, #32
+	ubfx	x13, x13, #32, #32
+	ubfx	x14, x15, #0, #32
+	ubfx	x15, x15, #32, #32
+	sub	x12, x12, x23		// remove bias
+	sub	x13, x13, x23
+	sub	x14, x14, x23
+	sub	x15, x15, x23
+
+	ret
+.size	__inner_loop_31_256,.-__inner_loop_31_256
+
+.type	__inner_loop_62_256, %function
+.align	4
+__inner_loop_62_256:
+	mov	x12, #1		// |f0|=1
+	mov	x13, #0		// |g0|=0
+	mov	x14, #0		// |f1|=0
+	mov	x15, #1		// |g1|=1
+
+.Loop_62_256:
+	sbfx	x22, x7, #0, #1	// if |a_| is odd, then we'll be subtracting
+	sub	x2, x2, #1
+	and	x19, x11, x22
+	sub	x20, x11, x7	// |b_|-|a_|
+	subs	x21, x7, x19	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	mov	x19, x12
+	csel	x11, x11, x7, hs	// |b_| = |a_|
+	csel	x7, x21, x20, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	mov	x20, x13
+	csel	x12, x12, x14,       hs	// exchange |f0| and |f1|
+	csel	x14, x14, x19,     hs
+	csel	x13, x13, x15,       hs	// exchange |g0| and |g1|
+	csel	x15, x15, x20,     hs
+	lsr	x7, x7, #1
+	and	x19, x14, x22
+	and	x20, x15, x22
+	add	x14, x14, x14		// |f1|<<=1
+	add	x15, x15, x15		// |g1|<<=1
+	sub	x12, x12, x19		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	x13, x13, x20		// |g0|-=|g1| (or |g0-=0| ...)
+	cbnz	x2, .Loop_62_256
+
+	ret
+.size	__inner_loop_62_256,.-__inner_loop_62_256
diff --git a/blst/elf/ct_inverse_mod_256-x86_64.s b/blst/elf/ct_inverse_mod_256-x86_64.s
new file mode 100644
index 0000000..c4d8d6d
--- /dev/null
+++ b/blst/elf/ct_inverse_mod_256-x86_64.s
@@ -0,0 +1,1185 @@
+.text	
+
+.globl	ct_inverse_mod_256
+.type	ct_inverse_mod_256,@function
+.align	32
+ct_inverse_mod_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$1072,%rsp
+.cfi_adjust_cfa_offset	1072
+
+
+	leaq	48+511(%rsp),%rax
+	andq	$-512,%rax
+	movq	%rdi,32(%rsp)
+	movq	%rcx,40(%rsp)
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+
+	movq	0(%rdx),%r12
+	movq	8(%rdx),%r13
+	movq	16(%rdx),%r14
+	movq	24(%rdx),%r15
+
+	movq	%r8,0(%rax)
+	movq	%r9,8(%rax)
+	movq	%r10,16(%rax)
+	movq	%r11,24(%rax)
+
+	movq	%r12,32(%rax)
+	movq	%r13,40(%rax)
+	movq	%r14,48(%rax)
+	movq	%r15,56(%rax)
+	movq	%rax,%rsi
+
+
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+
+
+	movq	%rdx,64(%rdi)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+
+
+	movq	%rdx,72(%rdi)
+
+
+	xorq	$256,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+
+
+
+	movq	64(%rsi),%r8
+	movq	104(%rsi),%r12
+	movq	%r8,%r9
+	imulq	0(%rsp),%r8
+	movq	%r12,%r13
+	imulq	8(%rsp),%r12
+	addq	%r12,%r8
+	movq	%r8,32(%rdi)
+	sarq	$63,%r8
+	movq	%r8,40(%rdi)
+	movq	%r8,48(%rdi)
+	movq	%r8,56(%rdi)
+	movq	%r8,64(%rdi)
+	leaq	64(%rsi),%rsi
+
+	imulq	%rdx,%r9
+	imulq	%rcx,%r13
+	addq	%r13,%r9
+	movq	%r9,72(%rdi)
+	sarq	$63,%r9
+	movq	%r9,80(%rdi)
+	movq	%r9,88(%rdi)
+	movq	%r9,96(%rdi)
+	movq	%r9,104(%rdi)
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_256x63
+	sarq	$63,%rbp
+	movq	%rbp,40(%rdi)
+	movq	%rbp,48(%rdi)
+	movq	%rbp,56(%rdi)
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_512x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_512x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_512x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_512x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_512x63
+	xorq	$256+64,%rsi
+	movl	$31,%edx
+	call	__ab_approximation_31_256
+
+
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,0(%rsp)
+	movq	%rcx,8(%rsp)
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	32(%rdi),%rdi
+	call	__smulq_256_n_shift_by_31
+	movq	%rdx,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	64(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	call	__smulq_256x63
+
+	movq	16(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	leaq	40(%rdi),%rdi
+	call	__smulq_512x63
+
+	xorq	$256+64,%rsi
+	movl	$47,%edx
+
+	movq	0(%rsi),%r8
+
+	movq	32(%rsi),%r10
+
+	call	__inner_loop_62_256
+
+
+
+
+
+
+
+	leaq	64(%rsi),%rsi
+
+
+
+
+
+	movq	%r12,%rdx
+	movq	%r13,%rcx
+	movq	32(%rsp),%rdi
+	call	__smulq_512x63
+	adcq	%rbp,%rdx
+
+	movq	40(%rsp),%rsi
+	movq	%rdx,%rax
+	sarq	$63,%rdx
+
+	movq	%rdx,%r8
+	movq	%rdx,%r9
+	andq	0(%rsi),%r8
+	movq	%rdx,%r10
+	andq	8(%rsi),%r9
+	andq	16(%rsi),%r10
+	andq	24(%rsi),%rdx
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	adcq	%r10,%r14
+	adcq	%rdx,%r15
+	adcq	$0,%rax
+
+	movq	%rax,%rdx
+	negq	%rax
+	orq	%rax,%rdx
+	sarq	$63,%rax
+
+	movq	%rdx,%r8
+	movq	%rdx,%r9
+	andq	0(%rsi),%r8
+	movq	%rdx,%r10
+	andq	8(%rsi),%r9
+	andq	16(%rsi),%r10
+	andq	24(%rsi),%rdx
+
+	xorq	%rax,%r8
+	xorq	%rcx,%rcx
+	xorq	%rax,%r9
+	subq	%rax,%rcx
+	xorq	%rax,%r10
+	xorq	%rax,%rdx
+	addq	%rcx,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%rdx
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	adcq	%r10,%r14
+	adcq	%rdx,%r15
+
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	leaq	1072(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-1072-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ct_inverse_mod_256,.-ct_inverse_mod_256
+.type	__smulq_512x63,@function
+.align	32
+__smulq_512x63:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%rbp
+
+	movq	%rdx,%rbx
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%rbp
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%rbp
+
+	mulq	%rbx
+	movq	%rax,0(%rdi)
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%r9,8(%rdi)
+	movq	%rdx,%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%r10,16(%rdi)
+	movq	%rdx,%r11
+	andq	%rbx,%rbp
+	negq	%rbp
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	%rdx,%rbp
+	movq	%r11,24(%rdi)
+
+	movq	40(%rsi),%r8
+	movq	48(%rsi),%r9
+	movq	56(%rsi),%r10
+	movq	64(%rsi),%r11
+	movq	72(%rsi),%r12
+	movq	80(%rsi),%r13
+	movq	88(%rsi),%r14
+	movq	96(%rsi),%r15
+
+	movq	%rcx,%rdx
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rcx
+	addq	%rax,%rcx
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	xorq	%rdx,%r14
+	xorq	%rdx,%r15
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+
+	mulq	%rcx
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rcx
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rcx
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rcx
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rcx
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	mulq	%rcx
+	addq	%rax,%r13
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	mulq	%rcx
+	addq	%rax,%r14
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	imulq	%rcx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+
+	movq	%rbp,%rbx
+	sarq	$63,%rbp
+
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	%rbx,%r12
+	adcq	%rbp,%r13
+	adcq	%rbp,%r14
+	adcq	%rbp,%r15
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_512x63,.-__smulq_512x63
+
+.type	__smulq_256x63,@function
+.align	32
+__smulq_256x63:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0+0(%rsi),%r8
+	movq	0+8(%rsi),%r9
+	movq	0+16(%rsi),%r10
+	movq	0+24(%rsi),%r11
+	movq	0+32(%rsi),%rbp
+
+	movq	%rdx,%rbx
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%rbp
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%rbp
+
+	mulq	%rbx
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	andq	%rbx,%rbp
+	negq	%rbp
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	%rdx,%rbp
+	movq	%rcx,%rdx
+	movq	40+0(%rsi),%r12
+	movq	40+8(%rsi),%r13
+	movq	40+16(%rsi),%r14
+	movq	40+24(%rsi),%r15
+	movq	40+32(%rsi),%rcx
+
+	movq	%rdx,%rbx
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	xorq	%rdx,%r14
+	xorq	%rdx,%r15
+	xorq	%rdx,%rcx
+	addq	%r12,%rax
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rcx
+
+	mulq	%rbx
+	movq	%rax,%r12
+	movq	%r13,%rax
+	movq	%rdx,%r13
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	andq	%rbx,%rcx
+	negq	%rcx
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	%rdx,%rcx
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	adcq	%rcx,%rbp
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%rbp,32(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_256x63,.-__smulq_256x63
+.type	__smulq_256_n_shift_by_31,@function
+.align	32
+__smulq_256_n_shift_by_31:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rdx,0(%rdi)
+	movq	%rcx,8(%rdi)
+	movq	%rdx,%rbp
+	movq	0+0(%rsi),%r8
+	movq	0+8(%rsi),%r9
+	movq	0+16(%rsi),%r10
+	movq	0+24(%rsi),%r11
+
+	movq	%rbp,%rbx
+	sarq	$63,%rbp
+	xorq	%rax,%rax
+	subq	%rbp,%rax
+
+	xorq	%rbp,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rbp,%r8
+	xorq	%rbp,%r9
+	xorq	%rbp,%r10
+	xorq	%rbp,%r11
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+
+	mulq	%rbx
+	movq	%rax,%r8
+	movq	%r9,%rax
+	andq	%rbx,%rbp
+	negq	%rbp
+	movq	%rdx,%r9
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbx
+	addq	%rax,%r11
+	adcq	%rdx,%rbp
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r14
+	movq	32+24(%rsi),%r15
+
+	movq	%rcx,%rbx
+	sarq	$63,%rcx
+	xorq	%rax,%rax
+	subq	%rcx,%rax
+
+	xorq	%rcx,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rcx,%r12
+	xorq	%rcx,%r13
+	xorq	%rcx,%r14
+	xorq	%rcx,%r15
+	addq	%r12,%rax
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+
+	mulq	%rbx
+	movq	%rax,%r12
+	movq	%r13,%rax
+	andq	%rbx,%rcx
+	negq	%rcx
+	movq	%rdx,%r13
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	mulq	%rbx
+	addq	%rax,%r15
+	adcq	%rdx,%rcx
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	adcq	%rcx,%rbp
+
+	movq	0(%rdi),%rdx
+	movq	8(%rdi),%rcx
+
+	shrdq	$31,%r9,%r8
+	shrdq	$31,%r10,%r9
+	shrdq	$31,%r11,%r10
+	shrdq	$31,%rbp,%r11
+
+	sarq	$63,%rbp
+	xorq	%rax,%rax
+	subq	%rbp,%rax
+
+	xorq	%rbp,%r8
+	xorq	%rbp,%r9
+	xorq	%rbp,%r10
+	xorq	%rbp,%r11
+	addq	%rax,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	xorq	%rbp,%rdx
+	xorq	%rbp,%rcx
+	addq	%rax,%rdx
+	addq	%rax,%rcx
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31
+.type	__ab_approximation_31_256,@function
+.align	32
+__ab_approximation_31_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	24(%rsi),%r9
+	movq	56(%rsi),%r11
+	movq	16(%rsi),%rbx
+	movq	48(%rsi),%rbp
+	movq	8(%rsi),%r8
+	movq	40(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	movq	0(%rsi),%r8
+	cmovzq	%r10,%rbp
+	movq	32(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	cmovzq	%r10,%rbp
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	bsrq	%rax,%rcx
+	leaq	1(%rcx),%rcx
+	cmovzq	%r8,%r9
+	cmovzq	%r10,%r11
+	cmovzq	%rax,%rcx
+	negq	%rcx
+
+
+	shldq	%cl,%rbx,%r9
+	shldq	%cl,%rbp,%r11
+
+	movl	$0x7FFFFFFF,%eax
+	andq	%rax,%r8
+	andq	%rax,%r10
+	notq	%rax
+	andq	%rax,%r9
+	andq	%rax,%r11
+	orq	%r9,%r8
+	orq	%r11,%r10
+
+	jmp	__inner_loop_31_256
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__ab_approximation_31_256,.-__ab_approximation_31_256
+.type	__inner_loop_31_256,@function
+.align	32
+__inner_loop_31_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	$0x7FFFFFFF80000000,%rcx
+	movq	$0x800000007FFFFFFF,%r13
+	movq	$0x7FFFFFFF7FFFFFFF,%r15
+
+.Loop_31_256:
+	cmpq	%r10,%r8
+	movq	%r8,%rax
+	movq	%r10,%rbx
+	movq	%rcx,%rbp
+	movq	%r13,%r14
+	cmovbq	%r10,%r8
+	cmovbq	%rax,%r10
+	cmovbq	%r13,%rcx
+	cmovbq	%rbp,%r13
+
+	subq	%r10,%r8
+	subq	%r13,%rcx
+	addq	%r15,%rcx
+
+	testq	$1,%rax
+	cmovzq	%rax,%r8
+	cmovzq	%rbx,%r10
+	cmovzq	%rbp,%rcx
+	cmovzq	%r14,%r13
+
+	shrq	$1,%r8
+	addq	%r13,%r13
+	subq	%r15,%r13
+	subl	$1,%edx
+	jnz	.Loop_31_256
+
+	shrq	$32,%r15
+	movl	%ecx,%edx
+	movl	%r13d,%r12d
+	shrq	$32,%rcx
+	shrq	$32,%r13
+	subq	%r15,%rdx
+	subq	%r15,%rcx
+	subq	%r15,%r12
+	subq	%r15,%r13
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_31_256,.-__inner_loop_31_256
+
+.type	__inner_loop_62_256,@function
+.align	32
+__inner_loop_62_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movl	%edx,%r15d
+	movq	$1,%rdx
+	xorq	%rcx,%rcx
+	xorq	%r12,%r12
+	movq	%rdx,%r13
+	movq	%rdx,%r14
+
+.Loop_62_256:
+	xorq	%rax,%rax
+	testq	%r14,%r8
+	movq	%r10,%rbx
+	cmovnzq	%r10,%rax
+	subq	%r8,%rbx
+	movq	%r8,%rbp
+	subq	%rax,%r8
+	cmovcq	%rbx,%r8
+	cmovcq	%rbp,%r10
+	movq	%rdx,%rax
+	cmovcq	%r12,%rdx
+	cmovcq	%rax,%r12
+	movq	%rcx,%rbx
+	cmovcq	%r13,%rcx
+	cmovcq	%rbx,%r13
+	xorq	%rax,%rax
+	xorq	%rbx,%rbx
+	shrq	$1,%r8
+	testq	%r14,%rbp
+	cmovnzq	%r12,%rax
+	cmovnzq	%r13,%rbx
+	addq	%r12,%r12
+	addq	%r13,%r13
+	subq	%rax,%rdx
+	subq	%rbx,%rcx
+	subl	$1,%r15d
+	jnz	.Loop_62_256
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_62_256,.-__inner_loop_62_256
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/ct_inverse_mod_384-armv8.S b/blst/elf/ct_inverse_mod_384-armv8.S
new file mode 100644
index 0000000..d7eca17
--- /dev/null
+++ b/blst/elf/ct_inverse_mod_384-armv8.S
@@ -0,0 +1,717 @@
+.text
+
+.globl	ct_inverse_mod_383
+.type	ct_inverse_mod_383, %function
+.align	5
+ct_inverse_mod_383:
+	.inst	0xd503233f
+	stp	x29, x30, [sp,#-128]!
+	add	x29, sp, #0
+	stp	x19, x20, [sp,#16]
+	stp	x21, x22, [sp,#32]
+	stp	x23, x24, [sp,#48]
+	stp	x25, x26, [sp,#64]
+	stp	x27, x28, [sp,#80]
+	sub	sp, sp, #1040
+
+	ldp	x22,   x4, [x1,#8*0]
+	ldp	x5, x6, [x1,#8*2]
+	ldp	x7, x8, [x1,#8*4]
+
+	add	x1, sp, #16+511	// find closest 512-byte-aligned spot
+	and	x1, x1, #-512	// in the frame...
+	stp	x0, x3, [sp]
+
+	ldp	x9, x10, [x2,#8*0]
+	ldp	x11, x12, [x2,#8*2]
+	ldp	x13, x14, [x2,#8*4]
+
+	stp	x22,   x4, [x1,#8*0]	// copy input to |a|
+	stp	x5, x6, [x1,#8*2]
+	stp	x7, x8, [x1,#8*4]
+	stp	x9, x10, [x1,#8*6]	// copy modulus to |b|
+	stp	x11, x12, [x1,#8*8]
+	stp	x13, x14, [x1,#8*10]
+
+	////////////////////////////////////////// first iteration
+	mov	x2, #62
+	bl	.Lab_approximation_62_loaded
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	str	x15,[x0,#8*12]		// initialize |u| with |f0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to dst |b|
+	bl	__smul_383_n_shift_by_62
+	str	x15, [x0,#8*12]		// initialize |v| with |f1|
+
+	////////////////////////////////////////// second iteration
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	ldr	x7, [x1,#8*12]	// |u|
+	ldr	x8, [x1,#8*18]	// |v|
+	mul	x3, x20, x7		// |u|*|f0|
+	smulh	x4, x20, x7
+	mul	x5, x21, x8		// |v|*|g0|
+	smulh	x6, x21, x8
+	adds	x3, x3, x5
+	adc	x4, x4, x6
+	stp	x3, x4, [x0,#8*6]
+	asr	x5, x4, #63		// sign extension
+	stp	x5, x5, [x0,#8*8]
+	stp	x5, x5, [x0,#8*10]
+
+	mul	x3, x15, x7		// |u|*|f1|
+	smulh	x4, x15, x7
+	mul	x5, x16, x8		// |v|*|g1|
+	smulh	x6, x16, x8
+	adds	x3, x3, x5
+	adc	x4, x4, x6
+	stp	x3, x4, [x0,#8*12]
+	asr	x5, x4, #63		// sign extension
+	stp	x5, x5, [x0,#8*14]
+	stp	x5, x5, [x0,#8*16]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	asr	x27, x27, #63		// sign extension
+	stp	x27, x27, [x0,#8*6]
+	stp	x27, x27, [x0,#8*8]
+	stp	x27, x27, [x0,#8*10]
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	bl	__ab_approximation_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	bl	__smul_383_n_shift_by_62
+	mov	x20, x15			// corrected |f0|
+	mov	x21, x16			// corrected |g0|
+
+	mov	x15, x17			// |f1|
+	mov	x16, x19			// |g1|
+	add	x0, x0, #8*6	// pointer to destination |b|
+	bl	__smul_383_n_shift_by_62
+
+	add	x0, x0, #8*6	// pointer to destination |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// corrected |f1|
+	mov	x21, x16			// corrected |g1|
+	add	x0, x0, #8*6	// pointer to destination |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	////////////////////////////////////////// iteration before last
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #62
+	//bl	__ab_approximation_62		// |a| and |b| are exact,
+	ldp	x3, x8, [x1,#8*0]	// just load
+	ldp	x9, x14, [x1,#8*6]
+	bl	__inner_loop_62
+
+	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
+	str	x3, [x0,#8*0]
+	str	x9, [x0,#8*6]
+
+	mov	x20, x15			// exact |f0|
+	mov	x21, x16			// exact |g0|
+	mov	x15, x17
+	mov	x16, x19
+	add	x0, x0, #8*12	// pointer to dst |u|
+	bl	__smul_383x63
+
+	mov	x20, x15			// exact |f1|
+	mov	x21, x16			// exact |g1|
+	add	x0, x0, #8*6	// pointer to dst |v|
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+
+	////////////////////////////////////////// last iteration
+	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
+	mov	x2, #22			// 766 % 62
+	//bl	__ab_approximation_62		// |a| and |b| are exact,
+	ldr	x3, [x1,#8*0]		// just load
+	eor	x8, x8, x8
+	ldr	x9, [x1,#8*6]
+	eor	x14, x14, x14
+	bl	__inner_loop_62
+
+	mov	x20, x17
+	mov	x21, x19
+	ldp	x0, x15, [sp]		// original out_ptr and n_ptr
+	bl	__smul_383x63
+	bl	__smul_767x63_tail
+	ldr	x30, [x29,#8]
+
+	asr	x22, x8, #63		// sign as mask
+	ldp	x9, x10, [x15,#8*0]
+	ldp	x11, x12, [x15,#8*2]
+	ldp	x13, x14, [x15,#8*4]
+
+	and	x9, x9, x22		// add mod<<384 conditionally
+	and	x10, x10, x22
+	adds	x3, x3, x9
+	and	x11, x11, x22
+	adcs	x4, x4, x10
+	and	x12, x12, x22
+	adcs	x5, x5, x11
+	and	x13, x13, x22
+	adcs	x6, x6, x12
+	and	x14, x14, x22
+	stp	x3, x4, [x0,#8*6]
+	adcs	x7, x7, x13
+	stp	x5, x6, [x0,#8*8]
+	adc	x8, x8, x14
+	stp	x7, x8, [x0,#8*10]
+
+	add	sp, sp, #1040
+	ldp	x19, x20, [x29,#16]
+	ldp	x21, x22, [x29,#32]
+	ldp	x23, x24, [x29,#48]
+	ldp	x25, x26, [x29,#64]
+	ldp	x27, x28, [x29,#80]
+	ldr	x29, [sp],#128
+	.inst	0xd50323bf
+	ret
+.size	ct_inverse_mod_383,.-ct_inverse_mod_383
+
+////////////////////////////////////////////////////////////////////////
+// see corresponding commentary in ctx_inverse_mod_384-x86_64...
+.type	__smul_383x63, %function
+.align	5
+__smul_383x63:
+	ldp	x3, x4, [x1,#8*0+96]	// load |u| (or |v|)
+	asr	x17, x20, #63		// |f_|'s sign as mask (or |g_|'s)
+	ldp	x5, x6, [x1,#8*2+96]
+	eor	x20, x20, x17		// conditionally negate |f_| (or |g_|)
+	ldp	x7, x8, [x1,#8*4+96]
+
+	eor	x3, x3, x17	// conditionally negate |u| (or |v|)
+	sub	x20, x20, x17
+	eor	x4, x4, x17
+	adds	x3, x3, x17, lsr#63
+	eor	x5, x5, x17
+	adcs	x4, x4, xzr
+	eor	x6, x6, x17
+	adcs	x5, x5, xzr
+	eor	x7, x7, x17
+	adcs	x6, x6, xzr
+	umulh	x22, x3, x20
+	eor	x8, x8, x17
+	umulh	x23, x4, x20
+	adcs	x7, x7, xzr
+	umulh	x24, x5, x20
+	adcs	x8, x8, xzr
+	umulh	x25, x6, x20
+	umulh	x26, x7, x20
+	mul	x3, x3, x20
+	mul	x4, x4, x20
+	mul	x5, x5, x20
+	adds	x4, x4, x22
+	mul	x6, x6, x20
+	adcs	x5, x5, x23
+	mul	x7, x7, x20
+	adcs	x6, x6, x24
+	mul	x27,x8, x20
+	adcs	x7, x7, x25
+	adcs	x27,x27,x26
+	adc	x2, xzr, xzr
+	ldp	x9, x10, [x1,#8*0+144]	// load |u| (or |v|)
+	asr	x17, x21, #63		// |f_|'s sign as mask (or |g_|'s)
+	ldp	x11, x12, [x1,#8*2+144]
+	eor	x21, x21, x17		// conditionally negate |f_| (or |g_|)
+	ldp	x13, x14, [x1,#8*4+144]
+
+	eor	x9, x9, x17	// conditionally negate |u| (or |v|)
+	sub	x21, x21, x17
+	eor	x10, x10, x17
+	adds	x9, x9, x17, lsr#63
+	eor	x11, x11, x17
+	adcs	x10, x10, xzr
+	eor	x12, x12, x17
+	adcs	x11, x11, xzr
+	eor	x13, x13, x17
+	adcs	x12, x12, xzr
+	umulh	x22, x9, x21
+	eor	x14, x14, x17
+	umulh	x23, x10, x21
+	adcs	x13, x13, xzr
+	umulh	x24, x11, x21
+	adcs	x14, x14, xzr
+	umulh	x25, x12, x21
+	adc	x19, xzr, xzr		// used in __smul_767x63_tail
+	umulh	x26, x13, x21
+	mul	x9, x9, x21
+	mul	x10, x10, x21
+	mul	x11, x11, x21
+	adds	x10, x10, x22
+	mul	x12, x12, x21
+	adcs	x11, x11, x23
+	mul	x13, x13, x21
+	adcs	x12, x12, x24
+	mul	x28,x14, x21
+	adcs	x13, x13, x25
+	adcs	x28,x28,x26
+	adc	x2, x2, xzr
+
+	adds	x3, x3, x9
+	adcs	x4, x4, x10
+	adcs	x5, x5, x11
+	adcs	x6, x6, x12
+	stp	x3, x4, [x0,#8*0]
+	adcs	x7, x7, x13
+	stp	x5, x6, [x0,#8*2]
+	adcs	x27,   x27,   x28
+	stp	x7, x27,   [x0,#8*4]
+	adc	x28,   x2,   xzr	// used in __smul_767x63_tail
+
+	ret
+.size	__smul_383x63,.-__smul_383x63
+
+.type	__smul_767x63_tail, %function
+.align	5
+__smul_767x63_tail:
+	smulh	x27,   x8, x20
+	ldp	x3, x4, [x1,#8*24]	// load rest of |v|
+	umulh	x14,x14, x21
+	ldp	x5, x6, [x1,#8*26]
+	ldp	x7, x8, [x1,#8*28]
+
+	eor	x3, x3, x17	// conditionally negate rest of |v|
+	eor	x4, x4, x17
+	eor	x5, x5, x17
+	adds	x3, x3, x19
+	eor	x6, x6, x17
+	adcs	x4, x4, xzr
+	eor	x7, x7, x17
+	adcs	x5, x5, xzr
+	eor	x8, x8, x17
+	adcs	x6, x6, xzr
+	umulh	x22, x3, x21
+	adcs	x7, x7, xzr
+	umulh	x23, x4, x21
+	adc	x8, x8, xzr
+
+	umulh	x24, x5, x21
+	add	x14, x14, x28
+	umulh	x25, x6, x21
+	asr	x28, x27, #63
+	umulh	x26, x7, x21
+	mul	x3, x3, x21
+	mul	x4, x4, x21
+	mul	x5, x5, x21
+	adds	x3, x3, x14
+	mul	x6, x6, x21
+	adcs	x4, x4, x22
+	mul	x7, x7, x21
+	adcs	x5, x5, x23
+	mul	x8, x8, x21
+	adcs	x6, x6, x24
+	adcs	x7, x7, x25
+	adc	x8, x8, x26
+
+	adds	x3, x3, x27
+	adcs	x4, x4, x28
+	adcs	x5, x5, x28
+	adcs	x6, x6, x28
+	stp	x3, x4, [x0,#8*6]
+	adcs	x7, x7, x28
+	stp	x5, x6, [x0,#8*8]
+	adc	x8, x8, x28
+	stp	x7, x8, [x0,#8*10]
+
+	ret
+.size	__smul_767x63_tail,.-__smul_767x63_tail
+
+.type	__smul_383_n_shift_by_62, %function
+.align	5
+__smul_383_n_shift_by_62:
+	ldp	x3, x4, [x1,#8*0+0]	// load |a| (or |b|)
+	asr	x28, x15, #63		// |f0|'s sign as mask (or |g0|'s)
+	ldp	x5, x6, [x1,#8*2+0]
+	eor	x2, x15, x28	// conditionally negate |f0| (or |g0|)
+	ldp	x7, x8, [x1,#8*4+0]
+
+	eor	x3, x3, x28	// conditionally negate |a| (or |b|)
+	sub	x2, x2, x28
+	eor	x4, x4, x28
+	adds	x3, x3, x28, lsr#63
+	eor	x5, x5, x28
+	adcs	x4, x4, xzr
+	eor	x6, x6, x28
+	adcs	x5, x5, xzr
+	eor	x7, x7, x28
+	umulh	x22, x3, x2
+	adcs	x6, x6, xzr
+	umulh	x23, x4, x2
+	eor	x8, x8, x28
+	umulh	x24, x5, x2
+	adcs	x7, x7, xzr
+	umulh	x25, x6, x2
+	adc	x8, x8, xzr
+
+	umulh	x26, x7, x2
+	smulh	x27, x8, x2
+	mul	x3, x3, x2
+	mul	x4, x4, x2
+	mul	x5, x5, x2
+	adds	x4, x4, x22
+	mul	x6, x6, x2
+	adcs	x5, x5, x23
+	mul	x7, x7, x2
+	adcs	x6, x6, x24
+	mul	x8, x8, x2
+	adcs	x7, x7, x25
+	adcs	x8, x8 ,x26
+	adc	x27, x27, xzr
+	ldp	x9, x10, [x1,#8*0+48]	// load |a| (or |b|)
+	asr	x28, x16, #63		// |f0|'s sign as mask (or |g0|'s)
+	ldp	x11, x12, [x1,#8*2+48]
+	eor	x2, x16, x28	// conditionally negate |f0| (or |g0|)
+	ldp	x13, x14, [x1,#8*4+48]
+
+	eor	x9, x9, x28	// conditionally negate |a| (or |b|)
+	sub	x2, x2, x28
+	eor	x10, x10, x28
+	adds	x9, x9, x28, lsr#63
+	eor	x11, x11, x28
+	adcs	x10, x10, xzr
+	eor	x12, x12, x28
+	adcs	x11, x11, xzr
+	eor	x13, x13, x28
+	umulh	x22, x9, x2
+	adcs	x12, x12, xzr
+	umulh	x23, x10, x2
+	eor	x14, x14, x28
+	umulh	x24, x11, x2
+	adcs	x13, x13, xzr
+	umulh	x25, x12, x2
+	adc	x14, x14, xzr
+
+	umulh	x26, x13, x2
+	smulh	x28, x14, x2
+	mul	x9, x9, x2
+	mul	x10, x10, x2
+	mul	x11, x11, x2
+	adds	x10, x10, x22
+	mul	x12, x12, x2
+	adcs	x11, x11, x23
+	mul	x13, x13, x2
+	adcs	x12, x12, x24
+	mul	x14, x14, x2
+	adcs	x13, x13, x25
+	adcs	x14, x14 ,x26
+	adc	x28, x28, xzr
+	adds	x3, x3, x9
+	adcs	x4, x4, x10
+	adcs	x5, x5, x11
+	adcs	x6, x6, x12
+	adcs	x7, x7, x13
+	adcs	x8, x8, x14
+	adc	x9, x27,   x28
+
+	extr	x3, x4, x3, #62
+	extr	x4, x5, x4, #62
+	extr	x5, x6, x5, #62
+	asr	x28, x9, #63
+	extr	x6, x7, x6, #62
+	extr	x7, x8, x7, #62
+	extr	x8, x9, x8, #62
+
+	eor	x3, x3, x28
+	eor	x4, x4, x28
+	adds	x3, x3, x28, lsr#63
+	eor	x5, x5, x28
+	adcs	x4, x4, xzr
+	eor	x6, x6, x28
+	adcs	x5, x5, xzr
+	eor	x7, x7, x28
+	adcs	x6, x6, xzr
+	eor	x8, x8, x28
+	stp	x3, x4, [x0,#8*0]
+	adcs	x7, x7, xzr
+	stp	x5, x6, [x0,#8*2]
+	adc	x8, x8, xzr
+	stp	x7, x8, [x0,#8*4]
+
+	eor	x15, x15, x28
+	eor	x16, x16, x28
+	sub	x15, x15, x28
+	sub	x16, x16, x28
+
+	ret
+.size	__smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
+.type	__ab_approximation_62, %function
+.align	4
+__ab_approximation_62:
+	ldp	x7, x8, [x1,#8*4]
+	ldp	x13, x14, [x1,#8*10]
+	ldp	x5, x6, [x1,#8*2]
+	ldp	x11, x12, [x1,#8*8]
+
+.Lab_approximation_62_loaded:
+	orr	x22, x8, x14	// check top-most limbs, ...
+	cmp	x22, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x6, ne
+	orr	x22, x8, x14	// ... ones before top-most, ...
+	csel	x13, x13, x12, ne
+
+	ldp	x3, x4, [x1,#8*0]
+	ldp	x9, x10, [x1,#8*6]
+
+	cmp	x22, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x5, ne
+	orr	x22, x8, x14	// ... and ones before that ...
+	csel	x13, x13, x11, ne
+
+	cmp	x22, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x4, ne
+	orr	x22, x8, x14
+	csel	x13, x13, x10, ne
+
+	clz	x22, x22
+	cmp	x22, #64
+	csel	x22, x22, xzr, ne
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	neg	x23, x22
+
+	lslv	x8, x8, x22	// align high limbs to the left
+	lslv	x14, x14, x22
+	lsrv	x7, x7, x23
+	lsrv	x13, x13, x23
+	and	x7, x7, x23, asr#6
+	and	x13, x13, x23, asr#6
+	orr	x8, x8, x7
+	orr	x14, x14, x13
+
+	b	__inner_loop_62
+	ret
+.size	__ab_approximation_62,.-__ab_approximation_62
+.type	__inner_loop_62, %function
+.align	4
+__inner_loop_62:
+	mov	x15, #1		// |f0|=1
+	mov	x16, #0		// |g0|=0
+	mov	x17, #0		// |f1|=0
+	mov	x19, #1		// |g1|=1
+
+.Loop_62:
+	sbfx	x28, x3, #0, #1	// if |a_| is odd, then we'll be subtracting
+	sub	x2, x2, #1
+	subs	x24, x9, x3	// |b_|-|a_|
+	and	x22, x9, x28
+	sbc	x25, x14, x8
+	and	x23, x14, x28
+	subs	x26, x3, x22	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	mov	x22, x15
+	sbcs	x27, x8, x23
+	mov	x23, x16
+	csel	x9, x9, x3, hs	// |b_| = |a_|
+	csel	x14, x14, x8, hs
+	csel	x3, x26, x24, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	x8, x27, x25, hs
+	csel	x15, x15, x17,       hs	// exchange |f0| and |f1|
+	csel	x17, x17, x22,     hs
+	csel	x16, x16, x19,       hs	// exchange |g0| and |g1|
+	csel	x19, x19, x23,     hs
+	extr	x3, x8, x3, #1
+	lsr	x8, x8, #1
+	and	x22, x17, x28
+	and	x23, x19, x28
+	add	x17, x17, x17		// |f1|<<=1
+	add	x19, x19, x19		// |g1|<<=1
+	sub	x15, x15, x22		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	sub	x16, x16, x23		// |g0|-=|g1| (or |g0-=0| ...)
+	cbnz	x2, .Loop_62
+
+	ret
+.size	__inner_loop_62,.-__inner_loop_62
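Editor's note: __inner_loop_62 above is the branch-free core of an extended binary-GCD ("divstep"-style) iteration: for 62 rounds it conditionally subtracts and halves approximations of |a| and |b| while accumulating the transition factors |f0|, |g0|, |f1|, |g1|. As a reading aid, here is a rough, non-constant-time C model of the same update rule, reduced to single 64-bit values; the function name and the simplifications are mine, not blst's.

    #include <stdint.h>

    /* Rough model of n "divstep"-like rounds: reduce a (with b as the
     * conditional subtrahend) while tracking factors so that the updated
     * values stay integer combinations of the original a and b.          */
    static void inner_loop_model(uint64_t a, uint64_t b, int n,
                                 int64_t *f0, int64_t *g0,
                                 int64_t *f1, int64_t *g1)
    {
        int64_t F0 = 1, G0 = 0, F1 = 0, G1 = 1;

        while (n--) {
            if (a & 1) {                   /* a odd: subtract the smaller value  */
                if (a < b) {               /* swap roles so that a >= b, as the
                                              hs-conditional csel chain does     */
                    uint64_t t  = a;  a  = b;  b  = t;
                    int64_t  tf = F0; F0 = F1; F1 = tf;
                    int64_t  tg = G0; G0 = G1; G1 = tg;
                }
                a  -= b;
                F0 -= F1;
                G0 -= G1;
            }
            a  >>= 1;                      /* a is even now, halve it            */
            F1 <<= 1;                      /* keep the factors consistent        */
            G1 <<= 1;
        }
        *f0 = F0; *g0 = G0; *f1 = F1; *g1 = G1;
    }

The real routine performs the same selections with sign masks and csel so that neither the instruction stream nor the memory access pattern depends on secret data.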
diff --git a/blst/elf/ct_is_square_mod_384-armv8.S b/blst/elf/ct_is_square_mod_384-armv8.S
new file mode 100644
index 0000000..ce670b7
--- /dev/null
+++ b/blst/elf/ct_is_square_mod_384-armv8.S
@@ -0,0 +1,324 @@
+.text
+
+.globl	ct_is_square_mod_384
+.type	ct_is_square_mod_384, %function
+.align	5
+ct_is_square_mod_384:
+	.inst	0xd503233f
+	stp	x29, x30, [sp,#-128]!
+	add	x29, sp, #0
+	stp	x19, x20, [sp,#16]
+	stp	x21, x22, [sp,#32]
+	stp	x23, x24, [sp,#48]
+	stp	x25, x26, [sp,#64]
+	stp	x27, x28, [sp,#80]
+	sub	sp, sp, #512
+
+	ldp	x3, x4, [x0,#8*0]		// load input
+	ldp	x5, x6, [x0,#8*2]
+	ldp	x7, x8, [x0,#8*4]
+
+	add	x0, sp, #255	// find closest 256-byte-aligned spot
+	and	x0, x0, #-256	// in the frame...
+
+	ldp	x9, x10, [x1,#8*0]		// load modulus
+	ldp	x11, x12, [x1,#8*2]
+	ldp	x13, x14, [x1,#8*4]
+
+	stp	x3, x4, [x0,#8*6]	// copy input to |a|
+	stp	x5, x6, [x0,#8*8]
+	stp	x7, x8, [x0,#8*10]
+	stp	x9, x10, [x0,#8*0]	// copy modulus to |b|
+	stp	x11, x12, [x0,#8*2]
+	stp	x13, x14, [x0,#8*4]
+
+	eor	x2, x2, x2			// init the .Legendre symbol
+	mov	x15, #24			// 24 is 768/30-1
+	b	.Loop_is_square
+
+.align	4
+.Loop_is_square:
+	bl	__ab_approximation_30
+	sub	x15, x15, #1
+
+	eor	x1, x0, #128		// pointer to dst |b|
+	bl	__smul_384_n_shift_by_30
+
+	mov	x19, x16			// |f0|
+	mov	x20, x17			// |g0|
+	add	x1, x1, #8*6	// pointer to dst |a|
+	bl	__smul_384_n_shift_by_30
+
+	ldp	x9, x10, [x1,#-8*6]
+	eor	x0, x0, #128		// flip-flop src |a|b|
+	and	x27, x27, x9		// if |a| was negative,
+	add	x2, x2, x27, lsr#1		// adjust |L|
+
+	cbnz	x15, .Loop_is_square
+
+	////////////////////////////////////////// last iteration
+	//bl	__ab_approximation_30		// |a| and |b| are exact,
+	//ldr	x8, [x0,#8*6]		// just load
+	mov	x14, x9			// ldr	x14, [x0,#8*0]
+	mov	x15, #48			// 48 is 768%30 + 30
+	bl	__inner_loop_48
+	ldr	x30, [x29,#8]
+
+	and	x0, x2, #1
+	eor	x0, x0, #1
+
+	add	sp, sp, #512
+	ldp	x19, x20, [x29,#16]
+	ldp	x21, x22, [x29,#32]
+	ldp	x23, x24, [x29,#48]
+	ldp	x25, x26, [x29,#64]
+	ldp	x27, x28, [x29,#80]
+	ldr	x29, [sp],#128
+	.inst	0xd50323bf
+	ret
+.size	ct_is_square_mod_384,.-ct_is_square_mod_384
+
+.type	__smul_384_n_shift_by_30, %function
+.align	5
+__smul_384_n_shift_by_30:
+	ldp	x3, x4, [x0,#8*0+0]	// load |b| (or |a|)
+	asr	x27, x20, #63		// |g1|'s sign as mask (or |f1|'s)
+	ldp	x5, x6, [x0,#8*2+0]
+	eor	x20, x20, x27		// conditionally negate |g1| (or |f1|)
+	ldp	x7, x8, [x0,#8*4+0]
+
+	eor	x3, x3, x27	// conditionally negate |b| (or |a|)
+	sub	x20, x20, x27
+	eor	x4, x4, x27
+	adds	x3, x3, x27, lsr#63
+	eor	x5, x5, x27
+	adcs	x4, x4, xzr
+	eor	x6, x6, x27
+	adcs	x5, x5, xzr
+	eor	x7, x7, x27
+	umulh	x21, x3, x20
+	adcs	x6, x6, xzr
+	umulh	x22, x4, x20
+	eor	x8, x8, x27
+	umulh	x23, x5, x20
+	adcs	x7, x7, xzr
+	umulh	x24, x6, x20
+	adc	x8, x8, xzr
+
+	umulh	x25, x7, x20
+	and	x28, x20, x27
+	umulh	x26, x8, x20
+	neg	x28, x28
+	mul	x3, x3, x20
+	mul	x4, x4, x20
+	mul	x5, x5, x20
+	adds	x4, x4, x21
+	mul	x6, x6, x20
+	adcs	x5, x5, x22
+	mul	x7, x7, x20
+	adcs	x6, x6, x23
+	mul	x8, x8, x20
+	adcs	x7, x7, x24
+	adcs	x8, x8 ,x25
+	adc	x26, x26, x28
+	ldp	x9, x10, [x0,#8*0+48]	// load |b| (or |a|)
+	asr	x27, x19, #63		// |g1|'s sign as mask (or |f1|'s)
+	ldp	x11, x12, [x0,#8*2+48]
+	eor	x19, x19, x27		// conditionally negate |g1| (or |f1|)
+	ldp	x13, x14, [x0,#8*4+48]
+
+	eor	x9, x9, x27	// conditionally negate |b| (or |a|)
+	sub	x19, x19, x27
+	eor	x10, x10, x27
+	adds	x9, x9, x27, lsr#63
+	eor	x11, x11, x27
+	adcs	x10, x10, xzr
+	eor	x12, x12, x27
+	adcs	x11, x11, xzr
+	eor	x13, x13, x27
+	umulh	x21, x9, x19
+	adcs	x12, x12, xzr
+	umulh	x22, x10, x19
+	eor	x14, x14, x27
+	umulh	x23, x11, x19
+	adcs	x13, x13, xzr
+	umulh	x24, x12, x19
+	adc	x14, x14, xzr
+
+	umulh	x25, x13, x19
+	and	x28, x19, x27
+	umulh	x27, x14, x19
+	neg	x28, x28
+	mul	x9, x9, x19
+	mul	x10, x10, x19
+	mul	x11, x11, x19
+	adds	x10, x10, x21
+	mul	x12, x12, x19
+	adcs	x11, x11, x22
+	mul	x13, x13, x19
+	adcs	x12, x12, x23
+	mul	x14, x14, x19
+	adcs	x13, x13, x24
+	adcs	x14, x14 ,x25
+	adc	x27, x27, x28
+	adds	x3, x3, x9
+	adcs	x4, x4, x10
+	adcs	x5, x5, x11
+	adcs	x6, x6, x12
+	adcs	x7, x7, x13
+	adcs	x8, x8, x14
+	adc	x9, x26,   x27
+
+	extr	x3, x4, x3, #30
+	extr	x4, x5, x4, #30
+	extr	x5, x6, x5, #30
+	asr	x27, x9, #63
+	extr	x6, x7, x6, #30
+	extr	x7, x8, x7, #30
+	extr	x8, x9, x8, #30
+
+	eor	x3, x3, x27
+	eor	x4, x4, x27
+	adds	x3, x3, x27, lsr#63
+	eor	x5, x5, x27
+	adcs	x4, x4, xzr
+	eor	x6, x6, x27
+	adcs	x5, x5, xzr
+	eor	x7, x7, x27
+	adcs	x6, x6, xzr
+	eor	x8, x8, x27
+	stp	x3, x4, [x1,#8*0]
+	adcs	x7, x7, xzr
+	stp	x5, x6, [x1,#8*2]
+	adc	x8, x8, xzr
+	stp	x7, x8, [x1,#8*4]
+
+	ret
+.size	__smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
+.type	__ab_approximation_30, %function
+.align	4
+__ab_approximation_30:
+	ldp	x13, x14, [x0,#8*4]	// |a| is still in registers
+	ldp	x11, x12, [x0,#8*2]
+
+	orr	x21, x8, x14	// check top-most limbs, ...
+	cmp	x21, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x6, ne
+	orr	x21, x8, x14	// ... ones before top-most, ...
+	csel	x13, x13, x12, ne
+
+	cmp	x21, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x5, ne
+	orr	x21, x8, x14	// ... and ones before that ...
+	csel	x13, x13, x11, ne
+
+	cmp	x21, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x4, ne
+	orr	x21, x8, x14	// and one more, ...
+	csel	x13, x13, x10, ne
+
+	cmp	x21, #0
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	csel	x7, x7, x3, ne
+	orr	x21, x8, x14
+	csel	x13, x13, x9, ne
+
+	clz	x21, x21
+	cmp	x21, #64
+	csel	x21, x21, xzr, ne
+	csel	x8, x8, x7, ne
+	csel	x14, x14, x13, ne
+	neg	x22, x21
+
+	lslv	x8, x8, x21	// align high limbs to the left
+	lslv	x14, x14, x21
+	lsrv	x7, x7, x22
+	lsrv	x13, x13, x22
+	and	x7, x7, x22, asr#6
+	and	x13, x13, x22, asr#6
+	orr	x8, x8, x7
+	orr	x14, x14, x13
+
+	bfxil	x8, x3, #0, #32
+	bfxil	x14, x9, #0, #32
+
+	b	__inner_loop_30
+	ret
+.size	__ab_approximation_30,.-__ab_approximation_30
+
+.type	__inner_loop_30, %function
+.align	4
+__inner_loop_30:
+	mov	x28, #30
+	mov	x17, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
+	mov	x20, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
+	mov	x27,#0x7FFFFFFF7FFFFFFF
+
+.Loop_30:
+	sbfx	x24, x8, #0, #1	// if |a_| is odd, then we'll be subtracting
+	and	x25, x8, x14
+	sub	x28, x28, #1
+	and	x21, x14, x24
+
+	sub	x22, x14, x8		// |b_|-|a_|
+	subs	x23, x8, x21	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	add	x25, x2, x25, lsr#1	// L + (a_ & b_) >> 1
+	mov	x21, x20
+	csel	x14, x14, x8, hs	// |b_| = |a_|
+	csel	x8, x23, x22, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	x20, x20, x17,  hs	// exchange |fg0| and |fg1|
+	csel	x17, x17, x21, hs
+	csel	x2,   x2,   x25, hs
+	lsr	x8, x8, #1
+	and	x21, x20, x24
+	and	x22, x27, x24
+	add	x23, x14, #2
+	sub	x17, x17, x21	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
+	add	x20, x20, x20	// |f1|<<=1
+	add	x2, x2, x23, lsr#2	// "negate" |L| if |b|%8 is 3 or 5
+	add	x17, x17, x22
+	sub	x20, x20, x27
+
+	cbnz	x28, .Loop_30
+
+	mov	x27, #0x7FFFFFFF
+	ubfx	x16, x17, #0, #32
+	ubfx	x17, x17, #32, #32
+	ubfx	x19, x20, #0, #32
+	ubfx	x20, x20, #32, #32
+	sub	x16, x16, x27		// remove the bias
+	sub	x17, x17, x27
+	sub	x19, x19, x27
+	sub	x20, x20, x27
+
+	ret
+.size	__inner_loop_30,.-__inner_loop_30
+.type	__inner_loop_48, %function
+.align	4
+__inner_loop_48:
+.Loop_48:
+	sbfx	x24, x8, #0, #1	// if |a_| is odd, then we'll be subtracting
+	and	x25, x8, x14
+	sub	x15, x15, #1
+	and	x21, x14, x24
+	sub	x22, x14, x8		// |b_|-|a_|
+	subs	x23, x8, x21	// |a_|-|b_| (or |a_|-0 if |a_| was even)
+	add	x25, x2, x25, lsr#1
+	csel	x14, x14, x8, hs	// |b_| = |a_|
+	csel	x8, x23, x22, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
+	csel	x2,   x2,   x25, hs
+	add	x23, x14, #2
+	lsr	x8, x8, #1
+	add	x2, x2, x23, lsr#2	// "negate" |L| if |b|%8 is 3 or 5
+
+	cbnz	x15, .Loop_48
+
+	ret
+.size	__inner_loop_48,.-__inner_loop_48
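Editor's note: ct_is_square_mod_384 above is, in effect, a constant-time Jacobi/Legendre symbol computation. Its comments spell out the sign rules: the low bit of |L| is flipped whenever |b| mod 8 is 3 or 5 before a halving, and (via the (a&b)>>1 term) whenever both operands are 3 mod 4 at a swap, which are exactly the rules of the binary Jacobi algorithm. For orientation only, here is the textbook, variable-time version of that algorithm in C; it is a reference sketch, not the code this assembly was generated from.

    #include <stdint.h>

    /* Textbook binary Jacobi symbol (a/n) for odd n > 0; returns -1, 0 or +1. */
    static int jacobi(uint64_t a, uint64_t n)
    {
        int t = 1;
        a %= n;
        while (a != 0) {
            while ((a & 1) == 0) {            /* pull out factors of two          */
                a >>= 1;
                uint64_t r = n & 7;
                if (r == 3 || r == 5)         /* (2/n) = -1 when n = +/-3 mod 8   */
                    t = -t;
            }
            uint64_t tmp = a; a = n; n = tmp; /* quadratic reciprocity swap       */
            if ((a & 3) == 3 && (n & 3) == 3)
                t = -t;
            a %= n;
        }
        return (n == 1) ? t : 0;
    }

The assembly keeps the running sign in the low bit of a counter (x2 here, %rbp in the x86_64 flavour) rather than as +/-1, and the final and/eor pair converts that bit into the boolean "is a square" result.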
diff --git a/blst/elf/ct_is_square_mod_384-x86_64.s b/blst/elf/ct_is_square_mod_384-x86_64.s
new file mode 100644
index 0000000..fec1493
--- /dev/null
+++ b/blst/elf/ct_is_square_mod_384-x86_64.s
@@ -0,0 +1,479 @@
+.text	
+
+.globl	ct_is_square_mod_384
+.type	ct_is_square_mod_384,@function
+.align	32
+ct_is_square_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$536,%rsp
+.cfi_adjust_cfa_offset	536
+
+
+	leaq	24+255(%rsp),%rax
+	andq	$-256,%rax
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rcx
+	movq	32(%rsi),%rdx
+	movq	40(%rsi),%rdi
+	movq	%rax,%rsi
+
+	movq	%r8,0(%rax)
+	movq	%r9,8(%rax)
+	movq	%r10,16(%rax)
+	movq	%r11,24(%rax)
+	movq	%r12,32(%rax)
+	movq	%r13,40(%rax)
+
+	movq	%r14,48(%rax)
+	movq	%r15,56(%rax)
+	movq	%rbx,64(%rax)
+	movq	%rcx,72(%rax)
+	movq	%rdx,80(%rax)
+	movq	%rdi,88(%rax)
+
+	xorq	%rbp,%rbp
+	movl	$24,%ecx
+	jmp	.Loop_is_square
+
+.align	32
+.Loop_is_square:
+	movl	%ecx,16(%rsp)
+
+	call	__ab_approximation_30
+	movq	%rax,0(%rsp)
+	movq	%rbx,8(%rsp)
+
+	movq	$128+48,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_384_n_shift_by_30
+
+	movq	0(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	leaq	-48(%rdi),%rdi
+	call	__smulq_384_n_shift_by_30
+
+	movl	16(%rsp),%ecx
+	xorq	$128,%rsi
+
+	andq	48(%rdi),%r14
+	shrq	$1,%r14
+	addq	%r14,%rbp
+
+	subl	$1,%ecx
+	jnz	.Loop_is_square
+
+
+
+
+	movq	48(%rsi),%r9
+	call	__inner_loop_48
+
+	movq	$1,%rax
+	andq	%rbp,%rax
+	xorq	$1,%rax
+
+	leaq	536(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-536-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ct_is_square_mod_384,.-ct_is_square_mod_384
+
+.type	__smulq_384_n_shift_by_30,@function
+.align	32
+__smulq_384_n_shift_by_30:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbx
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	movq	%rdx,%r14
+	andq	%rbx,%r14
+	mulq	%rbx
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	negq	%r14
+	mulq	%rbx
+	addq	%rax,%r13
+	adcq	%rdx,%r14
+	leaq	48(%rsi),%rsi
+	movq	%rcx,%rdx
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbx
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbx
+	addq	%rax,%rbx
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	movq	%rdx,%r15
+	andq	%rbx,%r15
+	mulq	%rbx
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	negq	%r15
+	mulq	%rbx
+	addq	%rax,%r13
+	adcq	%rdx,%r15
+	leaq	-48(%rsi),%rsi
+
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	%r15,%r14
+
+	shrdq	$30,%r9,%r8
+	shrdq	$30,%r10,%r9
+	shrdq	$30,%r11,%r10
+	shrdq	$30,%r12,%r11
+	shrdq	$30,%r13,%r12
+	shrdq	$30,%r14,%r13
+
+	sarq	$63,%r14
+	xorq	%rbx,%rbx
+	subq	%r14,%rbx
+
+	xorq	%r14,%r8
+	xorq	%r14,%r9
+	xorq	%r14,%r10
+	xorq	%r14,%r11
+	xorq	%r14,%r12
+	xorq	%r14,%r13
+	addq	%rbx,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
+.type	__ab_approximation_30,@function
+.align	32
+__ab_approximation_30:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	88(%rsi),%rbx
+	movq	80(%rsi),%r15
+	movq	72(%rsi),%r14
+
+	movq	%r13,%rax
+	orq	%rbx,%rax
+	cmovzq	%r12,%r13
+	cmovzq	%r15,%rbx
+	cmovzq	%r11,%r12
+	movq	64(%rsi),%r11
+	cmovzq	%r14,%r15
+
+	movq	%r13,%rax
+	orq	%rbx,%rax
+	cmovzq	%r12,%r13
+	cmovzq	%r15,%rbx
+	cmovzq	%r10,%r12
+	movq	56(%rsi),%r10
+	cmovzq	%r11,%r15
+
+	movq	%r13,%rax
+	orq	%rbx,%rax
+	cmovzq	%r12,%r13
+	cmovzq	%r15,%rbx
+	cmovzq	%r9,%r12
+	movq	48(%rsi),%r9
+	cmovzq	%r10,%r15
+
+	movq	%r13,%rax
+	orq	%rbx,%rax
+	cmovzq	%r12,%r13
+	cmovzq	%r15,%rbx
+	cmovzq	%r8,%r12
+	cmovzq	%r9,%r15
+
+	movq	%r13,%rax
+	orq	%rbx,%rax
+	bsrq	%rax,%rcx
+	leaq	1(%rcx),%rcx
+	cmovzq	%r8,%r13
+	cmovzq	%r9,%rbx
+	cmovzq	%rax,%rcx
+	negq	%rcx
+
+
+	shldq	%cl,%r12,%r13
+	shldq	%cl,%r15,%rbx
+
+	movq	$0xFFFFFFFF00000000,%rax
+	movl	%r8d,%r8d
+	movl	%r9d,%r9d
+	andq	%rax,%r13
+	andq	%rax,%rbx
+	orq	%r13,%r8
+	orq	%rbx,%r9
+
+	jmp	__inner_loop_30
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__ab_approximation_30,.-__ab_approximation_30
+.type	__inner_loop_30,@function
+.align	32
+__inner_loop_30:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	$0x7FFFFFFF80000000,%rbx
+	movq	$0x800000007FFFFFFF,%rcx
+	leaq	-1(%rbx),%r15
+	movl	$30,%edi
+
+.Loop_30:
+	movq	%r8,%rax
+	andq	%r9,%rax
+	shrq	$1,%rax
+
+	cmpq	%r9,%r8
+	movq	%r8,%r10
+	movq	%r9,%r11
+	leaq	(%rax,%rbp,1),%rax
+	movq	%rbx,%r12
+	movq	%rcx,%r13
+	movq	%rbp,%r14
+	cmovbq	%r9,%r8
+	cmovbq	%r10,%r9
+	cmovbq	%rcx,%rbx
+	cmovbq	%r12,%rcx
+	cmovbq	%rax,%rbp
+
+	subq	%r9,%r8
+	subq	%rcx,%rbx
+	addq	%r15,%rbx
+
+	testq	$1,%r10
+	cmovzq	%r10,%r8
+	cmovzq	%r11,%r9
+	cmovzq	%r12,%rbx
+	cmovzq	%r13,%rcx
+	cmovzq	%r14,%rbp
+
+	leaq	2(%r9),%rax
+	shrq	$1,%r8
+	shrq	$2,%rax
+	addq	%rcx,%rcx
+	leaq	(%rax,%rbp,1),%rbp
+	subq	%r15,%rcx
+
+	subl	$1,%edi
+	jnz	.Loop_30
+
+	shrq	$32,%r15
+	movl	%ebx,%eax
+	shrq	$32,%rbx
+	movl	%ecx,%edx
+	shrq	$32,%rcx
+	subq	%r15,%rax
+	subq	%r15,%rbx
+	subq	%r15,%rdx
+	subq	%r15,%rcx
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_30,.-__inner_loop_30
+
+.type	__inner_loop_48,@function
+.align	32
+__inner_loop_48:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movl	$48,%edi
+
+.Loop_48:
+	movq	%r8,%rax
+	andq	%r9,%rax
+	shrq	$1,%rax
+
+	cmpq	%r9,%r8
+	movq	%r8,%r10
+	movq	%r9,%r11
+	leaq	(%rax,%rbp,1),%rax
+	movq	%rbp,%r12
+	cmovbq	%r9,%r8
+	cmovbq	%r10,%r9
+	cmovbq	%rax,%rbp
+
+	subq	%r9,%r8
+
+	testq	$1,%r10
+	cmovzq	%r10,%r8
+	cmovzq	%r11,%r9
+	cmovzq	%r12,%rbp
+
+	leaq	2(%r9),%rax
+	shrq	$1,%r8
+	shrq	$2,%rax
+	addq	%rax,%rbp
+
+	subl	$1,%edi
+	jnz	.Loop_48
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_48,.-__inner_loop_48
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
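Editor's note: a pattern that recurs throughout the __smul* helpers in these files (both the ARMv8 eor/adds sequences and the x86_64 xorq/addq sequences) is branch-free conditional negation: the factor's sign is smeared into an all-ones or all-zero mask, every limb is XORed with the mask, and the "+1" of two's complement is re-injected through the carry chain. A minimal C sketch of that idiom, using GCC/Clang's unsigned __int128 for the carry (the function name is illustrative):

    #include <stdint.h>

    /* Negate the 6-limb value x in place when "sign" has its top bit set,
     * without branching: mask is 0 or all-ones, and x becomes (x ^ mask) - mask. */
    static void cneg_384(uint64_t x[6], uint64_t sign)
    {
        uint64_t mask = (uint64_t)((int64_t)sign >> 63);   /* 0 or 0xffff...ffff   */
        unsigned __int128 acc = mask & 1;                  /* two's-complement "+1" */

        for (int i = 0; i < 6; i++) {
            acc += x[i] ^ mask;                            /* conditional complement */
            x[i] = (uint64_t)acc;
            acc >>= 64;                                    /* propagate the carry    */
        }
    }

In the assembly the same mask also conditionally negates the factor itself, so the unsigned multiplication that follows produces the correct signed product without any data-dependent branch.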
diff --git a/blst/elf/ctq_inverse_mod_384-x86_64.s b/blst/elf/ctq_inverse_mod_384-x86_64.s
new file mode 100644
index 0000000..b702262
--- /dev/null
+++ b/blst/elf/ctq_inverse_mod_384-x86_64.s
@@ -0,0 +1,1195 @@
+.text	
+
+.globl	ct_inverse_mod_383
+.type	ct_inverse_mod_383,@function
+.align	32
+ct_inverse_mod_383:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$1112,%rsp
+.cfi_adjust_cfa_offset	1112
+
+
+	leaq	88+511(%rsp),%rax
+	andq	$-512,%rax
+	movq	%rdi,32(%rsp)
+	movq	%rcx,40(%rsp)
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	0(%rdx),%r14
+	movq	8(%rdx),%r15
+	movq	16(%rdx),%rbx
+	movq	24(%rdx),%rbp
+	movq	32(%rdx),%rsi
+	movq	40(%rdx),%rdi
+
+	movq	%r8,0(%rax)
+	movq	%r9,8(%rax)
+	movq	%r10,16(%rax)
+	movq	%r11,24(%rax)
+	movq	%r12,32(%rax)
+	movq	%r13,40(%rax)
+
+	movq	%r14,48(%rax)
+	movq	%r15,56(%rax)
+	movq	%rbx,64(%rax)
+	movq	%rbp,72(%rax)
+	movq	%rsi,80(%rax)
+	movq	%rax,%rsi
+	movq	%rdi,88(%rax)
+
+
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+
+
+	movq	%rdx,96(%rdi)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+
+
+	movq	%rdx,96(%rdi)
+
+
+	xorq	$256,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+
+
+
+	movq	96(%rsi),%rax
+	movq	144(%rsi),%r11
+	movq	%rdx,%rbx
+	movq	%rax,%r10
+	imulq	56(%rsp)
+	movq	%rax,%r8
+	movq	%r11,%rax
+	movq	%rdx,%r9
+	imulq	64(%rsp)
+	addq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r8,48(%rdi)
+	movq	%r9,56(%rdi)
+	sarq	$63,%r9
+	movq	%r9,64(%rdi)
+	movq	%r9,72(%rdi)
+	movq	%r9,80(%rdi)
+	movq	%r9,88(%rdi)
+	leaq	96(%rsi),%rsi
+
+	movq	%r10,%rax
+	imulq	%rbx
+	movq	%rax,%r8
+	movq	%r11,%rax
+	movq	%rdx,%r9
+	imulq	%rcx
+	addq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r8,96(%rdi)
+	movq	%r9,104(%rdi)
+	sarq	$63,%r9
+	movq	%r9,112(%rdi)
+	movq	%r9,120(%rdi)
+	movq	%r9,128(%rdi)
+	movq	%r9,136(%rdi)
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+	sarq	$63,%r13
+	movq	%r13,48(%rdi)
+	movq	%r13,56(%rdi)
+	movq	%r13,64(%rdi)
+	movq	%r13,72(%rdi)
+	movq	%r13,80(%rdi)
+	movq	%r13,88(%rdi)
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_767x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_767x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_767x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_767x63
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+	call	__ab_approximation_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_383_n_shift_by_62
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_767x63
+
+	xorq	$256+96,%rsi
+	movl	$62,%edi
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	48(%rsi),%r10
+	movq	56(%rsi),%r11
+	call	__inner_loop_62
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	movq	%r8,0(%rdi)
+	movq	%r10,48(%rdi)
+
+
+
+	leaq	96(%rsi),%rsi
+	leaq	96(%rdi),%rdi
+	call	__smulq_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulq_767x63
+
+
+	xorq	$256+96,%rsi
+	movl	$22,%edi
+
+	movq	0(%rsi),%r8
+	xorq	%r9,%r9
+	movq	48(%rsi),%r10
+	xorq	%r11,%r11
+	call	__inner_loop_62
+
+
+
+
+
+
+
+	leaq	96(%rsi),%rsi
+
+
+
+
+
+	movq	%r12,%rdx
+	movq	%r13,%rcx
+	movq	32(%rsp),%rdi
+	call	__smulq_767x63
+
+	movq	40(%rsp),%rsi
+	movq	%rax,%rdx
+	sarq	$63,%rax
+
+	movq	%rax,%r8
+	movq	%rax,%r9
+	movq	%rax,%r10
+	andq	0(%rsi),%r8
+	andq	8(%rsi),%r9
+	movq	%rax,%r11
+	andq	16(%rsi),%r10
+	andq	24(%rsi),%r11
+	movq	%rax,%r12
+	andq	32(%rsi),%r12
+	andq	40(%rsi),%rax
+
+	addq	%r8,%r14
+	adcq	%r9,%r15
+	adcq	%r10,%rbx
+	adcq	%r11,%rbp
+	adcq	%r12,%rcx
+	adcq	%rax,%rdx
+
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	movq	%rbx,64(%rdi)
+	movq	%rbp,72(%rdi)
+	movq	%rcx,80(%rdi)
+	movq	%rdx,88(%rdi)
+
+	leaq	1112(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-1112-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ct_inverse_mod_383,.-ct_inverse_mod_383
+.type	__smulq_767x63,@function
+.align	32
+__smulq_767x63:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	movq	%rdi,8(%rsp)
+	movq	%rsi,16(%rsp)
+	leaq	48(%rsi),%rsi
+
+	xorq	%rdx,%rbp
+	addq	%rax,%rbp
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulq	%rbp
+	movq	%rax,0(%rdi)
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbp
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	%r9,8(%rdi)
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	movq	%r10,16(%rdi)
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%r11,24(%rdi)
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	movq	%r12,32(%rdi)
+	imulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,40(%rdi)
+	movq	%rdx,48(%rdi)
+	sarq	$63,%rdx
+	movq	%rdx,56(%rdi)
+	movq	%rcx,%rdx
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+	movq	56(%rsi),%r15
+	movq	64(%rsi),%rbx
+	movq	72(%rsi),%rbp
+	movq	80(%rsi),%rcx
+	movq	88(%rsi),%rdi
+
+	movq	%rdx,%rsi
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rsi
+	addq	%rax,%rsi
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	xorq	%rdx,%r14
+	xorq	%rdx,%r15
+	xorq	%rdx,%rbx
+	xorq	%rdx,%rbp
+	xorq	%rdx,%rcx
+	xorq	%rdx,%rdi
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+	adcq	$0,%rcx
+	adcq	$0,%rdi
+
+	mulq	%rsi
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rsi
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rsi
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rsi
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rsi
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	mulq	%rsi
+	addq	%rax,%r13
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	mulq	%rsi
+	addq	%rax,%r14
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+	mulq	%rsi
+	addq	%rax,%r15
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+	mulq	%rsi
+	addq	%rax,%rbx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+	mulq	%rsi
+	addq	%rax,%rbp
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+	mulq	%rsi
+	addq	%rax,%rcx
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+	movq	8(%rsp),%rdx
+	imulq	%rsi,%rax
+	movq	16(%rsp),%rsi
+	addq	%rdi,%rax
+
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	adcq	24(%rdx),%r11
+	adcq	32(%rdx),%r12
+	adcq	40(%rdx),%r13
+	adcq	48(%rdx),%r14
+	movq	56(%rdx),%rdi
+	adcq	%rdi,%r15
+	adcq	%rdi,%rbx
+	adcq	%rdi,%rbp
+	adcq	%rdi,%rcx
+	adcq	%rdi,%rax
+
+	movq	%rdx,%rdi
+
+	movq	%r8,0(%rdx)
+	movq	%r9,8(%rdx)
+	movq	%r10,16(%rdx)
+	movq	%r11,24(%rdx)
+	movq	%r12,32(%rdx)
+	movq	%r13,40(%rdx)
+	movq	%r14,48(%rdx)
+	movq	%r15,56(%rdx)
+	movq	%rbx,64(%rdx)
+	movq	%rbp,72(%rdx)
+	movq	%rcx,80(%rdx)
+	movq	%rax,88(%rdx)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_767x63,.-__smulq_767x63
+.type	__smulq_383x63,@function
+.align	32
+__smulq_383x63:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbp
+	addq	%rax,%rbp
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulq	%rbp
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbp
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	imulq	%rbp,%rax
+	addq	%rax,%r13
+
+	leaq	48(%rsi),%rsi
+	movq	%rcx,%rdx
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbp
+	addq	%rax,%rbp
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulq	%rbp
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbp
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	imulq	%rbp,%rax
+	addq	%rax,%r13
+
+	leaq	-48(%rsi),%rsi
+
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_383x63,.-__smulq_383x63
+.type	__smulq_383_n_shift_by_62,@function
+.align	32
+__smulq_383_n_shift_by_62:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rdx,%rbx
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbp
+	addq	%rax,%rbp
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulq	%rbp
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbp
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	imulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+
+	leaq	48(%rsi),%rsi
+	movq	%rdx,%r14
+	movq	%rcx,%rdx
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rdx
+	xorq	%rax,%rax
+	subq	%rdx,%rax
+
+	xorq	%rdx,%rbp
+	addq	%rax,%rbp
+
+	xorq	%rdx,%r8
+	xorq	%rdx,%r9
+	xorq	%rdx,%r10
+	xorq	%rdx,%r11
+	xorq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	%r8,%rax
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulq	%rbp
+	movq	%rax,%r8
+	movq	%r9,%rax
+	movq	%rdx,%r9
+	mulq	%rbp
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	imulq	%rbp
+	addq	%rax,%r13
+	adcq	$0,%rdx
+
+	leaq	-48(%rsi),%rsi
+
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	%rdx,%r14
+	movq	%rbx,%rdx
+
+	shrdq	$62,%r9,%r8
+	shrdq	$62,%r10,%r9
+	shrdq	$62,%r11,%r10
+	shrdq	$62,%r12,%r11
+	shrdq	$62,%r13,%r12
+	shrdq	$62,%r14,%r13
+
+	sarq	$63,%r14
+	xorq	%rbp,%rbp
+	subq	%r14,%rbp
+
+	xorq	%r14,%r8
+	xorq	%r14,%r9
+	xorq	%r14,%r10
+	xorq	%r14,%r11
+	xorq	%r14,%r12
+	xorq	%r14,%r13
+	addq	%rbp,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	xorq	%r14,%rdx
+	xorq	%r14,%rcx
+	addq	%rbp,%rdx
+	addq	%rbp,%rcx
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62
+.type	__ab_approximation_62,@function
+.align	32
+__ab_approximation_62:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	40(%rsi),%r9
+	movq	88(%rsi),%r11
+	movq	32(%rsi),%rbx
+	movq	80(%rsi),%rbp
+	movq	24(%rsi),%r8
+	movq	72(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	cmovzq	%r10,%rbp
+	movq	16(%rsi),%r8
+	movq	64(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	cmovzq	%r10,%rbp
+	movq	8(%rsi),%r8
+	movq	56(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	cmovzq	%r10,%rbp
+	movq	0(%rsi),%r8
+	movq	48(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	bsrq	%rax,%rcx
+	leaq	1(%rcx),%rcx
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%rax,%rcx
+	negq	%rcx
+
+
+	shldq	%cl,%rbx,%r9
+	shldq	%cl,%rbp,%r11
+
+	jmp	__inner_loop_62
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__ab_approximation_62,.-__ab_approximation_62
+.type	__inner_loop_62,@function
+.align	8
+.long	0
+__inner_loop_62:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	$1,%rdx
+	xorq	%rcx,%rcx
+	xorq	%r12,%r12
+	movq	$1,%r13
+	movq	%rsi,8(%rsp)
+
+.Loop_62:
+	xorq	%rax,%rax
+	xorq	%rbx,%rbx
+	testq	$1,%r8
+	movq	%r10,%rbp
+	movq	%r11,%r14
+	cmovnzq	%r10,%rax
+	cmovnzq	%r11,%rbx
+	subq	%r8,%rbp
+	sbbq	%r9,%r14
+	movq	%r8,%r15
+	movq	%r9,%rsi
+	subq	%rax,%r8
+	sbbq	%rbx,%r9
+	cmovcq	%rbp,%r8
+	cmovcq	%r14,%r9
+	cmovcq	%r15,%r10
+	cmovcq	%rsi,%r11
+	movq	%rdx,%rax
+	cmovcq	%r12,%rdx
+	cmovcq	%rax,%r12
+	movq	%rcx,%rbx
+	cmovcq	%r13,%rcx
+	cmovcq	%rbx,%r13
+	xorq	%rax,%rax
+	xorq	%rbx,%rbx
+	shrdq	$1,%r9,%r8
+	shrq	$1,%r9
+	testq	$1,%r15
+	cmovnzq	%r12,%rax
+	cmovnzq	%r13,%rbx
+	addq	%r12,%r12
+	addq	%r13,%r13
+	subq	%rax,%rdx
+	subq	%rbx,%rcx
+	subl	$1,%edi
+	jnz	.Loop_62
+
+	movq	8(%rsp),%rsi
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_62,.-__inner_loop_62
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
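Editor's note: the __ab_approximation_62 helpers (the cmovz chains above, and the csel chains in the ARMv8 file) run the inner loop on approximations rather than on the full 6-limb values: the exact low limb of each operand plus the 64 most significant bits, with both operands shifted by the same amount so their relative sizes are preserved. Roughly, in C (my own simplification, ignoring the constant-time selection and the fully-reduced edge case):

    #include <stdint.h>

    /* Build 64-bit "high" approximations of 6-limb a and b: take the top limb
     * index where either value is non-zero, left-align on the leading bit of
     * a|b, and splice in bits of the limb below so 64 significant bits survive. */
    static void ab_approximation(const uint64_t a[6], const uint64_t b[6],
                                 uint64_t *a_hi, uint64_t *b_hi)
    {
        int i = 5;
        while (i > 1 && (a[i] | b[i]) == 0)   /* skip limbs that are zero in both */
            i--;

        uint64_t top = a[i] | b[i];
        int shift = 0;
        while (shift < 63 && !(top >> 63)) {  /* leading zeros of the larger value */
            top <<= 1;
            shift++;
        }

        *a_hi = shift ? (a[i] << shift) | (a[i - 1] >> (64 - shift)) : a[i];
        *b_hi = shift ? (b[i] << shift) | (b[i - 1] >> (64 - shift)) : b[i];
    }

Because 62 (or 31) iterations only ever inspect the low bits and the relative magnitude of the operands, working on these approximations is safe, and the resulting factors are applied to the full-width values afterwards by the __smul* routines.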
diff --git a/blst/elf/ctx_inverse_mod_384-x86_64.s b/blst/elf/ctx_inverse_mod_384-x86_64.s
new file mode 100644
index 0000000..25a5fa5
--- /dev/null
+++ b/blst/elf/ctx_inverse_mod_384-x86_64.s
@@ -0,0 +1,1574 @@
+.text	
+
+.globl	ctx_inverse_mod_383
+.type	ctx_inverse_mod_383,@function
+.align	32
+ctx_inverse_mod_383:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$1112,%rsp
+.cfi_adjust_cfa_offset	1112
+
+
+	leaq	88+511(%rsp),%rax
+	andq	$-512,%rax
+	movq	%rdi,32(%rsp)
+	movq	%rcx,40(%rsp)
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	0(%rdx),%r14
+	movq	8(%rdx),%r15
+	movq	16(%rdx),%rbx
+	movq	24(%rdx),%rbp
+	movq	32(%rdx),%rsi
+	movq	40(%rdx),%rdi
+
+	movq	%r8,0(%rax)
+	movq	%r9,8(%rax)
+	movq	%r10,16(%rax)
+	movq	%r11,24(%rax)
+	movq	%r12,32(%rax)
+	movq	%r13,40(%rax)
+
+	movq	%r14,48(%rax)
+	movq	%r15,56(%rax)
+	movq	%rbx,64(%rax)
+	movq	%rbp,72(%rax)
+	movq	%rsi,80(%rax)
+	movq	%rax,%rsi
+	movq	%rdi,88(%rax)
+
+
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+
+
+	movq	%rdx,96(%rdi)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+
+
+	movq	%rdx,96(%rdi)
+
+
+	xorq	$256,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+
+
+
+	movq	96(%rsi),%rax
+	movq	144(%rsi),%r11
+	movq	%rdx,%rbx
+	movq	%rax,%r10
+	imulq	56(%rsp)
+	movq	%rax,%r8
+	movq	%r11,%rax
+	movq	%rdx,%r9
+	imulq	64(%rsp)
+	addq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r8,48(%rdi)
+	movq	%r9,56(%rdi)
+	sarq	$63,%r9
+	movq	%r9,64(%rdi)
+	movq	%r9,72(%rdi)
+	movq	%r9,80(%rdi)
+	movq	%r9,88(%rdi)
+	leaq	96(%rsi),%rsi
+
+	movq	%r10,%rax
+	imulq	%rbx
+	movq	%rax,%r8
+	movq	%r11,%rax
+	movq	%rdx,%r9
+	imulq	%rcx
+	addq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r8,96(%rdi)
+	movq	%r9,104(%rdi)
+	sarq	$63,%r9
+	movq	%r9,112(%rdi)
+	movq	%r9,120(%rdi)
+	movq	%r9,128(%rdi)
+	movq	%r9,136(%rdi)
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+	sarq	$63,%r13
+	movq	%r13,48(%rdi)
+	movq	%r13,56(%rdi)
+	movq	%r13,64(%rdi)
+	movq	%r13,72(%rdi)
+	movq	%r13,80(%rdi)
+	movq	%r13,88(%rdi)
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_383_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+	xorq	$256+96,%rsi
+	movl	$31,%edi
+	call	__ab_approximation_31
+
+
+	movq	%r12,72(%rsp)
+	movq	%r13,80(%rsp)
+
+	movq	$256,%rdi
+	xorq	%rsi,%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,56(%rsp)
+	movq	%rcx,64(%rsp)
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_191_n_shift_by_31
+	movq	%rdx,72(%rsp)
+	movq	%rcx,80(%rsp)
+
+	movq	56(%rsp),%rdx
+	movq	64(%rsp),%rcx
+	leaq	96(%rsi),%rsi
+	leaq	48(%rdi),%rdi
+	call	__smulx_383x63
+
+	movq	72(%rsp),%rdx
+	movq	80(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__smulx_767x63
+
+	xorq	$256+96,%rsi
+	movl	$53,%edi
+
+	movq	0(%rsi),%r8
+
+	movq	48(%rsi),%r10
+
+	call	__inner_loop_62
+
+
+
+
+
+
+
+	leaq	96(%rsi),%rsi
+
+
+
+
+
+	movq	%r12,%rdx
+	movq	%r13,%rcx
+	movq	32(%rsp),%rdi
+	call	__smulx_767x63
+
+	movq	40(%rsp),%rsi
+	movq	%rax,%rdx
+	sarq	$63,%rax
+
+	movq	%rax,%r8
+	movq	%rax,%r9
+	movq	%rax,%r10
+	andq	0(%rsi),%r8
+	andq	8(%rsi),%r9
+	movq	%rax,%r11
+	andq	16(%rsi),%r10
+	andq	24(%rsi),%r11
+	movq	%rax,%r12
+	andq	32(%rsi),%r12
+	andq	40(%rsi),%rax
+
+	addq	%r8,%r14
+	adcq	%r9,%r15
+	adcq	%r10,%rbx
+	adcq	%r11,%rbp
+	adcq	%r12,%rcx
+	adcq	%rax,%rdx
+
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	movq	%rbx,64(%rdi)
+	movq	%rbp,72(%rdi)
+	movq	%rcx,80(%rdi)
+	movq	%rdx,88(%rdi)
+
+	leaq	1112(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-1112-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ctx_inverse_mod_383,.-ctx_inverse_mod_383
+.type	__smulx_767x63,@function
+.align	32
+__smulx_767x63:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rdx,%rax
+	sarq	$63,%rax
+	xorq	%rbp,%rbp
+	subq	%rax,%rbp
+
+	movq	%rdi,8(%rsp)
+	movq	%rsi,16(%rsp)
+	leaq	48(%rsi),%rsi
+
+	xorq	%rax,%rdx
+	addq	%rbp,%rdx
+
+	xorq	%rax,%r8
+	xorq	%rax,%r9
+	xorq	%rax,%r10
+	xorq	%rax,%r11
+	xorq	%rax,%r12
+	xorq	%r13,%rax
+	addq	%rbp,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%rax
+
+	mulxq	%r8,%r8,%rbp
+	mulxq	%r9,%r9,%r13
+	addq	%rbp,%r9
+	mulxq	%r10,%r10,%rbp
+	adcq	%r13,%r10
+	mulxq	%r11,%r11,%r13
+	adcq	%rbp,%r11
+	mulxq	%r12,%r12,%rbp
+	adcq	%r13,%r12
+	adcq	$0,%rbp
+	imulq	%rdx
+	addq	%rbp,%rax
+	adcq	$0,%rdx
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%rax,40(%rdi)
+	movq	%rdx,48(%rdi)
+	sarq	$63,%rdx
+	movq	%rdx,56(%rdi)
+	movq	%rcx,%rdx
+	movq	%rcx,%rax
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+	movq	56(%rsi),%r15
+	movq	64(%rsi),%rbx
+	movq	72(%rsi),%rbp
+	movq	80(%rsi),%rcx
+	movq	88(%rsi),%rdi
+
+	sarq	$63,%rax
+	xorq	%rsi,%rsi
+	subq	%rax,%rsi
+
+	xorq	%rax,%rdx
+	addq	%rsi,%rdx
+
+	xorq	%rax,%r8
+	xorq	%rax,%r9
+	xorq	%rax,%r10
+	xorq	%rax,%r11
+	xorq	%rax,%r12
+	xorq	%rax,%r13
+	xorq	%rax,%r14
+	xorq	%rax,%r15
+	xorq	%rax,%rbx
+	xorq	%rax,%rbp
+	xorq	%rax,%rcx
+	xorq	%rax,%rdi
+	addq	%rsi,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+	adcq	$0,%rcx
+	adcq	$0,%rdi
+
+	mulxq	%r8,%r8,%rax
+	mulxq	%r9,%r9,%rsi
+	addq	%rax,%r9
+	mulxq	%r10,%r10,%rax
+	adcq	%rsi,%r10
+	mulxq	%r11,%r11,%rsi
+	adcq	%rax,%r11
+	mulxq	%r12,%r12,%rax
+	adcq	%rsi,%r12
+	mulxq	%r13,%r13,%rsi
+	adcq	%rax,%r13
+	mulxq	%r14,%r14,%rax
+	adcq	%rsi,%r14
+	mulxq	%r15,%r15,%rsi
+	adcq	%rax,%r15
+	mulxq	%rbx,%rbx,%rax
+	adcq	%rsi,%rbx
+	mulxq	%rbp,%rbp,%rsi
+	adcq	%rax,%rbp
+	mulxq	%rcx,%rcx,%rax
+	adcq	%rsi,%rcx
+	mulxq	%rdi,%rdi,%rsi
+	movq	8(%rsp),%rdx
+	movq	16(%rsp),%rsi
+	adcq	%rdi,%rax
+
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	adcq	24(%rdx),%r11
+	adcq	32(%rdx),%r12
+	adcq	40(%rdx),%r13
+	adcq	48(%rdx),%r14
+	movq	56(%rdx),%rdi
+	adcq	%rdi,%r15
+	adcq	%rdi,%rbx
+	adcq	%rdi,%rbp
+	adcq	%rdi,%rcx
+	adcq	%rdi,%rax
+
+	movq	%rdx,%rdi
+
+	movq	%r8,0(%rdx)
+	movq	%r9,8(%rdx)
+	movq	%r10,16(%rdx)
+	movq	%r11,24(%rdx)
+	movq	%r12,32(%rdx)
+	movq	%r13,40(%rdx)
+	movq	%r14,48(%rdx)
+	movq	%r15,56(%rdx)
+	movq	%rbx,64(%rdx)
+	movq	%rbp,72(%rdx)
+	movq	%rcx,80(%rdx)
+	movq	%rax,88(%rdx)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulx_767x63,.-__smulx_767x63
+.type	__smulx_383x63,@function
+.align	32
+__smulx_383x63:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0+0(%rsi),%r8
+	movq	0+8(%rsi),%r9
+	movq	0+16(%rsi),%r10
+	movq	0+24(%rsi),%r11
+	movq	0+32(%rsi),%r12
+	movq	0+40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rbp
+	xorq	%rax,%rax
+	subq	%rbp,%rax
+
+	xorq	%rbp,%rdx
+	addq	%rax,%rdx
+
+	xorq	%rbp,%r8
+	xorq	%rbp,%r9
+	xorq	%rbp,%r10
+	xorq	%rbp,%r11
+	xorq	%rbp,%r12
+	xorq	%rbp,%r13
+	addq	%rax,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulxq	%r8,%r8,%rbp
+	mulxq	%r9,%r9,%rax
+	addq	%rbp,%r9
+	mulxq	%r10,%r10,%rbp
+	adcq	%rax,%r10
+	mulxq	%r11,%r11,%rax
+	adcq	%rbp,%r11
+	mulxq	%r12,%r12,%rbp
+	adcq	%rax,%r12
+	mulxq	%r13,%r13,%rax
+	movq	%rcx,%rdx
+	adcq	%rbp,%r13
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	48+0(%rsi),%r8
+	movq	48+8(%rsi),%r9
+	movq	48+16(%rsi),%r10
+	movq	48+24(%rsi),%r11
+	movq	48+32(%rsi),%r12
+	movq	48+40(%rsi),%r13
+
+	movq	%rdx,%rbp
+	sarq	$63,%rbp
+	xorq	%rax,%rax
+	subq	%rbp,%rax
+
+	xorq	%rbp,%rdx
+	addq	%rax,%rdx
+
+	xorq	%rbp,%r8
+	xorq	%rbp,%r9
+	xorq	%rbp,%r10
+	xorq	%rbp,%r11
+	xorq	%rbp,%r12
+	xorq	%rbp,%r13
+	addq	%rax,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	mulxq	%r8,%r8,%rbp
+	mulxq	%r9,%r9,%rax
+	addq	%rbp,%r9
+	mulxq	%r10,%r10,%rbp
+	adcq	%rax,%r10
+	mulxq	%r11,%r11,%rax
+	adcq	%rbp,%r11
+	mulxq	%r12,%r12,%rbp
+	adcq	%rax,%r12
+	mulxq	%r13,%r13,%rax
+	adcq	%rbp,%r13
+
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulx_383x63,.-__smulx_383x63
+.type	__smulx_383_n_shift_by_31,@function
+.align	32
+__smulx_383_n_shift_by_31:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rdx,%rbx
+	xorq	%r14,%r14
+	movq	0+0(%rsi),%r8
+	movq	0+8(%rsi),%r9
+	movq	0+16(%rsi),%r10
+	movq	0+24(%rsi),%r11
+	movq	0+32(%rsi),%r12
+	movq	0+40(%rsi),%r13
+
+	movq	%rdx,%rax
+	sarq	$63,%rax
+	xorq	%rbp,%rbp
+	subq	%rax,%rbp
+
+	xorq	%rax,%rdx
+	addq	%rbp,%rdx
+
+	xorq	%rax,%r8
+	xorq	%rax,%r9
+	xorq	%rax,%r10
+	xorq	%rax,%r11
+	xorq	%rax,%r12
+	xorq	%r13,%rax
+	addq	%rbp,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%rax
+
+	mulxq	%r8,%r8,%rbp
+	mulxq	%r9,%r9,%r13
+	addq	%rbp,%r9
+	mulxq	%r10,%r10,%rbp
+	adcq	%r13,%r10
+	mulxq	%r11,%r11,%r13
+	adcq	%rbp,%r11
+	mulxq	%r12,%r12,%rbp
+	adcq	%r13,%r12
+	adcq	$0,%rbp
+	imulq	%rdx
+	addq	%rbp,%rax
+	adcq	%rdx,%r14
+
+	movq	%rcx,%rdx
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%rax,40(%rdi)
+	movq	48+0(%rsi),%r8
+	movq	48+8(%rsi),%r9
+	movq	48+16(%rsi),%r10
+	movq	48+24(%rsi),%r11
+	movq	48+32(%rsi),%r12
+	movq	48+40(%rsi),%r13
+
+	movq	%rdx,%rax
+	sarq	$63,%rax
+	xorq	%rbp,%rbp
+	subq	%rax,%rbp
+
+	xorq	%rax,%rdx
+	addq	%rbp,%rdx
+
+	xorq	%rax,%r8
+	xorq	%rax,%r9
+	xorq	%rax,%r10
+	xorq	%rax,%r11
+	xorq	%rax,%r12
+	xorq	%r13,%rax
+	addq	%rbp,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%rax
+
+	mulxq	%r8,%r8,%rbp
+	mulxq	%r9,%r9,%r13
+	addq	%rbp,%r9
+	mulxq	%r10,%r10,%rbp
+	adcq	%r13,%r10
+	mulxq	%r11,%r11,%r13
+	adcq	%rbp,%r11
+	mulxq	%r12,%r12,%rbp
+	adcq	%r13,%r12
+	adcq	$0,%rbp
+	imulq	%rdx
+	addq	%rbp,%rax
+	adcq	$0,%rdx
+
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%rax
+	adcq	%rdx,%r14
+	movq	%rbx,%rdx
+
+	shrdq	$31,%r9,%r8
+	shrdq	$31,%r10,%r9
+	shrdq	$31,%r11,%r10
+	shrdq	$31,%r12,%r11
+	shrdq	$31,%rax,%r12
+	shrdq	$31,%r14,%rax
+
+	sarq	$63,%r14
+	xorq	%rbp,%rbp
+	subq	%r14,%rbp
+
+	xorq	%r14,%r8
+	xorq	%r14,%r9
+	xorq	%r14,%r10
+	xorq	%r14,%r11
+	xorq	%r14,%r12
+	xorq	%r14,%rax
+	addq	%rbp,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%rax
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%rax,40(%rdi)
+
+	xorq	%r14,%rdx
+	xorq	%r14,%rcx
+	addq	%rbp,%rdx
+	addq	%rbp,%rcx
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31
+.type	__smulx_191_n_shift_by_31,@function
+.align	32
+__smulx_191_n_shift_by_31:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rdx,%rbx
+	movq	0+0(%rsi),%r8
+	movq	0+8(%rsi),%r9
+	movq	0+16(%rsi),%r10
+
+	movq	%rdx,%rax
+	sarq	$63,%rax
+	xorq	%rbp,%rbp
+	subq	%rax,%rbp
+
+	xorq	%rax,%rdx
+	addq	%rbp,%rdx
+
+	xorq	%rax,%r8
+	xorq	%rax,%r9
+	xorq	%r10,%rax
+	addq	%rbp,%r8
+	adcq	$0,%r9
+	adcq	$0,%rax
+
+	mulxq	%r8,%r8,%rbp
+	mulxq	%r9,%r9,%r10
+	addq	%rbp,%r9
+	adcq	$0,%r10
+	imulq	%rdx
+	addq	%rax,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+	movq	%rcx,%rdx
+	movq	48+0(%rsi),%r11
+	movq	48+8(%rsi),%r12
+	movq	48+16(%rsi),%r13
+
+	movq	%rdx,%rax
+	sarq	$63,%rax
+	xorq	%rbp,%rbp
+	subq	%rax,%rbp
+
+	xorq	%rax,%rdx
+	addq	%rbp,%rdx
+
+	xorq	%rax,%r11
+	xorq	%rax,%r12
+	xorq	%r13,%rax
+	addq	%rbp,%r11
+	adcq	$0,%r12
+	adcq	$0,%rax
+
+	mulxq	%r11,%r11,%rbp
+	mulxq	%r12,%r12,%r13
+	addq	%rbp,%r12
+	adcq	$0,%r13
+	imulq	%rdx
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r8,%r11
+	adcq	%r9,%r12
+	adcq	%r10,%r13
+	adcq	%rdx,%r14
+	movq	%rbx,%rdx
+
+	shrdq	$31,%r12,%r11
+	shrdq	$31,%r13,%r12
+	shrdq	$31,%r14,%r13
+
+	sarq	$63,%r14
+	xorq	%rbp,%rbp
+	subq	%r14,%rbp
+
+	xorq	%r14,%r11
+	xorq	%r14,%r12
+	xorq	%r14,%r13
+	addq	%rbp,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+	movq	%r13,16(%rdi)
+
+	xorq	%r14,%rdx
+	xorq	%r14,%rcx
+	addq	%rbp,%rdx
+	addq	%rbp,%rcx
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31
+.type	__ab_approximation_31,@function
+.align	32
+__ab_approximation_31:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	40(%rsi),%r9
+	movq	88(%rsi),%r11
+	movq	32(%rsi),%rbx
+	movq	80(%rsi),%rbp
+	movq	24(%rsi),%r8
+	movq	72(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	movq	16(%rsi),%r8
+	cmovzq	%r10,%rbp
+	movq	64(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	movq	8(%rsi),%r8
+	cmovzq	%r10,%rbp
+	movq	56(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	movq	0(%rsi),%r8
+	cmovzq	%r10,%rbp
+	movq	48(%rsi),%r10
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	cmovzq	%rbx,%r9
+	cmovzq	%rbp,%r11
+	cmovzq	%r8,%rbx
+	cmovzq	%r10,%rbp
+
+	movq	%r9,%rax
+	orq	%r11,%rax
+	bsrq	%rax,%rcx
+	leaq	1(%rcx),%rcx
+	cmovzq	%r8,%r9
+	cmovzq	%r10,%r11
+	cmovzq	%rax,%rcx
+	negq	%rcx
+
+
+	shldq	%cl,%rbx,%r9
+	shldq	%cl,%rbp,%r11
+
+	movl	$0x7FFFFFFF,%eax
+	andq	%rax,%r8
+	andq	%rax,%r10
+	andnq	%r9,%rax,%r9
+	andnq	%r11,%rax,%r11
+	orq	%r9,%r8
+	orq	%r11,%r10
+
+	jmp	__inner_loop_31
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__ab_approximation_31,.-__ab_approximation_31
+.type	__inner_loop_31,@function
+.align	32
+__inner_loop_31:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	$0x7FFFFFFF80000000,%rcx
+	movq	$0x800000007FFFFFFF,%r13
+	movq	$0x7FFFFFFF7FFFFFFF,%r15
+
+.Loop_31:
+	cmpq	%r10,%r8
+	movq	%r8,%rax
+	movq	%r10,%rbx
+	movq	%rcx,%rbp
+	movq	%r13,%r14
+	cmovbq	%r10,%r8
+	cmovbq	%rax,%r10
+	cmovbq	%r13,%rcx
+	cmovbq	%rbp,%r13
+
+	subq	%r10,%r8
+	subq	%r13,%rcx
+	addq	%r15,%rcx
+
+	testq	$1,%rax
+	cmovzq	%rax,%r8
+	cmovzq	%rbx,%r10
+	cmovzq	%rbp,%rcx
+	cmovzq	%r14,%r13
+
+	shrq	$1,%r8
+	addq	%r13,%r13
+	subq	%r15,%r13
+	subl	$1,%edi
+	jnz	.Loop_31
+
+	shrq	$32,%r15
+	movl	%ecx,%edx
+	movl	%r13d,%r12d
+	shrq	$32,%rcx
+	shrq	$32,%r13
+	subq	%r15,%rdx
+	subq	%r15,%rcx
+	subq	%r15,%r12
+	subq	%r15,%r13
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_31,.-__inner_loop_31
+
+.type	__inner_loop_62,@function
+.align	32
+__inner_loop_62:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	$1,%rdx
+	xorq	%rcx,%rcx
+	xorq	%r12,%r12
+	movq	$1,%r13
+
+.Loop_62:
+	xorq	%rax,%rax
+	testq	$1,%r8
+	movq	%r10,%rbx
+	cmovnzq	%r10,%rax
+	subq	%r8,%rbx
+	movq	%r8,%rbp
+	subq	%rax,%r8
+	cmovcq	%rbx,%r8
+	cmovcq	%rbp,%r10
+	movq	%rdx,%rax
+	cmovcq	%r12,%rdx
+	cmovcq	%rax,%r12
+	movq	%rcx,%rbx
+	cmovcq	%r13,%rcx
+	cmovcq	%rbx,%r13
+	xorq	%rax,%rax
+	xorq	%rbx,%rbx
+	shrq	$1,%r8
+	testq	$1,%rbp
+	cmovnzq	%r12,%rax
+	cmovnzq	%r13,%rbx
+	addq	%r12,%r12
+	addq	%r13,%r13
+	subq	%rax,%rdx
+	subq	%rbx,%rcx
+	subl	$1,%edi
+	jnz	.Loop_62
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__inner_loop_62,.-__inner_loop_62
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/div3w-armv8.S b/blst/elf/div3w-armv8.S
new file mode 100644
index 0000000..a2b1d67
--- /dev/null
+++ b/blst/elf/div3w-armv8.S
@@ -0,0 +1,88 @@
+.text
+
+.globl	div_3_limbs
+.type	div_3_limbs,%function
+.align	5
+div_3_limbs:
+	ldp	x4,x5,[x0]	// load R
+	eor	x0,x0,x0	// Q = 0
+	mov	x3,#64		// loop counter
+	nop
+
+.Loop:
+	subs	x6,x4,x1	// R - D
+	add	x0,x0,x0	// Q <<= 1
+	sbcs	x7,x5,x2
+	add	x0,x0,#1	// Q + speculative bit
+	csel	x4,x4,x6,lo	// select between R and R - D
+	extr	x1,x2,x1,#1	// D >>= 1
+	csel	x5,x5,x7,lo
+	lsr	x2,x2,#1
+	sbc	x0,x0,xzr	// subtract speculative bit
+	sub	x3,x3,#1
+	cbnz	x3,.Loop
+
+	asr	x3,x0,#63	// top bit -> mask
+	add	x0,x0,x0	// Q <<= 1
+	subs	x6,x4,x1	// R - D
+	add	x0,x0,#1	// Q + speculative bit
+	sbcs	x7,x5,x2
+	sbc	x0,x0,xzr	// subtract speculative bit
+
+	orr	x0,x0,x3	// all ones if overflow
+
+	ret
+.size	div_3_limbs,.-div_3_limbs
+.globl	quot_rem_128
+.type	quot_rem_128,%function
+.align	5
+quot_rem_128:
+	ldp	x3,x4,[x1]
+
+	mul	x5,x3,x2	// divisor[0:1] * quotient
+	umulh	x6,x3,x2
+	mul	x11,  x4,x2
+	umulh	x7,x4,x2
+
+	ldp	x8,x9,[x0]	// load 3 limbs of the dividend
+	ldr	x10,[x0,#16]
+
+	adds	x6,x6,x11
+	adc	x7,x7,xzr
+
+	subs	x8,x8,x5	// dividend - divisor * quotient
+	sbcs	x9,x9,x6
+	sbcs	x10,x10,x7
+	sbc	x5,xzr,xzr		// borrow -> mask
+
+	add	x2,x2,x5	// if borrowed, adjust the quotient ...
+	and	x3,x3,x5
+	and	x4,x4,x5
+	adds	x8,x8,x3	// ... and add divisor
+	adc	x9,x9,x4
+
+	stp	x8,x9,[x0]	// save 2 limbs of the remainder
+	str	x2,[x0,#16]	// and one limb of the quotient
+
+	mov	x0,x2		// return adjusted quotient
+
+	ret
+.size	quot_rem_128,.-quot_rem_128
+
+.globl	quot_rem_64
+.type	quot_rem_64,%function
+.align	5
+quot_rem_64:
+	ldr	x3,[x1]
+	ldr	x8,[x0]	// load 1 limb of the dividend
+
+	mul	x5,x3,x2	// divisor * quotient
+
+	sub	x8,x8,x5	// dividend - divisor * quotient
+
+	stp	x8,x2,[x0]	// save remainder and quotient
+
+	mov	x0,x2		// return quotient
+
+	ret
+.size	quot_rem_64,.-quot_rem_64
diff --git a/blst/elf/div3w-x86_64.s b/blst/elf/div3w-x86_64.s
new file mode 100644
index 0000000..00ae569
--- /dev/null
+++ b/blst/elf/div3w-x86_64.s
@@ -0,0 +1,123 @@
+.text	
+
+.globl	div_3_limbs
+.hidden	div_3_limbs
+.type	div_3_limbs,@function
+.align	32
+div_3_limbs:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	(%rdi),%r8
+	movq	8(%rdi),%r9
+	xorq	%rax,%rax
+	movl	$64,%ecx
+
+.Loop:
+	movq	%r8,%r10
+	subq	%rsi,%r8
+	movq	%r9,%r11
+	sbbq	%rdx,%r9
+	leaq	1(%rax,%rax,1),%rax
+	movq	%rdx,%rdi
+	cmovcq	%r10,%r8
+	cmovcq	%r11,%r9
+	sbbq	$0,%rax
+	shlq	$63,%rdi
+	shrq	$1,%rsi
+	shrq	$1,%rdx
+	orq	%rdi,%rsi
+	subl	$1,%ecx
+	jnz	.Loop
+
+	leaq	1(%rax,%rax,1),%rcx
+	sarq	$63,%rax
+
+	subq	%rsi,%r8
+	sbbq	%rdx,%r9
+	sbbq	$0,%rcx
+
+	orq	%rcx,%rax
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	div_3_limbs,.-div_3_limbs
+.globl	quot_rem_128
+.hidden	quot_rem_128
+.type	quot_rem_128,@function
+.align	32
+quot_rem_128:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rdx,%rax
+	movq	%rdx,%rcx
+
+	mulq	0(%rsi)
+	movq	%rax,%r8
+	movq	%rcx,%rax
+	movq	%rdx,%r9
+
+	mulq	8(%rsi)
+	addq	%rax,%r9
+	adcq	$0,%rdx
+
+	movq	0(%rdi),%r10
+	movq	8(%rdi),%r11
+	movq	16(%rdi),%rax
+
+	subq	%r8,%r10
+	sbbq	%r9,%r11
+	sbbq	%rdx,%rax
+	sbbq	%r8,%r8
+
+	addq	%r8,%rcx
+	movq	%r8,%r9
+	andq	0(%rsi),%r8
+	andq	8(%rsi),%r9
+	addq	%r8,%r10
+	adcq	%r9,%r11
+
+	movq	%r10,0(%rdi)
+	movq	%r11,8(%rdi)
+	movq	%rcx,16(%rdi)
+
+	movq	%rcx,%rax
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	quot_rem_128,.-quot_rem_128
+
+
+
+
+
+.globl	quot_rem_64
+.hidden	quot_rem_64
+.type	quot_rem_64,@function
+.align	32
+quot_rem_64:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rdx,%rax
+	imulq	0(%rsi),%rdx
+
+	movq	0(%rdi),%r10
+
+	subq	%rdx,%r10
+
+	movq	%r10,0(%rdi)
+	movq	%rax,8(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	quot_rem_64,.-quot_rem_64
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/mul_mont_256-armv8.S b/blst/elf/mul_mont_256-armv8.S
new file mode 100644
index 0000000..8bb1197
--- /dev/null
+++ b/blst/elf/mul_mont_256-armv8.S
@@ -0,0 +1,464 @@
+.text
+
+.globl	mul_mont_sparse_256
+.hidden	mul_mont_sparse_256
+.type	mul_mont_sparse_256,%function
+.align	5
+mul_mont_sparse_256:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldp	x10,x11,[x1]
+	ldr	x9,        [x2]
+	ldp	x12,x13,[x1,#16]
+
+	mul	x19,x10,x9
+	ldp	x5,x6,[x3]
+	mul	x20,x11,x9
+	ldp	x7,x8,[x3,#16]
+	mul	x21,x12,x9
+	mul	x22,x13,x9
+
+	umulh	x14,x10,x9
+	umulh	x15,x11,x9
+	mul	x3,x4,x19
+	umulh	x16,x12,x9
+	umulh	x17,x13,x9
+	adds	x20,x20,x14
+	//mul	x14,x5,x3
+	adcs	x21,x21,x15
+	mul	x15,x6,x3
+	adcs	x22,x22,x16
+	mul	x16,x7,x3
+	adc	x23,xzr,    x17
+	mul	x17,x8,x3
+	ldr	x9,[x2,8*1]
+	subs	xzr,x19,#1		//adds	x19,x19,x14
+	umulh	x14,x5,x3
+	adcs	x20,x20,x15
+	umulh	x15,x6,x3
+	adcs	x21,x21,x16
+	umulh	x16,x7,x3
+	adcs	x22,x22,x17
+	umulh	x17,x8,x3
+	adc	x23,x23,xzr
+
+	adds	x19,x20,x14
+	mul	x14,x10,x9
+	adcs	x20,x21,x15
+	mul	x15,x11,x9
+	adcs	x21,x22,x16
+	mul	x16,x12,x9
+	adcs	x22,x23,x17
+	mul	x17,x13,x9
+	adc	x23,xzr,xzr
+
+	adds	x19,x19,x14
+	umulh	x14,x10,x9
+	adcs	x20,x20,x15
+	umulh	x15,x11,x9
+	adcs	x21,x21,x16
+	mul	x3,x4,x19
+	umulh	x16,x12,x9
+	adcs	x22,x22,x17
+	umulh	x17,x13,x9
+	adc	x23,x23,xzr
+
+	adds	x20,x20,x14
+	//mul	x14,x5,x3
+	adcs	x21,x21,x15
+	mul	x15,x6,x3
+	adcs	x22,x22,x16
+	mul	x16,x7,x3
+	adc	x23,x23,x17
+	mul	x17,x8,x3
+	ldr	x9,[x2,8*2]
+	subs	xzr,x19,#1		//adds	x19,x19,x14
+	umulh	x14,x5,x3
+	adcs	x20,x20,x15
+	umulh	x15,x6,x3
+	adcs	x21,x21,x16
+	umulh	x16,x7,x3
+	adcs	x22,x22,x17
+	umulh	x17,x8,x3
+	adc	x23,x23,xzr
+
+	adds	x19,x20,x14
+	mul	x14,x10,x9
+	adcs	x20,x21,x15
+	mul	x15,x11,x9
+	adcs	x21,x22,x16
+	mul	x16,x12,x9
+	adcs	x22,x23,x17
+	mul	x17,x13,x9
+	adc	x23,xzr,xzr
+
+	adds	x19,x19,x14
+	umulh	x14,x10,x9
+	adcs	x20,x20,x15
+	umulh	x15,x11,x9
+	adcs	x21,x21,x16
+	mul	x3,x4,x19
+	umulh	x16,x12,x9
+	adcs	x22,x22,x17
+	umulh	x17,x13,x9
+	adc	x23,x23,xzr
+
+	adds	x20,x20,x14
+	//mul	x14,x5,x3
+	adcs	x21,x21,x15
+	mul	x15,x6,x3
+	adcs	x22,x22,x16
+	mul	x16,x7,x3
+	adc	x23,x23,x17
+	mul	x17,x8,x3
+	ldr	x9,[x2,8*3]
+	subs	xzr,x19,#1		//adds	x19,x19,x14
+	umulh	x14,x5,x3
+	adcs	x20,x20,x15
+	umulh	x15,x6,x3
+	adcs	x21,x21,x16
+	umulh	x16,x7,x3
+	adcs	x22,x22,x17
+	umulh	x17,x8,x3
+	adc	x23,x23,xzr
+
+	adds	x19,x20,x14
+	mul	x14,x10,x9
+	adcs	x20,x21,x15
+	mul	x15,x11,x9
+	adcs	x21,x22,x16
+	mul	x16,x12,x9
+	adcs	x22,x23,x17
+	mul	x17,x13,x9
+	adc	x23,xzr,xzr
+
+	adds	x19,x19,x14
+	umulh	x14,x10,x9
+	adcs	x20,x20,x15
+	umulh	x15,x11,x9
+	adcs	x21,x21,x16
+	mul	x3,x4,x19
+	umulh	x16,x12,x9
+	adcs	x22,x22,x17
+	umulh	x17,x13,x9
+	adc	x23,x23,xzr
+
+	adds	x20,x20,x14
+	//mul	x14,x5,x3
+	adcs	x21,x21,x15
+	mul	x15,x6,x3
+	adcs	x22,x22,x16
+	mul	x16,x7,x3
+	adc	x23,x23,x17
+	mul	x17,x8,x3
+	subs	xzr,x19,#1		//adds	x19,x19,x14
+	umulh	x14,x5,x3
+	adcs	x20,x20,x15
+	umulh	x15,x6,x3
+	adcs	x21,x21,x16
+	umulh	x16,x7,x3
+	adcs	x22,x22,x17
+	umulh	x17,x8,x3
+	adc	x23,x23,xzr
+
+	adds	x19,x20,x14
+	adcs	x20,x21,x15
+	adcs	x21,x22,x16
+	adcs	x22,x23,x17
+	adc	x23,xzr,xzr
+
+	subs	x14,x19,x5
+	sbcs	x15,x20,x6
+	sbcs	x16,x21,x7
+	sbcs	x17,x22,x8
+	sbcs	xzr,    x23,xzr
+
+	csel	x19,x19,x14,lo
+	csel	x20,x20,x15,lo
+	csel	x21,x21,x16,lo
+	csel	x22,x22,x17,lo
+
+	stp	x19,x20,[x0]
+	stp	x21,x22,[x0,#16]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	mul_mont_sparse_256,.-mul_mont_sparse_256
+.globl	sqr_mont_sparse_256
+.hidden	sqr_mont_sparse_256
+.type	sqr_mont_sparse_256,%function
+.align	5
+sqr_mont_sparse_256:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-48]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+
+	ldp	x5,x6,[x1]
+	ldp	x7,x8,[x1,#16]
+	mov	x4,x3
+
+	////////////////////////////////////////////////////////////////
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes the x-th accumulator limb
+	//
+	//  "can't overflow" below mark carrying into high part of
+	//  multiplication result, which can't overflow, because it
+	//  can never be all ones.
+
+	mul	x11,x6,x5	// a[1]*a[0]
+	umulh	x15,x6,x5
+	mul	x12,x7,x5	// a[2]*a[0]
+	umulh	x16,x7,x5
+	mul	x13,x8,x5	// a[3]*a[0]
+	umulh	x19,x8,x5
+
+	adds	x12,x12,x15	// accumulate high parts of multiplication
+	mul	x14,x7,x6	// a[2]*a[1]
+	umulh	x15,x7,x6
+	adcs	x13,x13,x16
+	mul	x16,x8,x6	// a[3]*a[1]
+	umulh	x17,x8,x6
+	adc	x19,x19,xzr	// can't overflow
+
+	mul	x20,x8,x7	// a[3]*a[2]
+	umulh	x21,x8,x7
+
+	adds	x15,x15,x16	// accumulate high parts of multiplication
+	mul	x10,x5,x5	// a[0]*a[0]
+	adc	x16,x17,xzr	// can't overflow
+
+	adds	x13,x13,x14	// accumulate low parts of multiplication
+	umulh	x5,x5,x5
+	adcs	x19,x19,x15
+	mul	x15,x6,x6	// a[1]*a[1]
+	adcs	x20,x20,x16
+	umulh	x6,x6,x6
+	adc	x21,x21,xzr	// can't overflow
+
+	adds	x11,x11,x11	// acc[1-6]*=2
+	mul	x16,x7,x7	// a[2]*a[2]
+	adcs	x12,x12,x12
+	umulh	x7,x7,x7
+	adcs	x13,x13,x13
+	mul	x17,x8,x8	// a[3]*a[3]
+	adcs	x19,x19,x19
+	umulh	x8,x8,x8
+	adcs	x20,x20,x20
+	adcs	x21,x21,x21
+	adc	x22,xzr,xzr
+
+	adds	x11,x11,x5	// +a[i]*a[i]
+	adcs	x12,x12,x15
+	adcs	x13,x13,x6
+	adcs	x19,x19,x16
+	adcs	x20,x20,x7
+	adcs	x21,x21,x17
+	adc	x22,x22,x8
+
+	bl	__mul_by_1_mont_256
+	ldr	x30,[x29,#8]
+
+	adds	x10,x10,x19	// accumulate upper half
+	adcs	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	adc	x19,xzr,xzr
+
+	subs	x14,x10,x5
+	sbcs	x15,x11,x6
+	sbcs	x16,x12,x7
+	sbcs	x17,x13,x8
+	sbcs	xzr,    x19,xzr
+
+	csel	x10,x10,x14,lo
+	csel	x11,x11,x15,lo
+	csel	x12,x12,x16,lo
+	csel	x13,x13,x17,lo
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldr	x29,[sp],#48
+	.inst	0xd50323bf
+	ret
+.size	sqr_mont_sparse_256,.-sqr_mont_sparse_256
+.globl	from_mont_256
+.hidden	from_mont_256
+.type	from_mont_256,%function
+.align	5
+from_mont_256:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x4,x3
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+
+	bl	__mul_by_1_mont_256
+	ldr	x30,[x29,#8]
+
+	subs	x14,x10,x5
+	sbcs	x15,x11,x6
+	sbcs	x16,x12,x7
+	sbcs	x17,x13,x8
+
+	csel	x10,x10,x14,lo
+	csel	x11,x11,x15,lo
+	csel	x12,x12,x16,lo
+	csel	x13,x13,x17,lo
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+
+	ldr	x29,[sp],#16
+	.inst	0xd50323bf
+	ret
+.size	from_mont_256,.-from_mont_256
+
+.globl	redc_mont_256
+.hidden	redc_mont_256
+.type	redc_mont_256,%function
+.align	5
+redc_mont_256:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x4,x3
+	ldp	x10,x11,[x1]
+	ldp	x12,x13,[x1,#16]
+
+	bl	__mul_by_1_mont_256
+	ldr	x30,[x29,#8]
+
+	ldp	x14,x15,[x1,#32]
+	ldp	x16,x17,[x1,#48]
+
+	adds	x10,x10,x14
+	adcs	x11,x11,x15
+	adcs	x12,x12,x16
+	adcs	x13,x13,x17
+	adc	x9,xzr,xzr
+
+	subs	x14,x10,x5
+	sbcs	x15,x11,x6
+	sbcs	x16,x12,x7
+	sbcs	x17,x13,x8
+	sbcs	xzr,    x9,xzr
+
+	csel	x10,x10,x14,lo
+	csel	x11,x11,x15,lo
+	csel	x12,x12,x16,lo
+	csel	x13,x13,x17,lo
+
+	stp	x10,x11,[x0]
+	stp	x12,x13,[x0,#16]
+
+	ldr	x29,[sp],#16
+	.inst	0xd50323bf
+	ret
+.size	redc_mont_256,.-redc_mont_256
+
+.type	__mul_by_1_mont_256,%function
+.align	5
+__mul_by_1_mont_256:
+	mul	x3,x4,x10
+	ldp	x5,x6,[x2]
+	ldp	x7,x8,[x2,#16]
+	//mul	x14,x5,x3
+	mul	x15,x6,x3
+	mul	x16,x7,x3
+	mul	x17,x8,x3
+	subs	xzr,x10,#1		//adds	x10,x10,x14
+	umulh	x14,x5,x3
+	adcs	x11,x11,x15
+	umulh	x15,x6,x3
+	adcs	x12,x12,x16
+	umulh	x16,x7,x3
+	adcs	x13,x13,x17
+	umulh	x17,x8,x3
+	adc	x9,xzr,xzr
+
+	adds	x10,x11,x14
+	adcs	x11,x12,x15
+	adcs	x12,x13,x16
+	mul	x3,x4,x10
+	adc	x13,x9,x17
+	//mul	x14,x5,x3
+	mul	x15,x6,x3
+	mul	x16,x7,x3
+	mul	x17,x8,x3
+	subs	xzr,x10,#1		//adds	x10,x10,x14
+	umulh	x14,x5,x3
+	adcs	x11,x11,x15
+	umulh	x15,x6,x3
+	adcs	x12,x12,x16
+	umulh	x16,x7,x3
+	adcs	x13,x13,x17
+	umulh	x17,x8,x3
+	adc	x9,xzr,xzr
+
+	adds	x10,x11,x14
+	adcs	x11,x12,x15
+	adcs	x12,x13,x16
+	mul	x3,x4,x10
+	adc	x13,x9,x17
+	//mul	x14,x5,x3
+	mul	x15,x6,x3
+	mul	x16,x7,x3
+	mul	x17,x8,x3
+	subs	xzr,x10,#1		//adds	x10,x10,x14
+	umulh	x14,x5,x3
+	adcs	x11,x11,x15
+	umulh	x15,x6,x3
+	adcs	x12,x12,x16
+	umulh	x16,x7,x3
+	adcs	x13,x13,x17
+	umulh	x17,x8,x3
+	adc	x9,xzr,xzr
+
+	adds	x10,x11,x14
+	adcs	x11,x12,x15
+	adcs	x12,x13,x16
+	mul	x3,x4,x10
+	adc	x13,x9,x17
+	//mul	x14,x5,x3
+	mul	x15,x6,x3
+	mul	x16,x7,x3
+	mul	x17,x8,x3
+	subs	xzr,x10,#1		//adds	x10,x10,x14
+	umulh	x14,x5,x3
+	adcs	x11,x11,x15
+	umulh	x15,x6,x3
+	adcs	x12,x12,x16
+	umulh	x16,x7,x3
+	adcs	x13,x13,x17
+	umulh	x17,x8,x3
+	adc	x9,xzr,xzr
+
+	adds	x10,x11,x14
+	adcs	x11,x12,x15
+	adcs	x12,x13,x16
+	adc	x13,x9,x17
+
+	ret
+.size	__mul_by_1_mont_256,.-__mul_by_1_mont_256
diff --git a/blst/elf/mul_mont_384-armv8.S b/blst/elf/mul_mont_384-armv8.S
new file mode 100644
index 0000000..c048e81
--- /dev/null
+++ b/blst/elf/mul_mont_384-armv8.S
@@ -0,0 +1,2372 @@
+.text
+
+.globl	add_mod_384x384
+.type	add_mod_384x384,%function
+.align	5
+add_mod_384x384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	bl	__add_mod_384x384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	.inst	0xd50323bf
+	ret
+.size	add_mod_384x384,.-add_mod_384x384
+
+.type	__add_mod_384x384,%function
+.align	5
+__add_mod_384x384:
+	ldp	x11,  x12,  [x1]
+	ldp	x19,x20,[x2]
+	ldp	x13,  x14,  [x1,#16]
+	adds	x11,x11,x19
+	ldp	x21,x22,[x2,#16]
+	adcs	x12,x12,x20
+	ldp	x15,  x16,  [x1,#32]
+	adcs	x13,x13,x21
+	ldp	x23,x24,[x2,#32]
+	adcs	x14,x14,x22
+	stp	x11,  x12,  [x0]
+	adcs	x15,x15,x23
+	ldp	x11,  x12,  [x1,#48]
+	adcs	x16,x16,x24
+
+	ldp	x19,x20,[x2,#48]
+	stp	x13,  x14,  [x0,#16]
+	ldp	x13,  x14,  [x1,#64]
+	ldp	x21,x22,[x2,#64]
+
+	adcs	x11,x11,x19
+	stp	x15,  x16,  [x0,#32]
+	adcs	x12,x12,x20
+	ldp	x15,  x16,  [x1,#80]
+	adcs	x13,x13,x21
+	ldp	x23,x24,[x2,#80]
+	adcs	x14,x14,x22
+	adcs	x15,x15,x23
+	adcs	x16,x16,x24
+	adc	x17,xzr,xzr
+
+	subs	x19,x11,x5
+	sbcs	x20,x12,x6
+	sbcs	x21,x13,x7
+	sbcs	x22,x14,x8
+	sbcs	x23,x15,x9
+	sbcs	x24,x16,x10
+	sbcs	xzr,x17,xzr
+
+	csel	x11,x11,x19,lo
+	csel	x12,x12,x20,lo
+	csel	x13,x13,x21,lo
+	csel	x14,x14,x22,lo
+	stp	x11,x12,[x0,#48]
+	csel	x15,x15,x23,lo
+	stp	x13,x14,[x0,#64]
+	csel	x16,x16,x24,lo
+	stp	x15,x16,[x0,#80]
+
+	ret
+.size	__add_mod_384x384,.-__add_mod_384x384
+
+.globl	sub_mod_384x384
+.type	sub_mod_384x384,%function
+.align	5
+sub_mod_384x384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	bl	__sub_mod_384x384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	.inst	0xd50323bf
+	ret
+.size	sub_mod_384x384,.-sub_mod_384x384
+
+.type	__sub_mod_384x384,%function
+.align	5
+__sub_mod_384x384:
+	ldp	x11,  x12,  [x1]
+	ldp	x19,x20,[x2]
+	ldp	x13,  x14,  [x1,#16]
+	subs	x11,x11,x19
+	ldp	x21,x22,[x2,#16]
+	sbcs	x12,x12,x20
+	ldp	x15,  x16,  [x1,#32]
+	sbcs	x13,x13,x21
+	ldp	x23,x24,[x2,#32]
+	sbcs	x14,x14,x22
+	stp	x11,  x12,  [x0]
+	sbcs	x15,x15,x23
+	ldp	x11,  x12,  [x1,#48]
+	sbcs	x16,x16,x24
+
+	ldp	x19,x20,[x2,#48]
+	stp	x13,  x14,  [x0,#16]
+	ldp	x13,  x14,  [x1,#64]
+	ldp	x21,x22,[x2,#64]
+
+	sbcs	x11,x11,x19
+	stp	x15,  x16,  [x0,#32]
+	sbcs	x12,x12,x20
+	ldp	x15,  x16,  [x1,#80]
+	sbcs	x13,x13,x21
+	ldp	x23,x24,[x2,#80]
+	sbcs	x14,x14,x22
+	sbcs	x15,x15,x23
+	sbcs	x16,x16,x24
+	sbc	x17,xzr,xzr
+
+	and	x19,x5,x17
+	and	x20,x6,x17
+	adds	x11,x11,x19
+	and	x21,x7,x17
+	adcs	x12,x12,x20
+	and	x22,x8,x17
+	adcs	x13,x13,x21
+	and	x23,x9,x17
+	adcs	x14,x14,x22
+	and	x24,x10,x17
+	adcs	x15,x15,x23
+	stp	x11,x12,[x0,#48]
+	adc	x16,x16,x24
+	stp	x13,x14,[x0,#64]
+	stp	x15,x16,[x0,#80]
+
+	ret
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.type	__add_mod_384,%function
+.align	5
+__add_mod_384:
+	ldp	x11,  x12,  [x1]
+	ldp	x19,x20,[x2]
+	ldp	x13,  x14,  [x1,#16]
+	adds	x11,x11,x19
+	ldp	x21,x22,[x2,#16]
+	adcs	x12,x12,x20
+	ldp	x15,  x16,  [x1,#32]
+	adcs	x13,x13,x21
+	ldp	x23,x24,[x2,#32]
+	adcs	x14,x14,x22
+	adcs	x15,x15,x23
+	adcs	x16,x16,x24
+	adc	x17,xzr,xzr
+
+	subs	x19,x11,x5
+	sbcs	x20,x12,x6
+	sbcs	x21,x13,x7
+	sbcs	x22,x14,x8
+	sbcs	x23,x15,x9
+	sbcs	x24,x16,x10
+	sbcs	xzr,x17,xzr
+
+	csel	x11,x11,x19,lo
+	csel	x12,x12,x20,lo
+	csel	x13,x13,x21,lo
+	csel	x14,x14,x22,lo
+	csel	x15,x15,x23,lo
+	stp	x11,x12,[x0]
+	csel	x16,x16,x24,lo
+	stp	x13,x14,[x0,#16]
+	stp	x15,x16,[x0,#32]
+
+	ret
+.size	__add_mod_384,.-__add_mod_384
+
+.type	__sub_mod_384,%function
+.align	5
+__sub_mod_384:
+	ldp	x11,  x12,  [x1]
+	ldp	x19,x20,[x2]
+	ldp	x13,  x14,  [x1,#16]
+	subs	x11,x11,x19
+	ldp	x21,x22,[x2,#16]
+	sbcs	x12,x12,x20
+	ldp	x15,  x16,  [x1,#32]
+	sbcs	x13,x13,x21
+	ldp	x23,x24,[x2,#32]
+	sbcs	x14,x14,x22
+	sbcs	x15,x15,x23
+	sbcs	x16,x16,x24
+	sbc	x17,xzr,xzr
+
+	and	x19,x5,x17
+	and	x20,x6,x17
+	adds	x11,x11,x19
+	and	x21,x7,x17
+	adcs	x12,x12,x20
+	and	x22,x8,x17
+	adcs	x13,x13,x21
+	and	x23,x9,x17
+	adcs	x14,x14,x22
+	and	x24,x10,x17
+	adcs	x15,x15,x23
+	stp	x11,x12,[x0]
+	adc	x16,x16,x24
+	stp	x13,x14,[x0,#16]
+	stp	x15,x16,[x0,#32]
+
+	ret
+.size	__sub_mod_384,.-__sub_mod_384
+
+.globl	mul_mont_384x
+.hidden	mul_mont_384x
+.type	mul_mont_384x,%function
+.align	5
+mul_mont_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#288		// space for 3 768-bit vectors
+
+	mov	x26,x0		// save r_ptr
+	mov	x27,x1		// save b_ptr
+	mov	x28,x2		// save b_ptr
+
+	sub	x0,sp,#0		// mul_384(t0, a->re, b->re)
+	bl	__mul_384
+
+	add	x1,x1,#48	// mul_384(t1, a->im, b->im)
+	add	x2,x2,#48
+	add	x0,sp,#96
+	bl	__mul_384
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	sub	x2,x1,#48
+	add	x0,sp,#240
+	bl	__add_mod_384
+
+	add	x1,x28,#0
+	add	x2,x28,#48
+	add	x0,sp,#192		// t2
+	bl	__add_mod_384
+
+	add	x1,x0,#0
+	add	x2,x0,#48
+	bl	__mul_384		// mul_384(t2, a->re+a->im, b->re+b->im)
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	mov	x1,x0
+	add	x2,sp,#0
+	bl	__sub_mod_384x384
+
+	add	x2,sp,#96
+	bl	__sub_mod_384x384	// t2 = t2-t0-t1
+
+	add	x1,sp,#0
+	add	x2,sp,#96
+	add	x0,sp,#0
+	bl	__sub_mod_384x384	// t0 = t0-t1
+
+	add	x1,sp,#0		// ret->re = redc(t0)
+	add	x0,x26,#0
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+
+	add	x1,sp,#192		// ret->im = redc(t2)
+	add	x0,x0,#48
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+	ldr	x30,[x29,#8]
+
+	add	sp,sp,#288
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	mul_mont_384x,.-mul_mont_384x
+
+.globl	sqr_mont_384x
+.hidden	sqr_mont_384x
+.type	sqr_mont_384x,%function
+.align	5
+sqr_mont_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x3,x0,[sp,#96]	// __mul_mont_384 wants them there
+	sub	sp,sp,#96		// space for 2 384-bit vectors
+	mov	x4,x3		// adjust for missing b_ptr
+
+	ldp	x5,x6,[x2]
+	ldp	x7,x8,[x2,#16]
+	ldp	x9,x10,[x2,#32]
+
+	add	x2,x1,#48
+	add	x0,sp,#0
+	bl	__add_mod_384		// t0 = a->re + a->im
+
+	add	x0,sp,#48
+	bl	__sub_mod_384		// t1 = a->re - a->im
+
+	ldp	x11,x12,[x1]
+	ldr	x17,        [x2]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+
+	bl	__mul_mont_384		// mul_mont_384(ret->im, a->re, a->im)
+
+	adds	x11,x11,x11	// add with itself
+	adcs	x12,x12,x12
+	adcs	x13,x13,x13
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adcs	x16,x16,x16
+	adc	x25,xzr,xzr
+
+	subs	x19,x11,x5
+	sbcs	x20,x12,x6
+	sbcs	x21,x13,x7
+	sbcs	x22,x14,x8
+	sbcs	x23,x15,x9
+	sbcs	x24,x16,x10
+	sbcs	xzr,x25,xzr
+
+	csel	x19,x11,x19,lo
+	csel	x20,x12,x20,lo
+	csel	x21,x13,x21,lo
+	ldp	x11,x12,[sp]
+	csel	x22,x14,x22,lo
+	ldr	x17,        [sp,#48]
+	csel	x23,x15,x23,lo
+	ldp	x13,x14,[sp,#16]
+	csel	x24,x16,x24,lo
+	ldp	x15,x16,[sp,#32]
+
+	stp	x19,x20,[x2,#48]
+	stp	x21,x22,[x2,#64]
+	stp	x23,x24,[x2,#80]
+
+	add	x2,sp,#48
+	bl	__mul_mont_384		// mul_mont_384(ret->re, t0, t1)
+	ldr	x30,[x29,#8]
+
+	stp	x11,x12,[x2]
+	stp	x13,x14,[x2,#16]
+	stp	x15,x16,[x2,#32]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sqr_mont_384x,.-sqr_mont_384x
+
+.globl	mul_mont_384
+.hidden	mul_mont_384
+.type	mul_mont_384,%function
+.align	5
+mul_mont_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x4,x0,[sp,#96]	// __mul_mont_384 wants them there
+
+	ldp	x11,x12,[x1]
+	ldr	x17,        [x2]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	bl	__mul_mont_384
+	ldr	x30,[x29,#8]
+
+	stp	x11,x12,[x2]
+	stp	x13,x14,[x2,#16]
+	stp	x15,x16,[x2,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	mul_mont_384,.-mul_mont_384
+
+.type	__mul_mont_384,%function
+.align	5
+__mul_mont_384:
+	mul	x19,x11,x17
+	mul	x20,x12,x17
+	mul	x21,x13,x17
+	mul	x22,x14,x17
+	mul	x23,x15,x17
+	mul	x24,x16,x17
+	mul	x4,x4,x19
+
+	umulh	x26,x11,x17
+	umulh	x27,x12,x17
+	umulh	x28,x13,x17
+	umulh	x0,x14,x17
+	umulh	x1,x15,x17
+	umulh	x3,x16,x17
+
+	adds	x20,x20,x26
+	// mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,xzr,    x3
+	mul	x3,x10,x4
+	mov	x17,xzr
+	subs	xzr,x19,#1		// adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adcs	x25,x25,xzr
+	adc	x4,x17,xzr
+	ldr	x17,[x2,8*1]
+
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,x4,xzr
+	ldr	x4,[x29,#96]
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adcs	x25,x25,xzr
+	adc	x17,xzr,xzr
+
+	adds	x20,x20,x26
+	// mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adcs	x25,x25,x3
+	mul	x3,x10,x4
+	adc	x17,x17,xzr
+	subs	xzr,x19,#1		// adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adcs	x25,x25,xzr
+	adc	x4,x17,xzr
+	ldr	x17,[x2,8*2]
+
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,x4,xzr
+	ldr	x4,[x29,#96]
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adcs	x25,x25,xzr
+	adc	x17,xzr,xzr
+
+	adds	x20,x20,x26
+	// mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adcs	x25,x25,x3
+	mul	x3,x10,x4
+	adc	x17,x17,xzr
+	subs	xzr,x19,#1		// adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adcs	x25,x25,xzr
+	adc	x4,x17,xzr
+	ldr	x17,[x2,8*3]
+
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,x4,xzr
+	ldr	x4,[x29,#96]
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adcs	x25,x25,xzr
+	adc	x17,xzr,xzr
+
+	adds	x20,x20,x26
+	// mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adcs	x25,x25,x3
+	mul	x3,x10,x4
+	adc	x17,x17,xzr
+	subs	xzr,x19,#1		// adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adcs	x25,x25,xzr
+	adc	x4,x17,xzr
+	ldr	x17,[x2,8*4]
+
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,x4,xzr
+	ldr	x4,[x29,#96]
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adcs	x25,x25,xzr
+	adc	x17,xzr,xzr
+
+	adds	x20,x20,x26
+	// mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adcs	x25,x25,x3
+	mul	x3,x10,x4
+	adc	x17,x17,xzr
+	subs	xzr,x19,#1		// adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adcs	x25,x25,xzr
+	adc	x4,x17,xzr
+	ldr	x17,[x2,8*5]
+
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,x4,xzr
+	ldr	x4,[x29,#96]
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adcs	x25,x25,xzr
+	adc	x17,xzr,xzr
+
+	adds	x20,x20,x26
+	// mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adcs	x25,x25,x3
+	mul	x3,x10,x4
+	adc	x17,x17,xzr
+	subs	xzr,x19,#1		// adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adcs	x25,x25,xzr
+	ldp	x4,x2,[x29,#96]	// pull r_ptr
+	adc	x17,x17,xzr
+
+	adds	x19,x20,x26
+	adcs	x20,x21,x27
+	adcs	x21,x22,x28
+	adcs	x22,x23,x0
+	adcs	x23,x24,x1
+	adcs	x24,x25,x3
+	adc	x25,x17,xzr
+
+	subs	x26,x19,x5
+	sbcs	x27,x20,x6
+	sbcs	x28,x21,x7
+	sbcs	x0,x22,x8
+	sbcs	x1,x23,x9
+	sbcs	x3,x24,x10
+	sbcs	xzr,    x25,xzr
+
+	csel	x11,x19,x26,lo
+	csel	x12,x20,x27,lo
+	csel	x13,x21,x28,lo
+	csel	x14,x22,x0,lo
+	csel	x15,x23,x1,lo
+	csel	x16,x24,x3,lo
+	ret
+.size	__mul_mont_384,.-__mul_mont_384
+
+.globl	sqr_mont_384
+.hidden	sqr_mont_384
+.type	sqr_mont_384,%function
+.align	5
+sqr_mont_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#96		// space for 768-bit vector
+	mov	x4,x3		// adjust for missing b_ptr
+
+	mov	x3,x0		// save r_ptr
+	mov	x0,sp
+
+	ldp	x11,x12,[x1]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+
+	bl	__sqr_384
+
+	ldp	x5,x6,[x2]
+	ldp	x7,x8,[x2,#16]
+	ldp	x9,x10,[x2,#32]
+
+	mov	x1,sp
+	mov	x0,x3		// restore r_ptr
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+	ldr	x30,[x29,#8]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sqr_mont_384,.-sqr_mont_384
+
+.globl	sqr_n_mul_mont_383
+.hidden	sqr_n_mul_mont_383
+.type	sqr_n_mul_mont_383,%function
+.align	5
+sqr_n_mul_mont_383:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x4,x0,[sp,#96]	// __mul_mont_384 wants them there
+	sub	sp,sp,#96		// space for 768-bit vector
+	mov	x17,x5			// save b_ptr
+
+	ldp	x11,x12,[x1]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+	mov	x0,sp
+.Loop_sqr_383:
+	bl	__sqr_384
+	sub	x2,x2,#1	// counter
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	mov	x1,sp
+	bl	__mul_by_1_mont_384
+
+	ldp	x19,x20,[x1,#48]
+	ldp	x21,x22,[x1,#64]
+	ldp	x23,x24,[x1,#80]
+
+	adds	x11,x11,x19	// just accumulate upper half
+	adcs	x12,x12,x20
+	adcs	x13,x13,x21
+	adcs	x14,x14,x22
+	adcs	x15,x15,x23
+	adc	x16,x16,x24
+
+	cbnz	x2,.Loop_sqr_383
+
+	mov	x2,x17
+	ldr	x17,[x17]
+	bl	__mul_mont_384
+	ldr	x30,[x29,#8]
+
+	stp	x11,x12,[x2]
+	stp	x13,x14,[x2,#16]
+	stp	x15,x16,[x2,#32]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sqr_n_mul_mont_383,.-sqr_n_mul_mont_383
+.type	__sqr_384,%function
+.align	5
+__sqr_384:
+	mul	x19,x12,x11
+	mul	x20,x13,x11
+	mul	x21,x14,x11
+	mul	x22,x15,x11
+	mul	x23,x16,x11
+
+	umulh	x6,x12,x11
+	umulh	x7,x13,x11
+	umulh	x8,x14,x11
+	umulh	x9,x15,x11
+	adds	x20,x20,x6
+	umulh	x10,x16,x11
+	adcs	x21,x21,x7
+	mul	x7,x13,x12
+	adcs	x22,x22,x8
+	mul	x8,x14,x12
+	adcs	x23,x23,x9
+	mul	x9,x15,x12
+	adc	x24,xzr,    x10
+	mul	x10,x16,x12
+
+	adds	x21,x21,x7
+	umulh	x7,x13,x12
+	adcs	x22,x22,x8
+	umulh	x8,x14,x12
+	adcs	x23,x23,x9
+	umulh	x9,x15,x12
+	adcs	x24,x24,x10
+	umulh	x10,x16,x12
+	adc	x25,xzr,xzr
+
+	mul	x5,x11,x11
+	adds	x22,x22,x7
+	umulh	x11,  x11,x11
+	adcs	x23,x23,x8
+	mul	x8,x14,x13
+	adcs	x24,x24,x9
+	mul	x9,x15,x13
+	adc	x25,x25,x10
+	mul	x10,x16,x13
+
+	adds	x23,x23,x8
+	umulh	x8,x14,x13
+	adcs	x24,x24,x9
+	umulh	x9,x15,x13
+	adcs	x25,x25,x10
+	umulh	x10,x16,x13
+	adc	x26,xzr,xzr
+
+	mul	x6,x12,x12
+	adds	x24,x24,x8
+	umulh	x12,  x12,x12
+	adcs	x25,x25,x9
+	mul	x9,x15,x14
+	adc	x26,x26,x10
+	mul	x10,x16,x14
+
+	adds	x25,x25,x9
+	umulh	x9,x15,x14
+	adcs	x26,x26,x10
+	umulh	x10,x16,x14
+	adc	x27,xzr,xzr
+	mul	x7,x13,x13
+	adds	x26,x26,x9
+	umulh	x13,  x13,x13
+	adc	x27,x27,x10
+	mul	x8,x14,x14
+
+	mul	x10,x16,x15
+	umulh	x14,  x14,x14
+	adds	x27,x27,x10
+	umulh	x10,x16,x15
+	mul	x9,x15,x15
+	adc	x28,x10,xzr
+
+	adds	x19,x19,x19
+	adcs	x20,x20,x20
+	adcs	x21,x21,x21
+	adcs	x22,x22,x22
+	adcs	x23,x23,x23
+	adcs	x24,x24,x24
+	adcs	x25,x25,x25
+	adcs	x26,x26,x26
+	umulh	x15,  x15,x15
+	adcs	x27,x27,x27
+	mul	x10,x16,x16
+	adcs	x28,x28,x28
+	umulh	x16,  x16,x16
+	adc	x1,xzr,xzr
+
+	adds	x19,x19,x11
+	adcs	x20,x20,x6
+	adcs	x21,x21,x12
+	adcs	x22,x22,x7
+	adcs	x23,x23,x13
+	adcs	x24,x24,x8
+	adcs	x25,x25,x14
+	stp	x5,x19,[x0]
+	adcs	x26,x26,x9
+	stp	x20,x21,[x0,#16]
+	adcs	x27,x27,x15
+	stp	x22,x23,[x0,#32]
+	adcs	x28,x28,x10
+	stp	x24,x25,[x0,#48]
+	adc	x16,x16,x1
+	stp	x26,x27,[x0,#64]
+	stp	x28,x16,[x0,#80]
+
+	ret
+.size	__sqr_384,.-__sqr_384
+.globl	sqr_384
+.hidden	sqr_384
+.type	sqr_384,%function
+.align	5
+sqr_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	x11,x12,[x1]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+
+	bl	__sqr_384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sqr_384,.-sqr_384
+
+.globl	redc_mont_384
+.hidden	redc_mont_384
+.type	redc_mont_384,%function
+.align	5
+redc_mont_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	mov	x4,x3		// adjust for missing b_ptr
+
+	ldp	x5,x6,[x2]
+	ldp	x7,x8,[x2,#16]
+	ldp	x9,x10,[x2,#32]
+
+	bl	__mul_by_1_mont_384
+	bl	__redc_tail_mont_384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	redc_mont_384,.-redc_mont_384
+
+.globl	from_mont_384
+.hidden	from_mont_384
+.type	from_mont_384,%function
+.align	5
+from_mont_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	mov	x4,x3		// adjust for missing b_ptr
+
+	ldp	x5,x6,[x2]
+	ldp	x7,x8,[x2,#16]
+	ldp	x9,x10,[x2,#32]
+
+	bl	__mul_by_1_mont_384
+	ldr	x30,[x29,#8]
+
+	subs	x19,x11,x5
+	sbcs	x20,x12,x6
+	sbcs	x21,x13,x7
+	sbcs	x22,x14,x8
+	sbcs	x23,x15,x9
+	sbcs	x24,x16,x10
+
+	csel	x11,x11,x19,lo
+	csel	x12,x12,x20,lo
+	csel	x13,x13,x21,lo
+	csel	x14,x14,x22,lo
+	csel	x15,x15,x23,lo
+	csel	x16,x16,x24,lo
+
+	stp	x11,x12,[x0]
+	stp	x13,x14,[x0,#16]
+	stp	x15,x16,[x0,#32]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	from_mont_384,.-from_mont_384
+
+.type	__mul_by_1_mont_384,%function
+.align	5
+__mul_by_1_mont_384:
+	ldp	x11,x12,[x1]
+	ldp	x13,x14,[x1,#16]
+	mul	x26,x4,x11
+	ldp	x15,x16,[x1,#32]
+
+	// mul	x19,x5,x26
+	mul	x20,x6,x26
+	mul	x21,x7,x26
+	mul	x22,x8,x26
+	mul	x23,x9,x26
+	mul	x24,x10,x26
+	subs	xzr,x11,#1		// adds	x19,x19,x11
+	umulh	x11,x5,x26
+	adcs	x20,x20,x12
+	umulh	x12,x6,x26
+	adcs	x21,x21,x13
+	umulh	x13,x7,x26
+	adcs	x22,x22,x14
+	umulh	x14,x8,x26
+	adcs	x23,x23,x15
+	umulh	x15,x9,x26
+	adcs	x24,x24,x16
+	umulh	x16,x10,x26
+	adc	x25,xzr,xzr
+	adds	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	mul	x26,x4,x11
+	adcs	x14,x14,x23
+	adcs	x15,x15,x24
+	adc	x16,x16,x25
+
+	// mul	x19,x5,x26
+	mul	x20,x6,x26
+	mul	x21,x7,x26
+	mul	x22,x8,x26
+	mul	x23,x9,x26
+	mul	x24,x10,x26
+	subs	xzr,x11,#1		// adds	x19,x19,x11
+	umulh	x11,x5,x26
+	adcs	x20,x20,x12
+	umulh	x12,x6,x26
+	adcs	x21,x21,x13
+	umulh	x13,x7,x26
+	adcs	x22,x22,x14
+	umulh	x14,x8,x26
+	adcs	x23,x23,x15
+	umulh	x15,x9,x26
+	adcs	x24,x24,x16
+	umulh	x16,x10,x26
+	adc	x25,xzr,xzr
+	adds	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	mul	x26,x4,x11
+	adcs	x14,x14,x23
+	adcs	x15,x15,x24
+	adc	x16,x16,x25
+
+	// mul	x19,x5,x26
+	mul	x20,x6,x26
+	mul	x21,x7,x26
+	mul	x22,x8,x26
+	mul	x23,x9,x26
+	mul	x24,x10,x26
+	subs	xzr,x11,#1		// adds	x19,x19,x11
+	umulh	x11,x5,x26
+	adcs	x20,x20,x12
+	umulh	x12,x6,x26
+	adcs	x21,x21,x13
+	umulh	x13,x7,x26
+	adcs	x22,x22,x14
+	umulh	x14,x8,x26
+	adcs	x23,x23,x15
+	umulh	x15,x9,x26
+	adcs	x24,x24,x16
+	umulh	x16,x10,x26
+	adc	x25,xzr,xzr
+	adds	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	mul	x26,x4,x11
+	adcs	x14,x14,x23
+	adcs	x15,x15,x24
+	adc	x16,x16,x25
+
+	// mul	x19,x5,x26
+	mul	x20,x6,x26
+	mul	x21,x7,x26
+	mul	x22,x8,x26
+	mul	x23,x9,x26
+	mul	x24,x10,x26
+	subs	xzr,x11,#1		// adds	x19,x19,x11
+	umulh	x11,x5,x26
+	adcs	x20,x20,x12
+	umulh	x12,x6,x26
+	adcs	x21,x21,x13
+	umulh	x13,x7,x26
+	adcs	x22,x22,x14
+	umulh	x14,x8,x26
+	adcs	x23,x23,x15
+	umulh	x15,x9,x26
+	adcs	x24,x24,x16
+	umulh	x16,x10,x26
+	adc	x25,xzr,xzr
+	adds	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	mul	x26,x4,x11
+	adcs	x14,x14,x23
+	adcs	x15,x15,x24
+	adc	x16,x16,x25
+
+	// mul	x19,x5,x26
+	mul	x20,x6,x26
+	mul	x21,x7,x26
+	mul	x22,x8,x26
+	mul	x23,x9,x26
+	mul	x24,x10,x26
+	subs	xzr,x11,#1		// adds	x19,x19,x11
+	umulh	x11,x5,x26
+	adcs	x20,x20,x12
+	umulh	x12,x6,x26
+	adcs	x21,x21,x13
+	umulh	x13,x7,x26
+	adcs	x22,x22,x14
+	umulh	x14,x8,x26
+	adcs	x23,x23,x15
+	umulh	x15,x9,x26
+	adcs	x24,x24,x16
+	umulh	x16,x10,x26
+	adc	x25,xzr,xzr
+	adds	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	mul	x26,x4,x11
+	adcs	x14,x14,x23
+	adcs	x15,x15,x24
+	adc	x16,x16,x25
+
+	// mul	x19,x5,x26
+	mul	x20,x6,x26
+	mul	x21,x7,x26
+	mul	x22,x8,x26
+	mul	x23,x9,x26
+	mul	x24,x10,x26
+	subs	xzr,x11,#1		// adds	x19,x19,x11
+	umulh	x11,x5,x26
+	adcs	x20,x20,x12
+	umulh	x12,x6,x26
+	adcs	x21,x21,x13
+	umulh	x13,x7,x26
+	adcs	x22,x22,x14
+	umulh	x14,x8,x26
+	adcs	x23,x23,x15
+	umulh	x15,x9,x26
+	adcs	x24,x24,x16
+	umulh	x16,x10,x26
+	adc	x25,xzr,xzr
+	adds	x11,x11,x20
+	adcs	x12,x12,x21
+	adcs	x13,x13,x22
+	adcs	x14,x14,x23
+	adcs	x15,x15,x24
+	adc	x16,x16,x25
+
+	ret
+.size	__mul_by_1_mont_384,.-__mul_by_1_mont_384
+
+.type	__redc_tail_mont_384,%function
+.align	5
+__redc_tail_mont_384:
+	ldp	x19,x20,[x1,#48]
+	ldp	x21,x22,[x1,#64]
+	ldp	x23,x24,[x1,#80]
+
+	adds	x11,x11,x19	// accumulate upper half
+	adcs	x12,x12,x20
+	adcs	x13,x13,x21
+	adcs	x14,x14,x22
+	adcs	x15,x15,x23
+	adcs	x16,x16,x24
+	adc	x25,xzr,xzr
+
+	subs	x19,x11,x5
+	sbcs	x20,x12,x6
+	sbcs	x21,x13,x7
+	sbcs	x22,x14,x8
+	sbcs	x23,x15,x9
+	sbcs	x24,x16,x10
+	sbcs	xzr,x25,xzr
+
+	csel	x11,x11,x19,lo
+	csel	x12,x12,x20,lo
+	csel	x13,x13,x21,lo
+	csel	x14,x14,x22,lo
+	csel	x15,x15,x23,lo
+	csel	x16,x16,x24,lo
+
+	stp	x11,x12,[x0]
+	stp	x13,x14,[x0,#16]
+	stp	x15,x16,[x0,#32]
+
+	ret
+.size	__redc_tail_mont_384,.-__redc_tail_mont_384
+
+.globl	mul_384
+.hidden	mul_384
+.type	mul_384,%function
+.align	5
+mul_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	bl	__mul_384
+	ldr	x30,[x29,#8]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	mul_384,.-mul_384
+
+.type	__mul_384,%function
+.align	5
+__mul_384:
+	ldp	x11,x12,[x1]
+	ldr	x17,        [x2]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+
+	mul	x19,x11,x17
+	mul	x20,x12,x17
+	mul	x21,x13,x17
+	mul	x22,x14,x17
+	mul	x23,x15,x17
+	mul	x24,x16,x17
+
+	umulh	x5,x11,x17
+	umulh	x6,x12,x17
+	umulh	x7,x13,x17
+	umulh	x8,x14,x17
+	umulh	x9,x15,x17
+	umulh	x10,x16,x17
+	ldr	x17,[x2,8*1]
+
+	str	x19,[x0]
+	adds	x19,x20,x5
+	mul	x5,x11,x17
+	adcs	x20,x21,x6
+	mul	x6,x12,x17
+	adcs	x21,x22,x7
+	mul	x7,x13,x17
+	adcs	x22,x23,x8
+	mul	x8,x14,x17
+	adcs	x23,x24,x9
+	mul	x9,x15,x17
+	adc	x24,xzr,    x10
+	mul	x10,x16,x17
+	adds	x19,x19,x5
+	umulh	x5,x11,x17
+	adcs	x20,x20,x6
+	umulh	x6,x12,x17
+	adcs	x21,x21,x7
+	umulh	x7,x13,x17
+	adcs	x22,x22,x8
+	umulh	x8,x14,x17
+	adcs	x23,x23,x9
+	umulh	x9,x15,x17
+	adcs	x24,x24,x10
+	umulh	x10,x16,x17
+	ldr	x17,[x2,#8*(1+1)]
+	adc	x25,xzr,xzr
+
+	str	x19,[x0,8*1]
+	adds	x19,x20,x5
+	mul	x5,x11,x17
+	adcs	x20,x21,x6
+	mul	x6,x12,x17
+	adcs	x21,x22,x7
+	mul	x7,x13,x17
+	adcs	x22,x23,x8
+	mul	x8,x14,x17
+	adcs	x23,x24,x9
+	mul	x9,x15,x17
+	adc	x24,x25,x10
+	mul	x10,x16,x17
+	adds	x19,x19,x5
+	umulh	x5,x11,x17
+	adcs	x20,x20,x6
+	umulh	x6,x12,x17
+	adcs	x21,x21,x7
+	umulh	x7,x13,x17
+	adcs	x22,x22,x8
+	umulh	x8,x14,x17
+	adcs	x23,x23,x9
+	umulh	x9,x15,x17
+	adcs	x24,x24,x10
+	umulh	x10,x16,x17
+	ldr	x17,[x2,#8*(2+1)]
+	adc	x25,xzr,xzr
+
+	str	x19,[x0,8*2]
+	adds	x19,x20,x5
+	mul	x5,x11,x17
+	adcs	x20,x21,x6
+	mul	x6,x12,x17
+	adcs	x21,x22,x7
+	mul	x7,x13,x17
+	adcs	x22,x23,x8
+	mul	x8,x14,x17
+	adcs	x23,x24,x9
+	mul	x9,x15,x17
+	adc	x24,x25,x10
+	mul	x10,x16,x17
+	adds	x19,x19,x5
+	umulh	x5,x11,x17
+	adcs	x20,x20,x6
+	umulh	x6,x12,x17
+	adcs	x21,x21,x7
+	umulh	x7,x13,x17
+	adcs	x22,x22,x8
+	umulh	x8,x14,x17
+	adcs	x23,x23,x9
+	umulh	x9,x15,x17
+	adcs	x24,x24,x10
+	umulh	x10,x16,x17
+	ldr	x17,[x2,#8*(3+1)]
+	adc	x25,xzr,xzr
+
+	str	x19,[x0,8*3]
+	adds	x19,x20,x5
+	mul	x5,x11,x17
+	adcs	x20,x21,x6
+	mul	x6,x12,x17
+	adcs	x21,x22,x7
+	mul	x7,x13,x17
+	adcs	x22,x23,x8
+	mul	x8,x14,x17
+	adcs	x23,x24,x9
+	mul	x9,x15,x17
+	adc	x24,x25,x10
+	mul	x10,x16,x17
+	adds	x19,x19,x5
+	umulh	x5,x11,x17
+	adcs	x20,x20,x6
+	umulh	x6,x12,x17
+	adcs	x21,x21,x7
+	umulh	x7,x13,x17
+	adcs	x22,x22,x8
+	umulh	x8,x14,x17
+	adcs	x23,x23,x9
+	umulh	x9,x15,x17
+	adcs	x24,x24,x10
+	umulh	x10,x16,x17
+	ldr	x17,[x2,#8*(4+1)]
+	adc	x25,xzr,xzr
+
+	str	x19,[x0,8*4]
+	adds	x19,x20,x5
+	mul	x5,x11,x17
+	adcs	x20,x21,x6
+	mul	x6,x12,x17
+	adcs	x21,x22,x7
+	mul	x7,x13,x17
+	adcs	x22,x23,x8
+	mul	x8,x14,x17
+	adcs	x23,x24,x9
+	mul	x9,x15,x17
+	adc	x24,x25,x10
+	mul	x10,x16,x17
+	adds	x19,x19,x5
+	umulh	x5,x11,x17
+	adcs	x20,x20,x6
+	umulh	x6,x12,x17
+	adcs	x21,x21,x7
+	umulh	x7,x13,x17
+	adcs	x22,x22,x8
+	umulh	x8,x14,x17
+	adcs	x23,x23,x9
+	umulh	x9,x15,x17
+	adcs	x24,x24,x10
+	umulh	x10,x16,x17
+	adc	x25,xzr,xzr
+
+	str	x19,[x0,8*5]
+	adds	x19,x20,x5
+	adcs	x20,x21,x6
+	adcs	x21,x22,x7
+	adcs	x22,x23,x8
+	adcs	x23,x24,x9
+	adc	x24,x25,x10
+
+	stp	x19,x20,[x0,#48]
+	stp	x21,x22,[x0,#64]
+	stp	x23,x24,[x0,#80]
+
+	ret
+.size	__mul_384,.-__mul_384
+
+.globl	mul_382x
+.hidden	mul_382x
+.type	mul_382x,%function
+.align	5
+mul_382x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#96		// space for two 384-bit vectors
+
+	ldp	x11,x12,[x1]
+	mov	x26,x0		// save r_ptr
+	ldp	x19,x20,[x1,#48]
+	mov	x27,x1		// save a_ptr
+	ldp	x13,x14,[x1,#16]
+	mov	x28,x2		// save b_ptr
+	ldp	x21,x22,[x1,#64]
+	ldp	x15,x16,[x1,#32]
+	adds	x5,x11,x19	// t0 = a->re + a->im
+	ldp	x23,x24,[x1,#80]
+	adcs	x6,x12,x20
+	ldp	x11,x12,[x2]
+	adcs	x7,x13,x21
+	ldp	x19,x20,[x2,#48]
+	adcs	x8,x14,x22
+	ldp	x13,x14,[x2,#16]
+	adcs	x9,x15,x23
+	ldp	x21,x22,[x2,#64]
+	adc	x10,x16,x24
+	ldp	x15,x16,[x2,#32]
+
+	stp	x5,x6,[sp]
+	adds	x5,x11,x19	// t1 = b->re + b->im
+	ldp	x23,x24,[x2,#80]
+	adcs	x6,x12,x20
+	stp	x7,x8,[sp,#16]
+	adcs	x7,x13,x21
+	adcs	x8,x14,x22
+	stp	x9,x10,[sp,#32]
+	adcs	x9,x15,x23
+	stp	x5,x6,[sp,#48]
+	adc	x10,x16,x24
+	stp	x7,x8,[sp,#64]
+	stp	x9,x10,[sp,#80]
+
+	bl	__mul_384		// mul_384(ret->re, a->re, b->re)
+
+	add	x1,sp,#0		// mul_384(ret->im, t0, t1)
+	add	x2,sp,#48
+	add	x0,x26,#96
+	bl	__mul_384
+
+	add	x1,x27,#48	// mul_384(tx, a->im, b->im)
+	add	x2,x28,#48
+	add	x0,sp,#0
+	bl	__mul_384
+
+	ldp	x5,x6,[x3]
+	ldp	x7,x8,[x3,#16]
+	ldp	x9,x10,[x3,#32]
+
+	add	x1,x26,#96	// ret->im -= tx
+	add	x2,sp,#0
+	add	x0,x26,#96
+	bl	__sub_mod_384x384
+
+	add	x2,x26,#0	// ret->im -= ret->re
+	bl	__sub_mod_384x384
+
+	add	x1,x26,#0	// ret->re -= tx
+	add	x2,sp,#0
+	add	x0,x26,#0
+	bl	__sub_mod_384x384
+	ldr	x30,[x29,#8]
+
+	add	sp,sp,#96
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	mul_382x,.-mul_382x
+
+.globl	sqr_382x
+.hidden	sqr_382x
+.type	sqr_382x,%function
+.align	5
+sqr_382x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	x11,x12,[x1]
+	ldp	x19,x20,[x1,#48]
+	ldp	x13,x14,[x1,#16]
+	adds	x5,x11,x19	// t0 = a->re + a->im
+	ldp	x21,x22,[x1,#64]
+	adcs	x6,x12,x20
+	ldp	x15,x16,[x1,#32]
+	adcs	x7,x13,x21
+	ldp	x23,x24,[x1,#80]
+	adcs	x8,x14,x22
+	stp	x5,x6,[x0]
+	adcs	x9,x15,x23
+	ldp	x5,x6,[x2]
+	adc	x10,x16,x24
+	stp	x7,x8,[x0,#16]
+
+	subs	x11,x11,x19	// t1 = a->re - a->im
+	ldp	x7,x8,[x2,#16]
+	sbcs	x12,x12,x20
+	stp	x9,x10,[x0,#32]
+	sbcs	x13,x13,x21
+	ldp	x9,x10,[x2,#32]
+	sbcs	x14,x14,x22
+	sbcs	x15,x15,x23
+	sbcs	x16,x16,x24
+	sbc	x25,xzr,xzr
+
+	and	x19,x5,x25
+	and	x20,x6,x25
+	adds	x11,x11,x19
+	and	x21,x7,x25
+	adcs	x12,x12,x20
+	and	x22,x8,x25
+	adcs	x13,x13,x21
+	and	x23,x9,x25
+	adcs	x14,x14,x22
+	and	x24,x10,x25
+	adcs	x15,x15,x23
+	stp	x11,x12,[x0,#48]
+	adc	x16,x16,x24
+	stp	x13,x14,[x0,#64]
+	stp	x15,x16,[x0,#80]
+
+	mov	x4,x1		// save a_ptr
+	add	x1,x0,#0	// mul_384(ret->re, t0, t1)
+	add	x2,x0,#48
+	bl	__mul_384
+
+	add	x1,x4,#0		// mul_384(ret->im, a->re, a->im)
+	add	x2,x4,#48
+	add	x0,x0,#96
+	bl	__mul_384
+	ldr	x30,[x29,#8]
+
+	ldp	x11,x12,[x0]
+	ldp	x13,x14,[x0,#16]
+	adds	x11,x11,x11	// add with itself
+	ldp	x15,x16,[x0,#32]
+	adcs	x12,x12,x12
+	adcs	x13,x13,x13
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adcs	x16,x16,x16
+	adcs	x19,x19,x19
+	adcs	x20,x20,x20
+	stp	x11,x12,[x0]
+	adcs	x21,x21,x21
+	stp	x13,x14,[x0,#16]
+	adcs	x22,x22,x22
+	stp	x15,x16,[x0,#32]
+	adcs	x23,x23,x23
+	stp	x19,x20,[x0,#48]
+	adc	x24,x24,x24
+	stp	x21,x22,[x0,#64]
+	stp	x23,x24,[x0,#80]
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sqr_382x,.-sqr_382x
+
+.globl	sqr_mont_382x
+.hidden	sqr_mont_382x
+.type	sqr_mont_382x,%function
+.align	5
+sqr_mont_382x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x3,x0,[sp,#96]	// __mul_mont_384 wants them there
+	sub	sp,sp,#112		// space for two 384-bit vectors + word
+	mov	x4,x3		// adjust for missing b_ptr
+
+	ldp	x11,x12,[x1]
+	ldp	x13,x14,[x1,#16]
+	ldp	x15,x16,[x1,#32]
+
+	ldp	x17,x20,[x1,#48]
+	ldp	x21,x22,[x1,#64]
+	ldp	x23,x24,[x1,#80]
+
+	adds	x5,x11,x17	// t0 = a->re + a->im
+	adcs	x6,x12,x20
+	adcs	x7,x13,x21
+	adcs	x8,x14,x22
+	adcs	x9,x15,x23
+	adc	x10,x16,x24
+
+	subs	x19,x11,x17	// t1 = a->re - a->im
+	sbcs	x20,x12,x20
+	sbcs	x21,x13,x21
+	sbcs	x22,x14,x22
+	sbcs	x23,x15,x23
+	sbcs	x24,x16,x24
+	sbc	x25,xzr,xzr		// borrow flag as mask
+
+	stp	x5,x6,[sp]
+	stp	x7,x8,[sp,#16]
+	stp	x9,x10,[sp,#32]
+	stp	x19,x20,[sp,#48]
+	stp	x21,x22,[sp,#64]
+	stp	x23,x24,[sp,#80]
+	str	x25,[sp,#96]
+
+	ldp	x5,x6,[x2]
+	ldp	x7,x8,[x2,#16]
+	ldp	x9,x10,[x2,#32]
+
+	add	x2,x1,#48
+	bl	__mul_mont_383_nonred	// mul_mont_384(ret->im, a->re, a->im)
+
+	adds	x19,x11,x11	// add with itself
+	adcs	x20,x12,x12
+	adcs	x21,x13,x13
+	adcs	x22,x14,x14
+	adcs	x23,x15,x15
+	adc	x24,x16,x16
+
+	stp	x19,x20,[x2,#48]
+	stp	x21,x22,[x2,#64]
+	stp	x23,x24,[x2,#80]
+
+	ldp	x11,x12,[sp]
+	ldr	x17,[sp,#48]
+	ldp	x13,x14,[sp,#16]
+	ldp	x15,x16,[sp,#32]
+
+	add	x2,sp,#48
+	bl	__mul_mont_383_nonred	// mul_mont_384(ret->im, t0, t1)
+	ldr	x30,[x29,#8]
+
+	ldr	x25,[sp,#96]	// account for sign from a->re - a->im
+	ldp	x19,x20,[sp]
+	ldp	x21,x22,[sp,#16]
+	ldp	x23,x24,[sp,#32]
+
+	and	x19,x19,x25
+	and	x20,x20,x25
+	and	x21,x21,x25
+	and	x22,x22,x25
+	and	x23,x23,x25
+	and	x24,x24,x25
+
+	subs	x11,x11,x19
+	sbcs	x12,x12,x20
+	sbcs	x13,x13,x21
+	sbcs	x14,x14,x22
+	sbcs	x15,x15,x23
+	sbcs	x16,x16,x24
+	sbc	x25,xzr,xzr
+
+	and	x19,x5,x25
+	and	x20,x6,x25
+	and	x21,x7,x25
+	and	x22,x8,x25
+	and	x23,x9,x25
+	and	x24,x10,x25
+
+	adds	x11,x11,x19
+	adcs	x12,x12,x20
+	adcs	x13,x13,x21
+	adcs	x14,x14,x22
+	adcs	x15,x15,x23
+	adc	x16,x16,x24
+
+	stp	x11,x12,[x2]
+	stp	x13,x14,[x2,#16]
+	stp	x15,x16,[x2,#32]
+
+	add	sp,sp,#112
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sqr_mont_382x,.-sqr_mont_382x
+
+.type	__mul_mont_383_nonred,%function
+.align	5
+__mul_mont_383_nonred:
+	mul	x19,x11,x17
+	mul	x20,x12,x17
+	mul	x21,x13,x17
+	mul	x22,x14,x17
+	mul	x23,x15,x17
+	mul	x24,x16,x17
+	mul	x4,x4,x19
+
+	umulh	x26,x11,x17
+	umulh	x27,x12,x17
+	umulh	x28,x13,x17
+	umulh	x0,x14,x17
+	umulh	x1,x15,x17
+	umulh	x3,x16,x17
+
+	adds	x20,x20,x26
+	mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,xzr,    x3
+	mul	x3,x10,x4
+	ldr	x17,[x2,8*1]
+	adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adc	x25,x25,xzr
+
+	ldr	x4,[x29,#96]
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,xzr,xzr
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adc	x25,x25,xzr
+
+	adds	x20,x20,x26
+	mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,x25,x3
+	mul	x3,x10,x4
+	ldr	x17,[x2,8*2]
+	adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adc	x25,x25,xzr
+
+	ldr	x4,[x29,#96]
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,xzr,xzr
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adc	x25,x25,xzr
+
+	adds	x20,x20,x26
+	mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,x25,x3
+	mul	x3,x10,x4
+	ldr	x17,[x2,8*3]
+	adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adc	x25,x25,xzr
+
+	ldr	x4,[x29,#96]
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,xzr,xzr
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adc	x25,x25,xzr
+
+	adds	x20,x20,x26
+	mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,x25,x3
+	mul	x3,x10,x4
+	ldr	x17,[x2,8*4]
+	adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adc	x25,x25,xzr
+
+	ldr	x4,[x29,#96]
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,xzr,xzr
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adc	x25,x25,xzr
+
+	adds	x20,x20,x26
+	mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,x25,x3
+	mul	x3,x10,x4
+	ldr	x17,[x2,8*5]
+	adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adc	x25,x25,xzr
+
+	ldr	x4,[x29,#96]
+	adds	x19,x20,x26
+	mul	x26,x11,x17
+	adcs	x20,x21,x27
+	mul	x27,x12,x17
+	adcs	x21,x22,x28
+	mul	x28,x13,x17
+	adcs	x22,x23,x0
+	mul	x0,x14,x17
+	adcs	x23,x24,x1
+	mul	x1,x15,x17
+	adcs	x24,x25,x3
+	mul	x3,x16,x17
+	adc	x25,xzr,xzr
+
+	adds	x19,x19,x26
+	umulh	x26,x11,x17
+	adcs	x20,x20,x27
+	umulh	x27,x12,x17
+	adcs	x21,x21,x28
+	mul	x4,x4,x19
+	umulh	x28,x13,x17
+	adcs	x22,x22,x0
+	umulh	x0,x14,x17
+	adcs	x23,x23,x1
+	umulh	x1,x15,x17
+	adcs	x24,x24,x3
+	umulh	x3,x16,x17
+	adc	x25,x25,xzr
+
+	adds	x20,x20,x26
+	mul	x26,x5,x4
+	adcs	x21,x21,x27
+	mul	x27,x6,x4
+	adcs	x22,x22,x28
+	mul	x28,x7,x4
+	adcs	x23,x23,x0
+	mul	x0,x8,x4
+	adcs	x24,x24,x1
+	mul	x1,x9,x4
+	adc	x25,x25,x3
+	mul	x3,x10,x4
+	adds	x19,x19,x26
+	umulh	x26,x5,x4
+	adcs	x20,x20,x27
+	umulh	x27,x6,x4
+	adcs	x21,x21,x28
+	umulh	x28,x7,x4
+	adcs	x22,x22,x0
+	umulh	x0,x8,x4
+	adcs	x23,x23,x1
+	umulh	x1,x9,x4
+	adcs	x24,x24,x3
+	umulh	x3,x10,x4
+	adc	x25,x25,xzr
+	ldp	x4,x2,[x29,#96]		// pull r_ptr
+
+	adds	x11,x20,x26
+	adcs	x12,x21,x27
+	adcs	x13,x22,x28
+	adcs	x14,x23,x0
+	adcs	x15,x24,x1
+	adcs	x16,x25,x3
+
+	ret
+.size	__mul_mont_383_nonred,.-__mul_mont_383_nonred
+
+.globl	sgn0_pty_mont_384
+.hidden	sgn0_pty_mont_384
+.type	sgn0_pty_mont_384,%function
+.align	5
+sgn0_pty_mont_384:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	mov	x4,x2
+	ldp	x5,x6,[x1]
+	ldp	x7,x8,[x1,#16]
+	ldp	x9,x10,[x1,#32]
+	mov	x1,x0
+
+	bl	__mul_by_1_mont_384
+	ldr	x30,[x29,#8]
+
+	and	x0,x11,#1
+	adds	x11,x11,x11
+	adcs	x12,x12,x12
+	adcs	x13,x13,x13
+	adcs	x14,x14,x14
+	adcs	x15,x15,x15
+	adcs	x16,x16,x16
+	adc	x17,xzr,xzr
+
+	subs	x11,x11,x5
+	sbcs	x12,x12,x6
+	sbcs	x13,x13,x7
+	sbcs	x14,x14,x8
+	sbcs	x15,x15,x9
+	sbcs	x16,x16,x10
+	sbc	x17,x17,xzr
+
+	mvn	x17,x17
+	and	x17,x17,#2
+	orr	x0,x0,x17
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sgn0_pty_mont_384,.-sgn0_pty_mont_384
+
+.globl	sgn0_pty_mont_384x
+.hidden	sgn0_pty_mont_384x
+.type	sgn0_pty_mont_384x,%function
+.align	5
+sgn0_pty_mont_384x:
+	.inst	0xd503233f
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	mov	x4,x2
+	ldp	x5,x6,[x1]
+	ldp	x7,x8,[x1,#16]
+	ldp	x9,x10,[x1,#32]
+	mov	x1,x0
+
+	bl	__mul_by_1_mont_384
+	add	x1,x1,#48
+
+	and	x2,x11,#1
+	orr	x3,x11,x12
+	adds	x11,x11,x11
+	orr	x3,x3,x13
+	adcs	x12,x12,x12
+	orr	x3,x3,x14
+	adcs	x13,x13,x13
+	orr	x3,x3,x15
+	adcs	x14,x14,x14
+	orr	x3,x3,x16
+	adcs	x15,x15,x15
+	adcs	x16,x16,x16
+	adc	x17,xzr,xzr
+
+	subs	x11,x11,x5
+	sbcs	x12,x12,x6
+	sbcs	x13,x13,x7
+	sbcs	x14,x14,x8
+	sbcs	x15,x15,x9
+	sbcs	x16,x16,x10
+	sbc	x17,x17,xzr
+
+	mvn	x17,x17
+	and	x17,x17,#2
+	orr	x2,x2,x17
+
+	bl	__mul_by_1_mont_384
+	ldr	x30,[x29,#8]
+
+	and	x0,x11,#1
+	orr	x1,x11,x12
+	adds	x11,x11,x11
+	orr	x1,x1,x13
+	adcs	x12,x12,x12
+	orr	x1,x1,x14
+	adcs	x13,x13,x13
+	orr	x1,x1,x15
+	adcs	x14,x14,x14
+	orr	x1,x1,x16
+	adcs	x15,x15,x15
+	adcs	x16,x16,x16
+	adc	x17,xzr,xzr
+
+	subs	x11,x11,x5
+	sbcs	x12,x12,x6
+	sbcs	x13,x13,x7
+	sbcs	x14,x14,x8
+	sbcs	x15,x15,x9
+	sbcs	x16,x16,x10
+	sbc	x17,x17,xzr
+
+	mvn	x17,x17
+	and	x17,x17,#2
+	orr	x0,x0,x17
+
+	cmp	x3,#0
+	csel	x3,x0,x2,eq	// a->re==0? prty(a->im) : prty(a->re)
+
+	cmp	x1,#0
+	csel	x1,x0,x2,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)
+
+	and	x3,x3,#1
+	and	x1,x1,#2
+	orr	x0,x1,x3		// pack sign and parity
+
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	.inst	0xd50323bf
+	ret
+.size	sgn0_pty_mont_384x,.-sgn0_pty_mont_384x
diff --git a/blst/elf/mulq_mont_256-x86_64.s b/blst/elf/mulq_mont_256-x86_64.s
new file mode 100644
index 0000000..37abd43
--- /dev/null
+++ b/blst/elf/mulq_mont_256-x86_64.s
@@ -0,0 +1,714 @@
+.text	
+
+.globl	mul_mont_sparse_256
+.hidden	mul_mont_sparse_256
+.type	mul_mont_sparse_256,@function
+.align	32
+mul_mont_sparse_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rdx),%rax
+	movq	0(%rsi),%r13
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%rbp
+	movq	%rdx,%rbx
+
+	movq	%rax,%r15
+	mulq	%r13
+	movq	%rax,%r9
+	movq	%r15,%rax
+	movq	%rdx,%r10
+	call	__mulq_mont_sparse_256
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_mont_sparse_256,.-mul_mont_sparse_256
+
+.globl	sqr_mont_sparse_256
+.hidden	sqr_mont_sparse_256
+.type	sqr_mont_sparse_256,@function
+.align	32
+sqr_mont_sparse_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	0(%rsi),%rax
+	movq	%rcx,%r8
+	movq	8(%rsi),%r14
+	movq	%rdx,%rcx
+	movq	16(%rsi),%r12
+	leaq	(%rsi),%rbx
+	movq	24(%rsi),%rbp
+
+	movq	%rax,%r15
+	mulq	%rax
+	movq	%rax,%r9
+	movq	%r15,%rax
+	movq	%rdx,%r10
+	call	__mulq_mont_sparse_256
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_mont_sparse_256,.-sqr_mont_sparse_256
+.type	__mulq_mont_sparse_256,@function
+.align	32
+__mulq_mont_sparse_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r12
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	8(%rbx),%rax
+	adcq	$0,%rdx
+	xorq	%r14,%r14
+	movq	%rdx,%r13
+
+	movq	%r9,%rdi
+	imulq	%r8,%r9
+
+
+	movq	%rax,%r15
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rax,%r12
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rsi)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	%rdx,%r14
+	xorq	%r15,%r15
+
+
+	mulq	0(%rcx)
+	addq	%rax,%rdi
+	movq	%r9,%rax
+	adcq	%rdx,%rdi
+
+	mulq	8(%rcx)
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rdi,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rax,%r12
+	movq	16(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rdx,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	movq	%r10,%rdi
+	imulq	%r8,%r10
+
+
+	movq	%rax,%r9
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rsi)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	%rdx,%r15
+	xorq	%r9,%r9
+
+
+	mulq	0(%rcx)
+	addq	%rax,%rdi
+	movq	%r10,%rax
+	adcq	%rdx,%rdi
+
+	mulq	8(%rcx)
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rdi,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rax,%r13
+	movq	24(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	addq	%rdx,%r14
+	adcq	$0,%r15
+	adcq	$0,%r9
+	movq	%r11,%rdi
+	imulq	%r8,%r11
+
+
+	movq	%rax,%r10
+	mulq	0(%rsi)
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rsi)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r15
+	adcq	%rdx,%r9
+	xorq	%r10,%r10
+
+
+	mulq	0(%rcx)
+	addq	%rax,%rdi
+	movq	%r11,%rax
+	adcq	%rdx,%rdi
+
+	mulq	8(%rcx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rdi,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	addq	%rdx,%r15
+	adcq	$0,%r9
+	adcq	$0,%r10
+	imulq	%r8,%rax
+	movq	8(%rsp),%rsi
+
+
+	movq	%rax,%r11
+	mulq	0(%rcx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	%rdx,%r12
+
+	mulq	8(%rcx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r14
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	movq	%r14,%rbx
+	addq	%rbp,%r15
+	adcq	$0,%rdx
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%rdx,%r9
+	adcq	$0,%r10
+
+
+
+
+	movq	%r15,%r12
+	subq	0(%rcx),%r13
+	sbbq	8(%rcx),%r14
+	sbbq	16(%rcx),%r15
+	movq	%r9,%rbp
+	sbbq	24(%rcx),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rax,%r13
+	cmovcq	%rbx,%r14
+	cmovcq	%r12,%r15
+	movq	%r13,0(%rsi)
+	cmovcq	%rbp,%r9
+	movq	%r14,8(%rsi)
+	movq	%r15,16(%rsi)
+	movq	%r9,24(%rsi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__mulq_mont_sparse_256,.-__mulq_mont_sparse_256
+.globl	from_mont_256
+.hidden	from_mont_256
+.type	from_mont_256,@function
+.align	32
+from_mont_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulq_by_1_mont_256
+
+
+
+
+
+	movq	%r14,%r10
+	movq	%r15,%r11
+	movq	%r9,%r12
+
+	subq	0(%rbx),%r13
+	sbbq	8(%rbx),%r14
+	sbbq	16(%rbx),%r15
+	sbbq	24(%rbx),%r9
+
+	cmovncq	%r13,%rax
+	cmovncq	%r14,%r10
+	cmovncq	%r15,%r11
+	movq	%rax,0(%rdi)
+	cmovncq	%r9,%r12
+	movq	%r10,8(%rdi)
+	movq	%r11,16(%rdi)
+	movq	%r12,24(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	from_mont_256,.-from_mont_256
+
+.globl	redc_mont_256
+.hidden	redc_mont_256
+.type	redc_mont_256,@function
+.align	32
+redc_mont_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulq_by_1_mont_256
+
+	addq	32(%rsi),%r13
+	adcq	40(%rsi),%r14
+	movq	%r13,%rax
+	adcq	48(%rsi),%r15
+	movq	%r14,%r10
+	adcq	56(%rsi),%r9
+	sbbq	%rsi,%rsi
+
+
+
+
+	movq	%r15,%r11
+	subq	0(%rbx),%r13
+	sbbq	8(%rbx),%r14
+	sbbq	16(%rbx),%r15
+	movq	%r9,%r12
+	sbbq	24(%rbx),%r9
+	sbbq	$0,%rsi
+
+	cmovncq	%r13,%rax
+	cmovncq	%r14,%r10
+	cmovncq	%r15,%r11
+	movq	%rax,0(%rdi)
+	cmovncq	%r9,%r12
+	movq	%r10,8(%rdi)
+	movq	%r11,16(%rdi)
+	movq	%r12,24(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	redc_mont_256,.-redc_mont_256
+.type	__mulq_by_1_mont_256,@function
+.align	32
+__mulq_by_1_mont_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+
+	movq	%rax,%r13
+	imulq	%rcx,%rax
+	movq	%rax,%r9
+
+	mulq	0(%rbx)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+
+	mulq	8(%rbx)
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r13,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	mulq	16(%rbx)
+	movq	%r10,%r14
+	imulq	%rcx,%r10
+	addq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r13,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	mulq	24(%rbx)
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	mulq	0(%rbx)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	%rdx,%r14
+
+	mulq	8(%rbx)
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	16(%rbx)
+	movq	%r11,%r15
+	imulq	%rcx,%r11
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	24(%rbx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	0(%rbx)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	%rdx,%r15
+
+	mulq	8(%rbx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	16(%rbx)
+	movq	%r12,%r9
+	imulq	%rcx,%r12
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	24(%rbx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	0(%rbx)
+	addq	%rax,%r9
+	movq	%r12,%rax
+	adcq	%rdx,%r9
+
+	mulq	8(%rbx)
+	addq	%rax,%r13
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	16(%rbx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rbx)
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulq_by_1_mont_256,.-__mulq_by_1_mont_256
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/mulq_mont_384-x86_64.s b/blst/elf/mulq_mont_384-x86_64.s
new file mode 100644
index 0000000..fa9dd35
--- /dev/null
+++ b/blst/elf/mulq_mont_384-x86_64.s
@@ -0,0 +1,3620 @@
+.text	
+
+
+
+
+
+
+
+.type	__sub_mod_384x384,@function
+.align	32
+__sub_mod_384x384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+
+	subq	0(%rdx),%r8
+	movq	56(%rsi),%r15
+	sbbq	8(%rdx),%r9
+	movq	64(%rsi),%rax
+	sbbq	16(%rdx),%r10
+	movq	72(%rsi),%rbx
+	sbbq	24(%rdx),%r11
+	movq	80(%rsi),%rbp
+	sbbq	32(%rdx),%r12
+	movq	88(%rsi),%rsi
+	sbbq	40(%rdx),%r13
+	movq	%r8,0(%rdi)
+	sbbq	48(%rdx),%r14
+	movq	0(%rcx),%r8
+	movq	%r9,8(%rdi)
+	sbbq	56(%rdx),%r15
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	sbbq	64(%rdx),%rax
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	sbbq	72(%rdx),%rbx
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	sbbq	80(%rdx),%rbp
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	sbbq	88(%rdx),%rsi
+	movq	40(%rcx),%r13
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%r8
+	andq	%rdx,%r9
+	andq	%rdx,%r10
+	andq	%rdx,%r11
+	andq	%rdx,%r12
+	andq	%rdx,%r13
+
+	addq	%r8,%r14
+	adcq	%r9,%r15
+	movq	%r14,48(%rdi)
+	adcq	%r10,%rax
+	movq	%r15,56(%rdi)
+	adcq	%r11,%rbx
+	movq	%rax,64(%rdi)
+	adcq	%r12,%rbp
+	movq	%rbx,72(%rdi)
+	adcq	%r13,%rsi
+	movq	%rbp,80(%rdi)
+	movq	%rsi,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.type	__add_mod_384,@function
+.align	32
+__add_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	movq	%r8,%r14
+	adcq	24(%rdx),%r11
+	movq	%r9,%r15
+	adcq	32(%rdx),%r12
+	movq	%r10,%rax
+	adcq	40(%rdx),%r13
+	movq	%r11,%rbx
+	sbbq	%rdx,%rdx
+
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	movq	%r12,%rbp
+	sbbq	16(%rcx),%r10
+	sbbq	24(%rcx),%r11
+	sbbq	32(%rcx),%r12
+	movq	%r13,%rsi
+	sbbq	40(%rcx),%r13
+	sbbq	$0,%rdx
+
+	cmovcq	%r14,%r8
+	cmovcq	%r15,%r9
+	cmovcq	%rax,%r10
+	movq	%r8,0(%rdi)
+	cmovcq	%rbx,%r11
+	movq	%r9,8(%rdi)
+	cmovcq	%rbp,%r12
+	movq	%r10,16(%rdi)
+	cmovcq	%rsi,%r13
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__add_mod_384,.-__add_mod_384
+
+.type	__sub_mod_384,@function
+.align	32
+__sub_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+__sub_mod_384_a_is_loaded:
+	subq	0(%rdx),%r8
+	movq	0(%rcx),%r14
+	sbbq	8(%rdx),%r9
+	movq	8(%rcx),%r15
+	sbbq	16(%rdx),%r10
+	movq	16(%rcx),%rax
+	sbbq	24(%rdx),%r11
+	movq	24(%rcx),%rbx
+	sbbq	32(%rdx),%r12
+	movq	32(%rcx),%rbp
+	sbbq	40(%rdx),%r13
+	movq	40(%rcx),%rsi
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%r14
+	andq	%rdx,%r15
+	andq	%rdx,%rax
+	andq	%rdx,%rbx
+	andq	%rdx,%rbp
+	andq	%rdx,%rsi
+
+	addq	%r14,%r8
+	adcq	%r15,%r9
+	movq	%r8,0(%rdi)
+	adcq	%rax,%r10
+	movq	%r9,8(%rdi)
+	adcq	%rbx,%r11
+	movq	%r10,16(%rdi)
+	adcq	%rbp,%r12
+	movq	%r11,24(%rdi)
+	adcq	%rsi,%r13
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sub_mod_384,.-__sub_mod_384
+.globl	mul_mont_384x
+.hidden	mul_mont_384x
+.type	mul_mont_384x,@function
+.align	32
+mul_mont_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$328,%rsp
+.cfi_adjust_cfa_offset	328
+
+
+	movq	%rdx,%rbx
+	movq	%rdi,32(%rsp)
+	movq	%rsi,24(%rsp)
+	movq	%rdx,16(%rsp)
+	movq	%rcx,8(%rsp)
+	movq	%r8,0(%rsp)
+
+
+
+
+	leaq	40(%rsp),%rdi
+	call	__mulq_384
+
+
+	leaq	48(%rbx),%rbx
+	leaq	48(%rsi),%rsi
+	leaq	40+96(%rsp),%rdi
+	call	__mulq_384
+
+
+	movq	8(%rsp),%rcx
+	leaq	-48(%rsi),%rdx
+	leaq	40+192+48(%rsp),%rdi
+	call	__add_mod_384
+
+	movq	16(%rsp),%rsi
+	leaq	48(%rsi),%rdx
+	leaq	-48(%rdi),%rdi
+	call	__add_mod_384
+
+	leaq	(%rdi),%rbx
+	leaq	48(%rdi),%rsi
+	call	__mulq_384
+
+
+	leaq	(%rdi),%rsi
+	leaq	40(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	call	__sub_mod_384x384
+
+	leaq	(%rdi),%rsi
+	leaq	-96(%rdi),%rdx
+	call	__sub_mod_384x384
+
+
+	leaq	40(%rsp),%rsi
+	leaq	40+96(%rsp),%rdx
+	leaq	40(%rsp),%rdi
+	call	__sub_mod_384x384
+
+	movq	%rcx,%rbx
+
+
+	leaq	40(%rsp),%rsi
+	movq	0(%rsp),%rcx
+	movq	32(%rsp),%rdi
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+
+	leaq	40+192(%rsp),%rsi
+	movq	0(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	leaq	328(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-328-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_mont_384x,.-mul_mont_384x
+.globl	sqr_mont_384x
+.hidden	sqr_mont_384x
+.type	sqr_mont_384x,@function
+.align	32
+sqr_mont_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	136
+
+
+	movq	%rcx,0(%rsp)
+	movq	%rdx,%rcx
+	movq	%rdi,8(%rsp)
+	movq	%rsi,16(%rsp)
+
+
+	leaq	48(%rsi),%rdx
+	leaq	32(%rsp),%rdi
+	call	__add_mod_384
+
+
+	movq	16(%rsp),%rsi
+	leaq	48(%rsi),%rdx
+	leaq	32+48(%rsp),%rdi
+	call	__sub_mod_384
+
+
+	movq	16(%rsp),%rsi
+	leaq	48(%rsi),%rbx
+
+	movq	48(%rsi),%rax
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+
+	call	__mulq_mont_384
+	addq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	movq	%r14,%r12
+	adcq	%r9,%r9
+	movq	%r15,%r13
+	adcq	%r10,%r10
+	movq	%r8,%rax
+	adcq	%r11,%r11
+	movq	%r9,%rbx
+	sbbq	%rdx,%rdx
+
+	subq	0(%rcx),%r14
+	sbbq	8(%rcx),%r15
+	movq	%r10,%rbp
+	sbbq	16(%rcx),%r8
+	sbbq	24(%rcx),%r9
+	sbbq	32(%rcx),%r10
+	movq	%r11,%rsi
+	sbbq	40(%rcx),%r11
+	sbbq	$0,%rdx
+
+	cmovcq	%r12,%r14
+	cmovcq	%r13,%r15
+	cmovcq	%rax,%r8
+	movq	%r14,48(%rdi)
+	cmovcq	%rbx,%r9
+	movq	%r15,56(%rdi)
+	cmovcq	%rbp,%r10
+	movq	%r8,64(%rdi)
+	cmovcq	%rsi,%r11
+	movq	%r9,72(%rdi)
+	movq	%r10,80(%rdi)
+	movq	%r11,88(%rdi)
+
+	leaq	32(%rsp),%rsi
+	leaq	32+48(%rsp),%rbx
+
+	movq	32+48(%rsp),%rax
+	movq	32+0(%rsp),%r14
+	movq	32+8(%rsp),%r15
+	movq	32+16(%rsp),%r12
+	movq	32+24(%rsp),%r13
+
+	call	__mulq_mont_384
+
+	leaq	136(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-136-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_mont_384x,.-sqr_mont_384x
+
+.globl	mul_382x
+.hidden	mul_382x
+.type	mul_382x,@function
+.align	32
+mul_382x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	136
+
+
+	leaq	96(%rdi),%rdi
+	movq	%rsi,0(%rsp)
+	movq	%rdx,8(%rsp)
+	movq	%rdi,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	addq	48(%rsi),%r8
+	adcq	56(%rsi),%r9
+	adcq	64(%rsi),%r10
+	adcq	72(%rsi),%r11
+	adcq	80(%rsi),%r12
+	adcq	88(%rsi),%r13
+
+	movq	%r8,32+0(%rsp)
+	movq	%r9,32+8(%rsp)
+	movq	%r10,32+16(%rsp)
+	movq	%r11,32+24(%rsp)
+	movq	%r12,32+32(%rsp)
+	movq	%r13,32+40(%rsp)
+
+
+	movq	0(%rdx),%r8
+	movq	8(%rdx),%r9
+	movq	16(%rdx),%r10
+	movq	24(%rdx),%r11
+	movq	32(%rdx),%r12
+	movq	40(%rdx),%r13
+
+	addq	48(%rdx),%r8
+	adcq	56(%rdx),%r9
+	adcq	64(%rdx),%r10
+	adcq	72(%rdx),%r11
+	adcq	80(%rdx),%r12
+	adcq	88(%rdx),%r13
+
+	movq	%r8,32+48(%rsp)
+	movq	%r9,32+56(%rsp)
+	movq	%r10,32+64(%rsp)
+	movq	%r11,32+72(%rsp)
+	movq	%r12,32+80(%rsp)
+	movq	%r13,32+88(%rsp)
+
+
+	leaq	32+0(%rsp),%rsi
+	leaq	32+48(%rsp),%rbx
+	call	__mulq_384
+
+
+	movq	0(%rsp),%rsi
+	movq	8(%rsp),%rbx
+	leaq	-96(%rdi),%rdi
+	call	__mulq_384
+
+
+	leaq	48(%rsi),%rsi
+	leaq	48(%rbx),%rbx
+	leaq	32(%rsp),%rdi
+	call	__mulq_384
+
+
+	movq	16(%rsp),%rsi
+	leaq	32(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	movq	%rsi,%rdi
+	call	__sub_mod_384x384
+
+
+	leaq	0(%rdi),%rsi
+	leaq	-96(%rdi),%rdx
+	call	__sub_mod_384x384
+
+
+	leaq	-96(%rdi),%rsi
+	leaq	32(%rsp),%rdx
+	leaq	-96(%rdi),%rdi
+	call	__sub_mod_384x384
+
+	leaq	136(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-136-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_382x,.-mul_382x
+.globl	sqr_382x
+.hidden	sqr_382x
+.type	sqr_382x,@function
+.align	32
+sqr_382x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rcx
+
+
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	24(%rsi),%rbx
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rdx
+
+	movq	%r14,%r8
+	addq	48(%rsi),%r14
+	movq	%r15,%r9
+	adcq	56(%rsi),%r15
+	movq	%rax,%r10
+	adcq	64(%rsi),%rax
+	movq	%rbx,%r11
+	adcq	72(%rsi),%rbx
+	movq	%rbp,%r12
+	adcq	80(%rsi),%rbp
+	movq	%rdx,%r13
+	adcq	88(%rsi),%rdx
+
+	movq	%r14,0(%rdi)
+	movq	%r15,8(%rdi)
+	movq	%rax,16(%rdi)
+	movq	%rbx,24(%rdi)
+	movq	%rbp,32(%rdi)
+	movq	%rdx,40(%rdi)
+
+
+	leaq	48(%rsi),%rdx
+	leaq	48(%rdi),%rdi
+	call	__sub_mod_384_a_is_loaded
+
+
+	leaq	(%rdi),%rsi
+	leaq	-48(%rdi),%rbx
+	leaq	-48(%rdi),%rdi
+	call	__mulq_384
+
+
+	movq	(%rsp),%rsi
+	leaq	48(%rsi),%rbx
+	leaq	96(%rdi),%rdi
+	call	__mulq_384
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	64(%rdi),%rax
+	movq	72(%rdi),%rbx
+	movq	80(%rdi),%rbp
+	addq	%r8,%r8
+	movq	88(%rdi),%rdx
+	adcq	%r9,%r9
+	movq	%r8,0(%rdi)
+	adcq	%r10,%r10
+	movq	%r9,8(%rdi)
+	adcq	%r11,%r11
+	movq	%r10,16(%rdi)
+	adcq	%r12,%r12
+	movq	%r11,24(%rdi)
+	adcq	%r13,%r13
+	movq	%r12,32(%rdi)
+	adcq	%r14,%r14
+	movq	%r13,40(%rdi)
+	adcq	%r15,%r15
+	movq	%r14,48(%rdi)
+	adcq	%rax,%rax
+	movq	%r15,56(%rdi)
+	adcq	%rbx,%rbx
+	movq	%rax,64(%rdi)
+	adcq	%rbp,%rbp
+	movq	%rbx,72(%rdi)
+	adcq	%rdx,%rdx
+	movq	%rbp,80(%rdi)
+	movq	%rdx,88(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*7
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_382x,.-sqr_382x
+.globl	mul_384
+.hidden	mul_384
+.type	mul_384,@function
+.align	32
+mul_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+
+
+	movq	%rdx,%rbx
+	call	__mulq_384
+
+	movq	0(%rsp),%r12
+.cfi_restore	%r12
+	movq	8(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	16(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	24(%rsp),%rsp
+.cfi_adjust_cfa_offset	-24
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_384,.-mul_384
+
+.type	__mulq_384,@function
+.align	32
+__mulq_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rbx),%rax
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	movq	%rax,0(%rdi)
+	movq	%rbp,%rax
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rax,%rcx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%rax,%r11
+	movq	8(%rbx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%rcx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rcx,8(%rdi)
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%rax,%r12
+	movq	16(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%rcx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rcx,16(%rdi)
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%rax,%r12
+	movq	24(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%rcx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rcx,24(%rdi)
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%rax,%r12
+	movq	32(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%rcx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rcx,32(%rdi)
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%rax,%r12
+	movq	40(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%rcx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rcx,40(%rdi)
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%rax,%r12
+	movq	%rax,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+	movq	%rcx,48(%rdi)
+	movq	%r8,56(%rdi)
+	movq	%r9,64(%rdi)
+	movq	%r10,72(%rdi)
+	movq	%r11,80(%rdi)
+	movq	%r12,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulq_384,.-__mulq_384
+.globl	sqr_384
+.hidden	sqr_384
+.type	sqr_384,@function
+.align	32
+sqr_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	call	__sqrq_384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_384,.-sqr_384
+
+.type	__sqrq_384,@function
+.align	32
+__sqrq_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rcx
+	movq	24(%rsi),%rbx
+
+
+	movq	%rax,%r14
+	mulq	%r15
+	movq	%rax,%r9
+	movq	%r14,%rax
+	movq	32(%rsi),%rbp
+	movq	%rdx,%r10
+
+	mulq	%rcx
+	addq	%rax,%r10
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	40(%rsi),%rsi
+	movq	%rdx,%r11
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	%rbp
+	addq	%rax,%r12
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	mulq	%rsi
+	addq	%rax,%r13
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	%rax
+	xorq	%r8,%r8
+	movq	%rax,0(%rdi)
+	movq	%r15,%rax
+	addq	%r9,%r9
+	adcq	$0,%r8
+	addq	%rdx,%r9
+	adcq	$0,%r8
+	movq	%r9,8(%rdi)
+
+	mulq	%rcx
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	%rsi
+	addq	%rax,%r14
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	%rax
+	xorq	%r9,%r9
+	addq	%rax,%r8
+	movq	%rcx,%rax
+	addq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%r9
+	addq	%r8,%r10
+	adcq	%rdx,%r11
+	adcq	$0,%r9
+	movq	%r10,16(%rdi)
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%r11,24(%rdi)
+	movq	%rdx,%r8
+
+	mulq	%rbp
+	addq	%rax,%r14
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	%rsi
+	addq	%rax,%r15
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	xorq	%r11,%r11
+	addq	%rax,%r9
+	movq	%rbx,%rax
+	addq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	$0,%r11
+	addq	%r9,%r12
+	adcq	%rdx,%r13
+	adcq	$0,%r11
+	movq	%r12,32(%rdi)
+
+
+	mulq	%rbp
+	addq	%rax,%r15
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r13,40(%rdi)
+	movq	%rdx,%r8
+
+	mulq	%rsi
+	addq	%rax,%rcx
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r8,%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%rbx
+
+	mulq	%rax
+	xorq	%r12,%r12
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	addq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	$0,%r12
+	addq	%r11,%r14
+	adcq	%rdx,%r15
+	movq	%r14,48(%rdi)
+	adcq	$0,%r12
+	movq	%r15,56(%rdi)
+
+
+	mulq	%rsi
+	addq	%rax,%rbx
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	xorq	%r13,%r13
+	addq	%rax,%r12
+	movq	%rsi,%rax
+	addq	%rcx,%rcx
+	adcq	%rbx,%rbx
+	adcq	$0,%r13
+	addq	%r12,%rcx
+	adcq	%rdx,%rbx
+	movq	%rcx,64(%rdi)
+	adcq	$0,%r13
+	movq	%rbx,72(%rdi)
+
+
+	mulq	%rax
+	addq	%r13,%rax
+	addq	%rbp,%rbp
+	adcq	$0,%rdx
+	addq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rax,80(%rdi)
+	movq	%rdx,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sqrq_384,.-__sqrq_384
+
+.globl	sqr_mont_384
+.hidden	sqr_mont_384
+.type	sqr_mont_384,@function
+.align	32
+sqr_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$120,%rsp
+.cfi_adjust_cfa_offset	8*15
+
+
+	movq	%rcx,96(%rsp)
+	movq	%rdx,104(%rsp)
+	movq	%rdi,112(%rsp)
+
+	movq	%rsp,%rdi
+	call	__sqrq_384
+
+	leaq	0(%rsp),%rsi
+	movq	96(%rsp),%rcx
+	movq	104(%rsp),%rbx
+	movq	112(%rsp),%rdi
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	leaq	120(%rsp),%r8
+	movq	120(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-8*21
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_mont_384,.-sqr_mont_384
+
+
+
+.globl	redc_mont_384
+.hidden	redc_mont_384
+.type	redc_mont_384,@function
+.align	32
+redc_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	redc_mont_384,.-redc_mont_384
+
+
+
+
+.globl	from_mont_384
+.hidden	from_mont_384
+.type	from_mont_384,@function
+.align	32
+from_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulq_by_1_mont_384
+
+
+
+
+
+	movq	%r15,%rcx
+	movq	%r8,%rdx
+	movq	%r9,%rbp
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	movq	%r10,%r13
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	movq	%r11,%rsi
+	sbbq	40(%rbx),%r11
+
+	cmovcq	%rax,%r14
+	cmovcq	%rcx,%r15
+	cmovcq	%rdx,%r8
+	movq	%r14,0(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r15,8(%rdi)
+	cmovcq	%r13,%r10
+	movq	%r8,16(%rdi)
+	cmovcq	%rsi,%r11
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	from_mont_384,.-from_mont_384
+.type	__mulq_by_1_mont_384,@function
+.align	32
+__mulq_by_1_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%rax,%r14
+	imulq	%rcx,%rax
+	movq	%rax,%r8
+
+	mulq	0(%rbx)
+	addq	%rax,%r14
+	movq	%r8,%rax
+	adcq	%rdx,%r14
+
+	mulq	8(%rbx)
+	addq	%rax,%r9
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	16(%rbx)
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	24(%rbx)
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%r9,%r15
+	imulq	%rcx,%r9
+	addq	%r14,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	32(%rbx)
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	40(%rbx)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	0(%rbx)
+	addq	%rax,%r15
+	movq	%r9,%rax
+	adcq	%rdx,%r15
+
+	mulq	8(%rbx)
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	16(%rbx)
+	addq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	24(%rbx)
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	movq	%r10,%r8
+	imulq	%rcx,%r10
+	addq	%r15,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	32(%rbx)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	40(%rbx)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	0(%rbx)
+	addq	%rax,%r8
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+
+	mulq	8(%rbx)
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rbx)
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	24(%rbx)
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%r11,%r9
+	imulq	%rcx,%r11
+	addq	%r8,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	32(%rbx)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	40(%rbx)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	0(%rbx)
+	addq	%rax,%r9
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+
+	mulq	8(%rbx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	16(%rbx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rbx)
+	addq	%rax,%r14
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%r12,%r10
+	imulq	%rcx,%r12
+	addq	%r9,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	32(%rbx)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	40(%rbx)
+	addq	%rax,%r8
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	0(%rbx)
+	addq	%rax,%r10
+	movq	%r12,%rax
+	adcq	%rdx,%r10
+
+	mulq	8(%rbx)
+	addq	%rax,%r13
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rbx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	24(%rbx)
+	addq	%rax,%r15
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%r13,%r11
+	imulq	%rcx,%r13
+	addq	%r10,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rbx)
+	addq	%rax,%r8
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	40(%rbx)
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	0(%rbx)
+	addq	%rax,%r11
+	movq	%r13,%rax
+	adcq	%rdx,%r11
+
+	mulq	8(%rbx)
+	addq	%rax,%r14
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	16(%rbx)
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	24(%rbx)
+	addq	%rax,%r8
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	32(%rbx)
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rbx)
+	addq	%rax,%r10
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulq_by_1_mont_384,.-__mulq_by_1_mont_384
+
+.type	__redc_tail_mont_384,@function
+.align	32
+__redc_tail_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	addq	48(%rsi),%r14
+	movq	%r14,%rax
+	adcq	56(%rsi),%r15
+	adcq	64(%rsi),%r8
+	adcq	72(%rsi),%r9
+	movq	%r15,%rcx
+	adcq	80(%rsi),%r10
+	adcq	88(%rsi),%r11
+	sbbq	%r12,%r12
+
+
+
+
+	movq	%r8,%rdx
+	movq	%r9,%rbp
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	movq	%r10,%r13
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	movq	%r11,%rsi
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%r12
+
+	cmovcq	%rax,%r14
+	cmovcq	%rcx,%r15
+	cmovcq	%rdx,%r8
+	movq	%r14,0(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r15,8(%rdi)
+	cmovcq	%r13,%r10
+	movq	%r8,16(%rdi)
+	cmovcq	%rsi,%r11
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__redc_tail_mont_384,.-__redc_tail_mont_384
+
+.globl	sgn0_pty_mont_384
+.hidden	sgn0_pty_mont_384
+.type	sgn0_pty_mont_384,@function
+.align	32
+sgn0_pty_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rsi,%rbx
+	leaq	0(%rdi),%rsi
+	movq	%rdx,%rcx
+	call	__mulq_by_1_mont_384
+
+	xorq	%rax,%rax
+	movq	%r14,%r13
+	addq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%rax
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%rax
+
+	notq	%rax
+	andq	$1,%r13
+	andq	$2,%rax
+	orq	%r13,%rax
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sgn0_pty_mont_384,.-sgn0_pty_mont_384
+
+.globl	sgn0_pty_mont_384x
+.hidden	sgn0_pty_mont_384x
+.type	sgn0_pty_mont_384x,@function
+.align	32
+sgn0_pty_mont_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rsi,%rbx
+	leaq	48(%rdi),%rsi
+	movq	%rdx,%rcx
+	call	__mulq_by_1_mont_384
+
+	movq	%r14,%r12
+	orq	%r15,%r14
+	orq	%r8,%r14
+	orq	%r9,%r14
+	orq	%r10,%r14
+	orq	%r11,%r14
+
+	leaq	0(%rdi),%rsi
+	xorq	%rdi,%rdi
+	movq	%r12,%r13
+	addq	%r12,%r12
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%rdi
+
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%rdi
+
+	movq	%r14,0(%rsp)
+	notq	%rdi
+	andq	$1,%r13
+	andq	$2,%rdi
+	orq	%r13,%rdi
+
+	call	__mulq_by_1_mont_384
+
+	movq	%r14,%r12
+	orq	%r15,%r14
+	orq	%r8,%r14
+	orq	%r9,%r14
+	orq	%r10,%r14
+	orq	%r11,%r14
+
+	xorq	%rax,%rax
+	movq	%r12,%r13
+	addq	%r12,%r12
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%rax
+
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%rax
+
+	movq	0(%rsp),%r12
+
+	notq	%rax
+
+	testq	%r14,%r14
+	cmovzq	%rdi,%r13
+
+	testq	%r12,%r12
+	cmovnzq	%rdi,%rax
+
+	andq	$1,%r13
+	andq	$2,%rax
+	orq	%r13,%rax
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sgn0_pty_mont_384x,.-sgn0_pty_mont_384x
+.globl	mul_mont_384
+.hidden	mul_mont_384
+.type	mul_mont_384,@function
+.align	32
+mul_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$24,%rsp
+.cfi_adjust_cfa_offset	8*3
+
+
+	movq	0(%rdx),%rax
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+	movq	%rdx,%rbx
+	movq	%r8,0(%rsp)
+	movq	%rdi,8(%rsp)
+
+	call	__mulq_mont_384
+
+	movq	24(%rsp),%r15
+.cfi_restore	%r15
+	movq	32(%rsp),%r14
+.cfi_restore	%r14
+	movq	40(%rsp),%r13
+.cfi_restore	%r13
+	movq	48(%rsp),%r12
+.cfi_restore	%r12
+	movq	56(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	64(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	72(%rsp),%rsp
+.cfi_adjust_cfa_offset	-72
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mul_mont_384,.-mul_mont_384
+.type	__mulq_mont_384,@function
+.align	32
+__mulq_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rax,%rdi
+	mulq	%r14
+	movq	%rax,%r8
+	movq	%rdi,%rax
+	movq	%rdx,%r9
+
+	mulq	%r15
+	addq	%rax,%r9
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%r12
+	addq	%rax,%r10
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	movq	%r8,%rbp
+	imulq	8(%rsp),%r8
+
+	mulq	%r13
+	addq	%rax,%r11
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	32(%rsi)
+	addq	%rax,%r12
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	mulq	40(%rsi)
+	addq	%rax,%r13
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	xorq	%r15,%r15
+	movq	%rdx,%r14
+
+	mulq	0(%rcx)
+	addq	%rax,%rbp
+	movq	%r8,%rax
+	adcq	%rdx,%rbp
+
+	mulq	8(%rcx)
+	addq	%rax,%r9
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	32(%rcx)
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	40(%rcx)
+	addq	%rax,%r13
+	movq	8(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	%rdx,%r14
+	adcq	$0,%r15
+
+	movq	%rax,%rdi
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	8(%rsi)
+	addq	%rax,%r10
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r11
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	movq	%r9,%rbp
+	imulq	8(%rsp),%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r12
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	32(%rsi)
+	addq	%rax,%r13
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	40(%rsi)
+	addq	%r8,%r14
+	adcq	$0,%rdx
+	xorq	%r8,%r8
+	addq	%rax,%r14
+	movq	%r9,%rax
+	adcq	%rdx,%r15
+	adcq	$0,%r8
+
+	mulq	0(%rcx)
+	addq	%rax,%rbp
+	movq	%r9,%rax
+	adcq	%rdx,%rbp
+
+	mulq	8(%rcx)
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	32(%rcx)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	40(%rcx)
+	addq	%rax,%r14
+	movq	16(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	%rdx,%r15
+	adcq	$0,%r8
+
+	movq	%rax,%rdi
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	8(%rsi)
+	addq	%rax,%r11
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	16(%rsi)
+	addq	%rax,%r12
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	movq	%r10,%rbp
+	imulq	8(%rsp),%r10
+
+	mulq	24(%rsi)
+	addq	%rax,%r13
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	32(%rsi)
+	addq	%rax,%r14
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	40(%rsi)
+	addq	%r9,%r15
+	adcq	$0,%rdx
+	xorq	%r9,%r9
+	addq	%rax,%r15
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+	mulq	0(%rcx)
+	addq	%rax,%rbp
+	movq	%r10,%rax
+	adcq	%rdx,%rbp
+
+	mulq	8(%rcx)
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	32(%rcx)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	40(%rcx)
+	addq	%rax,%r15
+	movq	24(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r15
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+	movq	%rax,%rdi
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	8(%rsi)
+	addq	%rax,%r12
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rsi)
+	addq	%rax,%r13
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	movq	%r11,%rbp
+	imulq	8(%rsp),%r11
+
+	mulq	24(%rsi)
+	addq	%rax,%r14
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r15
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	40(%rsi)
+	addq	%r10,%r8
+	adcq	$0,%rdx
+	xorq	%r10,%r10
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+	mulq	0(%rcx)
+	addq	%rax,%rbp
+	movq	%r11,%rax
+	adcq	%rdx,%rbp
+
+	mulq	8(%rcx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	addq	%rax,%r14
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	32(%rcx)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	40(%rcx)
+	addq	%rax,%r8
+	movq	32(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r8
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+	movq	%rax,%rdi
+	mulq	0(%rsi)
+	addq	%rax,%r12
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	8(%rsi)
+	addq	%rax,%r13
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	16(%rsi)
+	addq	%rax,%r14
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	movq	%r12,%rbp
+	imulq	8(%rsp),%r12
+
+	mulq	24(%rsi)
+	addq	%rax,%r15
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	32(%rsi)
+	addq	%rax,%r8
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%r11,%r9
+	adcq	$0,%rdx
+	xorq	%r11,%r11
+	addq	%rax,%r9
+	movq	%r12,%rax
+	adcq	%rdx,%r10
+	adcq	$0,%r11
+
+	mulq	0(%rcx)
+	addq	%rax,%rbp
+	movq	%r12,%rax
+	adcq	%rdx,%rbp
+
+	mulq	8(%rcx)
+	addq	%rax,%r13
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rbp,%r15
+	adcq	$0,%rdx
+	addq	%rax,%r15
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	32(%rcx)
+	addq	%rax,%r8
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	40(%rcx)
+	addq	%rax,%r9
+	movq	40(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r9
+	adcq	%rdx,%r10
+	adcq	$0,%r11
+
+	movq	%rax,%rdi
+	mulq	0(%rsi)
+	addq	%rax,%r13
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	8(%rsi)
+	addq	%rax,%r14
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	16(%rsi)
+	addq	%rax,%r15
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	movq	%r13,%rbp
+	imulq	8(%rsp),%r13
+
+	mulq	24(%rsi)
+	addq	%rax,%r8
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	32(%rsi)
+	addq	%rax,%r9
+	movq	%rdi,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	40(%rsi)
+	addq	%r12,%r10
+	adcq	$0,%rdx
+	xorq	%r12,%r12
+	addq	%rax,%r10
+	movq	%r13,%rax
+	adcq	%rdx,%r11
+	adcq	$0,%r12
+
+	mulq	0(%rcx)
+	addq	%rax,%rbp
+	movq	%r13,%rax
+	adcq	%rdx,%rbp
+
+	mulq	8(%rcx)
+	addq	%rax,%r14
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rcx)
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	24(%rcx)
+	addq	%rbp,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	32(%rcx)
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	40(%rcx)
+	addq	%rax,%r10
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r10
+	adcq	%rdx,%r11
+	adcq	$0,%r12
+
+
+
+
+	movq	16(%rsp),%rdi
+	subq	0(%rcx),%r14
+	movq	%r15,%rdx
+	sbbq	8(%rcx),%r15
+	movq	%r8,%rbx
+	sbbq	16(%rcx),%r8
+	movq	%r9,%rsi
+	sbbq	24(%rcx),%r9
+	movq	%r10,%rbp
+	sbbq	32(%rcx),%r10
+	movq	%r11,%r13
+	sbbq	40(%rcx),%r11
+	sbbq	$0,%r12
+
+	cmovcq	%rax,%r14
+	cmovcq	%rdx,%r15
+	cmovcq	%rbx,%r8
+	movq	%r14,0(%rdi)
+	cmovcq	%rsi,%r9
+	movq	%r15,8(%rdi)
+	cmovcq	%rbp,%r10
+	movq	%r8,16(%rdi)
+	cmovcq	%r13,%r11
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulq_mont_384,.-__mulq_mont_384
+.globl	sqr_n_mul_mont_384
+.hidden	sqr_n_mul_mont_384
+.type	sqr_n_mul_mont_384,@function
+.align	32
+sqr_n_mul_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	8*17
+
+
+	movq	%r8,0(%rsp)
+	movq	%rdi,8(%rsp)
+	movq	%rcx,16(%rsp)
+	leaq	32(%rsp),%rdi
+	movq	%r9,24(%rsp)
+	movq	(%r9),%xmm2
+
+.Loop_sqr_384:
+	movd	%edx,%xmm1
+
+	call	__sqrq_384
+
+	leaq	0(%rdi),%rsi
+	movq	0(%rsp),%rcx
+	movq	16(%rsp),%rbx
+	call	__mulq_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	movd	%xmm1,%edx
+	leaq	0(%rdi),%rsi
+	decl	%edx
+	jnz	.Loop_sqr_384
+
+.byte	102,72,15,126,208
+	movq	%rbx,%rcx
+	movq	24(%rsp),%rbx
+
+
+
+
+
+
+	movq	%r8,%r12
+	movq	%r9,%r13
+
+	call	__mulq_mont_384
+
+	leaq	136(%rsp),%r8
+	movq	136(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-8*23
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_n_mul_mont_384,.-sqr_n_mul_mont_384
+
+.globl	sqr_n_mul_mont_383
+.hidden	sqr_n_mul_mont_383
+.type	sqr_n_mul_mont_383,@function
+.align	32
+sqr_n_mul_mont_383:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	8*17
+
+
+	movq	%r8,0(%rsp)
+	movq	%rdi,8(%rsp)
+	movq	%rcx,16(%rsp)
+	leaq	32(%rsp),%rdi
+	movq	%r9,24(%rsp)
+	movq	(%r9),%xmm2
+
+.Loop_sqr_383:
+	movd	%edx,%xmm1
+
+	call	__sqrq_384
+
+	leaq	0(%rdi),%rsi
+	movq	0(%rsp),%rcx
+	movq	16(%rsp),%rbx
+	call	__mulq_by_1_mont_384
+
+	movd	%xmm1,%edx
+	addq	48(%rsi),%r14
+	adcq	56(%rsi),%r15
+	adcq	64(%rsi),%r8
+	adcq	72(%rsi),%r9
+	adcq	80(%rsi),%r10
+	adcq	88(%rsi),%r11
+	leaq	0(%rdi),%rsi
+
+	movq	%r14,0(%rdi)
+	movq	%r15,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+
+	decl	%edx
+	jnz	.Loop_sqr_383
+
+.byte	102,72,15,126,208
+	movq	%rbx,%rcx
+	movq	24(%rsp),%rbx
+
+
+
+
+
+
+	movq	%r8,%r12
+	movq	%r9,%r13
+
+	call	__mulq_mont_384
+
+	leaq	136(%rsp),%r8
+	movq	136(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-8*23
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_n_mul_mont_383,.-sqr_n_mul_mont_383
+.type	__mulq_mont_383_nonred,@function
+.align	32
+__mulq_mont_383_nonred:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	%rax,%rbp
+	mulq	%r14
+	movq	%rax,%r8
+	movq	%rbp,%rax
+	movq	%rdx,%r9
+
+	mulq	%r15
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%r12
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	movq	%r8,%r15
+	imulq	8(%rsp),%r8
+
+	mulq	%r13
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	32(%rsi)
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	mulq	40(%rsi)
+	addq	%rax,%r13
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	0(%rcx)
+	addq	%rax,%r15
+	movq	%r8,%rax
+	adcq	%rdx,%r15
+
+	mulq	8(%rcx)
+	addq	%rax,%r9
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	16(%rcx)
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	24(%rcx)
+	addq	%r15,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	32(%rcx)
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	40(%rcx)
+	addq	%rax,%r13
+	movq	8(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r13
+	adcq	%rdx,%r14
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	8(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	16(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	movq	%r9,%r8
+	imulq	8(%rsp),%r9
+
+	mulq	24(%rsi)
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	32(%rsi)
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	40(%rsi)
+	addq	%r15,%r14
+	adcq	$0,%rdx
+	addq	%rax,%r14
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	0(%rcx)
+	addq	%rax,%r8
+	movq	%r9,%rax
+	adcq	%rdx,%r8
+
+	mulq	8(%rcx)
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rcx)
+	addq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	24(%rcx)
+	addq	%r8,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	32(%rcx)
+	addq	%rax,%r13
+	movq	%r9,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	40(%rcx)
+	addq	%rax,%r14
+	movq	16(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r8,%r14
+	adcq	%rdx,%r15
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	8(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	16(%rsi)
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	movq	%r10,%r9
+	imulq	8(%rsp),%r10
+
+	mulq	24(%rsi)
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	32(%rsi)
+	addq	%rax,%r14
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r8,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	40(%rsi)
+	addq	%r8,%r15
+	adcq	$0,%rdx
+	addq	%rax,%r15
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r8
+
+	mulq	0(%rcx)
+	addq	%rax,%r9
+	movq	%r10,%rax
+	adcq	%rdx,%r9
+
+	mulq	8(%rcx)
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	16(%rcx)
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	24(%rcx)
+	addq	%r9,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	32(%rcx)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	40(%rcx)
+	addq	%rax,%r15
+	movq	24(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r15
+	adcq	%rdx,%r8
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	8(%rsi)
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	16(%rsi)
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	movq	%r11,%r10
+	imulq	8(%rsp),%r11
+
+	mulq	24(%rsi)
+	addq	%rax,%r14
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	32(%rsi)
+	addq	%rax,%r15
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r9,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	40(%rsi)
+	addq	%r9,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r9
+
+	mulq	0(%rcx)
+	addq	%rax,%r10
+	movq	%r11,%rax
+	adcq	%rdx,%r10
+
+	mulq	8(%rcx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rcx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	24(%rcx)
+	addq	%r10,%r14
+	adcq	$0,%rdx
+	addq	%rax,%r14
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rcx)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	40(%rcx)
+	addq	%rax,%r8
+	movq	32(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r8
+	adcq	%rdx,%r9
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	8(%rsi)
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rsi)
+	addq	%rax,%r14
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	movq	%r12,%r11
+	imulq	8(%rsp),%r12
+
+	mulq	24(%rsi)
+	addq	%rax,%r15
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	32(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	40(%rsi)
+	addq	%r10,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	0(%rcx)
+	addq	%rax,%r11
+	movq	%r12,%rax
+	adcq	%rdx,%r11
+
+	mulq	8(%rcx)
+	addq	%rax,%r13
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	16(%rcx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	24(%rcx)
+	addq	%r11,%r15
+	adcq	$0,%rdx
+	addq	%rax,%r15
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	32(%rcx)
+	addq	%rax,%r8
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rcx)
+	addq	%rax,%r9
+	movq	40(%rbx),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r9
+	adcq	%rdx,%r10
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	8(%rsi)
+	addq	%rax,%r14
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	16(%rsi)
+	addq	%rax,%r15
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	movq	%r13,%r12
+	imulq	8(%rsp),%r13
+
+	mulq	24(%rsi)
+	addq	%rax,%r8
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r8
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	32(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	40(%rsi)
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	0(%rcx)
+	addq	%rax,%r12
+	movq	%r13,%rax
+	adcq	%rdx,%r12
+
+	mulq	8(%rcx)
+	addq	%rax,%r14
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	16(%rcx)
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	24(%rcx)
+	addq	%r12,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	32(%rcx)
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r9
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+	mulq	40(%rcx)
+	addq	%rax,%r10
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	adcq	%rdx,%r11
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulq_mont_383_nonred,.-__mulq_mont_383_nonred
+.globl	sqr_mont_382x
+.hidden	sqr_mont_382x
+.type	sqr_mont_382x,@function
+.align	32
+sqr_mont_382x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	136
+
+
+	movq	%rcx,0(%rsp)
+	movq	%rdx,%rcx
+	movq	%rsi,16(%rsp)
+	movq	%rdi,24(%rsp)
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%r8,%r14
+	addq	48(%rsi),%r8
+	movq	%r9,%r15
+	adcq	56(%rsi),%r9
+	movq	%r10,%rax
+	adcq	64(%rsi),%r10
+	movq	%r11,%rdx
+	adcq	72(%rsi),%r11
+	movq	%r12,%rbx
+	adcq	80(%rsi),%r12
+	movq	%r13,%rbp
+	adcq	88(%rsi),%r13
+
+	subq	48(%rsi),%r14
+	sbbq	56(%rsi),%r15
+	sbbq	64(%rsi),%rax
+	sbbq	72(%rsi),%rdx
+	sbbq	80(%rsi),%rbx
+	sbbq	88(%rsi),%rbp
+	sbbq	%rdi,%rdi
+
+	movq	%r8,32+0(%rsp)
+	movq	%r9,32+8(%rsp)
+	movq	%r10,32+16(%rsp)
+	movq	%r11,32+24(%rsp)
+	movq	%r12,32+32(%rsp)
+	movq	%r13,32+40(%rsp)
+
+	movq	%r14,32+48(%rsp)
+	movq	%r15,32+56(%rsp)
+	movq	%rax,32+64(%rsp)
+	movq	%rdx,32+72(%rsp)
+	movq	%rbx,32+80(%rsp)
+	movq	%rbp,32+88(%rsp)
+	movq	%rdi,32+96(%rsp)
+
+
+
+	leaq	48(%rsi),%rbx
+
+	movq	48(%rsi),%rax
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+
+	movq	24(%rsp),%rdi
+	call	__mulq_mont_383_nonred
+	addq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	movq	%r8,64(%rdi)
+	movq	%r9,72(%rdi)
+	movq	%r10,80(%rdi)
+	movq	%r11,88(%rdi)
+
+	leaq	32(%rsp),%rsi
+	leaq	32+48(%rsp),%rbx
+
+	movq	32+48(%rsp),%rax
+	movq	32+0(%rsp),%r14
+	movq	32+8(%rsp),%r15
+	movq	32+16(%rsp),%r12
+	movq	32+24(%rsp),%r13
+
+	call	__mulq_mont_383_nonred
+	movq	32+96(%rsp),%rsi
+	movq	32+0(%rsp),%r12
+	movq	32+8(%rsp),%r13
+	andq	%rsi,%r12
+	movq	32+16(%rsp),%rax
+	andq	%rsi,%r13
+	movq	32+24(%rsp),%rbx
+	andq	%rsi,%rax
+	movq	32+32(%rsp),%rbp
+	andq	%rsi,%rbx
+	andq	%rsi,%rbp
+	andq	32+40(%rsp),%rsi
+
+	subq	%r12,%r14
+	movq	0(%rcx),%r12
+	sbbq	%r13,%r15
+	movq	8(%rcx),%r13
+	sbbq	%rax,%r8
+	movq	16(%rcx),%rax
+	sbbq	%rbx,%r9
+	movq	24(%rcx),%rbx
+	sbbq	%rbp,%r10
+	movq	32(%rcx),%rbp
+	sbbq	%rsi,%r11
+	sbbq	%rsi,%rsi
+
+	andq	%rsi,%r12
+	andq	%rsi,%r13
+	andq	%rsi,%rax
+	andq	%rsi,%rbx
+	andq	%rsi,%rbp
+	andq	40(%rcx),%rsi
+
+	addq	%r12,%r14
+	adcq	%r13,%r15
+	adcq	%rax,%r8
+	adcq	%rbx,%r9
+	adcq	%rbp,%r10
+	adcq	%rsi,%r11
+
+	movq	%r14,0(%rdi)
+	movq	%r15,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+	leaq	136(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-136-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqr_mont_382x,.-sqr_mont_382x
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/mulx_mont_256-x86_64.s b/blst/elf/mulx_mont_256-x86_64.s
new file mode 100644
index 0000000..20a0207
--- /dev/null
+++ b/blst/elf/mulx_mont_256-x86_64.s
@@ -0,0 +1,627 @@
+.text	
+
+.globl	mulx_mont_sparse_256
+.hidden	mulx_mont_sparse_256
+.type	mulx_mont_sparse_256,@function
+.align	32
+mulx_mont_sparse_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rbp
+	movq	24(%rsi),%r9
+	leaq	-128(%rsi),%rsi
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%r14,%rax,%r11
+	call	__mulx_mont_sparse_256
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mulx_mont_sparse_256,.-mulx_mont_sparse_256
+
+.globl	sqrx_mont_sparse_256
+.hidden	sqrx_mont_sparse_256
+.type	sqrx_mont_sparse_256,@function
+.align	32
+sqrx_mont_sparse_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rsi,%rbx
+	movq	%rcx,%r8
+	movq	%rdx,%rcx
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rbp
+	movq	24(%rsi),%r9
+	leaq	-128(%rbx),%rsi
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%rdx,%rax,%r11
+	call	__mulx_mont_sparse_256
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
+.type	__mulx_mont_sparse_256,@function
+.align	32
+__mulx_mont_sparse_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	mulxq	%r15,%r15,%r12
+	mulxq	%rbp,%rbp,%r13
+	addq	%r15,%r11
+	mulxq	%r9,%r9,%r14
+	movq	8(%rbx),%rdx
+	adcq	%rbp,%r12
+	adcq	%r9,%r13
+	adcq	$0,%r14
+
+	movq	%rax,%r10
+	imulq	%r8,%rax
+
+
+	xorq	%r15,%r15
+	mulxq	0+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r11
+	adcxq	%r9,%r12
+
+	mulxq	8+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r12
+	adcxq	%r9,%r13
+
+	mulxq	16+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r13
+	adcxq	%r9,%r14
+
+	mulxq	24+128(%rsi),%rbp,%r9
+	movq	%rax,%rdx
+	adoxq	%rbp,%r14
+	adcxq	%r15,%r9
+	adoxq	%r9,%r15
+
+
+	mulxq	0+128(%rcx),%rbp,%rax
+	adcxq	%rbp,%r10
+	adoxq	%r11,%rax
+
+	mulxq	8+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%rax
+	adoxq	%r9,%r12
+
+	mulxq	16+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%r12
+	adoxq	%r9,%r13
+
+	mulxq	24+128(%rcx),%rbp,%r9
+	movq	16(%rbx),%rdx
+	adcxq	%rbp,%r13
+	adoxq	%r9,%r14
+	adcxq	%r10,%r14
+	adoxq	%r10,%r15
+	adcxq	%r10,%r15
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+	movq	%rax,%r11
+	imulq	%r8,%rax
+
+
+	xorq	%rbp,%rbp
+	mulxq	0+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r12
+	adcxq	%r9,%r13
+
+	mulxq	8+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r13
+	adcxq	%r9,%r14
+
+	mulxq	16+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r14
+	adcxq	%r9,%r15
+
+	mulxq	24+128(%rsi),%rbp,%r9
+	movq	%rax,%rdx
+	adoxq	%rbp,%r15
+	adcxq	%r10,%r9
+	adoxq	%r9,%r10
+
+
+	mulxq	0+128(%rcx),%rbp,%rax
+	adcxq	%rbp,%r11
+	adoxq	%r12,%rax
+
+	mulxq	8+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%rax
+	adoxq	%r9,%r13
+
+	mulxq	16+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%r13
+	adoxq	%r9,%r14
+
+	mulxq	24+128(%rcx),%rbp,%r9
+	movq	24(%rbx),%rdx
+	adcxq	%rbp,%r14
+	adoxq	%r9,%r15
+	adcxq	%r11,%r15
+	adoxq	%r11,%r10
+	adcxq	%r11,%r10
+	adoxq	%r11,%r11
+	adcq	$0,%r11
+	movq	%rax,%r12
+	imulq	%r8,%rax
+
+
+	xorq	%rbp,%rbp
+	mulxq	0+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r13
+	adcxq	%r9,%r14
+
+	mulxq	8+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r14
+	adcxq	%r9,%r15
+
+	mulxq	16+128(%rsi),%rbp,%r9
+	adoxq	%rbp,%r15
+	adcxq	%r9,%r10
+
+	mulxq	24+128(%rsi),%rbp,%r9
+	movq	%rax,%rdx
+	adoxq	%rbp,%r10
+	adcxq	%r11,%r9
+	adoxq	%r9,%r11
+
+
+	mulxq	0+128(%rcx),%rbp,%rax
+	adcxq	%rbp,%r12
+	adoxq	%r13,%rax
+
+	mulxq	8+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%rax
+	adoxq	%r9,%r14
+
+	mulxq	16+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%r14
+	adoxq	%r9,%r15
+
+	mulxq	24+128(%rcx),%rbp,%r9
+	movq	%rax,%rdx
+	adcxq	%rbp,%r15
+	adoxq	%r9,%r10
+	adcxq	%r12,%r10
+	adoxq	%r12,%r11
+	adcxq	%r12,%r11
+	adoxq	%r12,%r12
+	adcq	$0,%r12
+	imulq	%r8,%rdx
+
+
+	xorq	%rbp,%rbp
+	mulxq	0+128(%rcx),%r13,%r9
+	adcxq	%rax,%r13
+	adoxq	%r9,%r14
+
+	mulxq	8+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%r14
+	adoxq	%r9,%r15
+
+	mulxq	16+128(%rcx),%rbp,%r9
+	adcxq	%rbp,%r15
+	adoxq	%r9,%r10
+
+	mulxq	24+128(%rcx),%rbp,%r9
+	movq	%r14,%rdx
+	leaq	128(%rcx),%rcx
+	adcxq	%rbp,%r10
+	adoxq	%r9,%r11
+	movq	%r15,%rax
+	adcxq	%r13,%r11
+	adoxq	%r13,%r12
+	adcq	$0,%r12
+
+
+
+
+	movq	%r10,%rbp
+	subq	0(%rcx),%r14
+	sbbq	8(%rcx),%r15
+	sbbq	16(%rcx),%r10
+	movq	%r11,%r9
+	sbbq	24(%rcx),%r11
+	sbbq	$0,%r12
+
+	cmovcq	%rdx,%r14
+	cmovcq	%rax,%r15
+	cmovcq	%rbp,%r10
+	movq	%r14,0(%rdi)
+	cmovcq	%r9,%r11
+	movq	%r15,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulx_mont_sparse_256,.-__mulx_mont_sparse_256
+.globl	fromx_mont_256
+.hidden	fromx_mont_256
+.type	fromx_mont_256,@function
+.align	32
+fromx_mont_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulx_by_1_mont_256
+
+
+
+
+
+	movq	%r15,%rdx
+	movq	%r10,%r12
+	movq	%r11,%r13
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r10
+	sbbq	24(%rbx),%r11
+
+	cmovncq	%r14,%rax
+	cmovncq	%r15,%rdx
+	cmovncq	%r10,%r12
+	movq	%rax,0(%rdi)
+	cmovncq	%r11,%r13
+	movq	%rdx,8(%rdi)
+	movq	%r12,16(%rdi)
+	movq	%r13,24(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	fromx_mont_256,.-fromx_mont_256
+
+.globl	redcx_mont_256
+.hidden	redcx_mont_256
+.type	redcx_mont_256,@function
+.align	32
+redcx_mont_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulx_by_1_mont_256
+
+	addq	32(%rsi),%r14
+	adcq	40(%rsi),%r15
+	movq	%r14,%rax
+	adcq	48(%rsi),%r10
+	movq	%r15,%rdx
+	adcq	56(%rsi),%r11
+	sbbq	%rsi,%rsi
+
+
+
+
+	movq	%r10,%r12
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r10
+	movq	%r11,%r13
+	sbbq	24(%rbx),%r11
+	sbbq	$0,%rsi
+
+	cmovncq	%r14,%rax
+	cmovncq	%r15,%rdx
+	cmovncq	%r10,%r12
+	movq	%rax,0(%rdi)
+	cmovncq	%r11,%r13
+	movq	%rdx,8(%rdi)
+	movq	%r12,16(%rdi)
+	movq	%r13,24(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	redcx_mont_256,.-redcx_mont_256
+.type	__mulx_by_1_mont_256,@function
+.align	32
+__mulx_by_1_mont_256:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r11
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+
+	movq	%rax,%r14
+	imulq	%rcx,%rax
+	movq	%rax,%r10
+
+	mulq	0(%rbx)
+	addq	%rax,%r14
+	movq	%r10,%rax
+	adcq	%rdx,%r14
+
+	mulq	8(%rbx)
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	16(%rbx)
+	movq	%r11,%r15
+	imulq	%rcx,%r11
+	addq	%rax,%r12
+	movq	%r10,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	24(%rbx)
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r14
+
+	mulq	0(%rbx)
+	addq	%rax,%r15
+	movq	%r11,%rax
+	adcq	%rdx,%r15
+
+	mulq	8(%rbx)
+	addq	%rax,%r12
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r12
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	16(%rbx)
+	movq	%r12,%r10
+	imulq	%rcx,%r12
+	addq	%rax,%r13
+	movq	%r11,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	24(%rbx)
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	0(%rbx)
+	addq	%rax,%r10
+	movq	%r12,%rax
+	adcq	%rdx,%r10
+
+	mulq	8(%rbx)
+	addq	%rax,%r13
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rbx)
+	movq	%r13,%r11
+	imulq	%rcx,%r13
+	addq	%rax,%r14
+	movq	%r12,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	24(%rbx)
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r10,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	0(%rbx)
+	addq	%rax,%r11
+	movq	%r13,%rax
+	adcq	%rdx,%r11
+
+	mulq	8(%rbx)
+	addq	%rax,%r14
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	16(%rbx)
+	addq	%rax,%r15
+	movq	%r13,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	24(%rbx)
+	addq	%rax,%r10
+	movq	%r14,%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulx_by_1_mont_256,.-__mulx_by_1_mont_256
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/mulx_mont_384-x86_64.s b/blst/elf/mulx_mont_384-x86_64.s
new file mode 100644
index 0000000..9f9f740
--- /dev/null
+++ b/blst/elf/mulx_mont_384-x86_64.s
@@ -0,0 +1,2968 @@
+.text	
+
+
+
+
+
+
+
+.type	__sub_mod_384x384,@function
+.align	32
+__sub_mod_384x384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	movq	48(%rsi),%r14
+
+	subq	0(%rdx),%r8
+	movq	56(%rsi),%r15
+	sbbq	8(%rdx),%r9
+	movq	64(%rsi),%rax
+	sbbq	16(%rdx),%r10
+	movq	72(%rsi),%rbx
+	sbbq	24(%rdx),%r11
+	movq	80(%rsi),%rbp
+	sbbq	32(%rdx),%r12
+	movq	88(%rsi),%rsi
+	sbbq	40(%rdx),%r13
+	movq	%r8,0(%rdi)
+	sbbq	48(%rdx),%r14
+	movq	0(%rcx),%r8
+	movq	%r9,8(%rdi)
+	sbbq	56(%rdx),%r15
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	sbbq	64(%rdx),%rax
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	sbbq	72(%rdx),%rbx
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	sbbq	80(%rdx),%rbp
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	sbbq	88(%rdx),%rsi
+	movq	40(%rcx),%r13
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%r8
+	andq	%rdx,%r9
+	andq	%rdx,%r10
+	andq	%rdx,%r11
+	andq	%rdx,%r12
+	andq	%rdx,%r13
+
+	addq	%r8,%r14
+	adcq	%r9,%r15
+	movq	%r14,48(%rdi)
+	adcq	%r10,%rax
+	movq	%r15,56(%rdi)
+	adcq	%r11,%rbx
+	movq	%rax,64(%rdi)
+	adcq	%r12,%rbp
+	movq	%rbx,72(%rdi)
+	adcq	%r13,%rsi
+	movq	%rbp,80(%rdi)
+	movq	%rsi,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sub_mod_384x384,.-__sub_mod_384x384
+
+.type	__add_mod_384,@function
+.align	32
+__add_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	movq	%r8,%r14
+	adcq	24(%rdx),%r11
+	movq	%r9,%r15
+	adcq	32(%rdx),%r12
+	movq	%r10,%rax
+	adcq	40(%rdx),%r13
+	movq	%r11,%rbx
+	sbbq	%rdx,%rdx
+
+	subq	0(%rcx),%r8
+	sbbq	8(%rcx),%r9
+	movq	%r12,%rbp
+	sbbq	16(%rcx),%r10
+	sbbq	24(%rcx),%r11
+	sbbq	32(%rcx),%r12
+	movq	%r13,%rsi
+	sbbq	40(%rcx),%r13
+	sbbq	$0,%rdx
+
+	cmovcq	%r14,%r8
+	cmovcq	%r15,%r9
+	cmovcq	%rax,%r10
+	movq	%r8,0(%rdi)
+	cmovcq	%rbx,%r11
+	movq	%r9,8(%rdi)
+	cmovcq	%rbp,%r12
+	movq	%r10,16(%rdi)
+	cmovcq	%rsi,%r13
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__add_mod_384,.-__add_mod_384
+
+.type	__sub_mod_384,@function
+.align	32
+__sub_mod_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+__sub_mod_384_a_is_loaded:
+	subq	0(%rdx),%r8
+	movq	0(%rcx),%r14
+	sbbq	8(%rdx),%r9
+	movq	8(%rcx),%r15
+	sbbq	16(%rdx),%r10
+	movq	16(%rcx),%rax
+	sbbq	24(%rdx),%r11
+	movq	24(%rcx),%rbx
+	sbbq	32(%rdx),%r12
+	movq	32(%rcx),%rbp
+	sbbq	40(%rdx),%r13
+	movq	40(%rcx),%rsi
+	sbbq	%rdx,%rdx
+
+	andq	%rdx,%r14
+	andq	%rdx,%r15
+	andq	%rdx,%rax
+	andq	%rdx,%rbx
+	andq	%rdx,%rbp
+	andq	%rdx,%rsi
+
+	addq	%r14,%r8
+	adcq	%r15,%r9
+	movq	%r8,0(%rdi)
+	adcq	%rax,%r10
+	movq	%r9,8(%rdi)
+	adcq	%rbx,%r11
+	movq	%r10,16(%rdi)
+	adcq	%rbp,%r12
+	movq	%r11,24(%rdi)
+	adcq	%rsi,%r13
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sub_mod_384,.-__sub_mod_384
+.globl	mulx_mont_384x
+.hidden	mulx_mont_384x
+.type	mulx_mont_384x,@function
+.align	32
+mulx_mont_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$328,%rsp
+.cfi_adjust_cfa_offset	328
+
+
+	movq	%rdx,%rbx
+	movq	%rdi,32(%rsp)
+	movq	%rsi,24(%rsp)
+	movq	%rdx,16(%rsp)
+	movq	%rcx,8(%rsp)
+	movq	%r8,0(%rsp)
+
+
+
+
+	leaq	40(%rsp),%rdi
+	call	__mulx_384
+
+
+	leaq	48(%rbx),%rbx
+	leaq	128+48(%rsi),%rsi
+	leaq	96(%rdi),%rdi
+	call	__mulx_384
+
+
+	movq	8(%rsp),%rcx
+	leaq	(%rbx),%rsi
+	leaq	-48(%rbx),%rdx
+	leaq	40+192+48(%rsp),%rdi
+	call	__add_mod_384
+
+	movq	24(%rsp),%rsi
+	leaq	48(%rsi),%rdx
+	leaq	-48(%rdi),%rdi
+	call	__add_mod_384
+
+	leaq	(%rdi),%rbx
+	leaq	48(%rdi),%rsi
+	call	__mulx_384
+
+
+	leaq	(%rdi),%rsi
+	leaq	40(%rsp),%rdx
+	movq	8(%rsp),%rcx
+	call	__sub_mod_384x384
+
+	leaq	(%rdi),%rsi
+	leaq	-96(%rdi),%rdx
+	call	__sub_mod_384x384
+
+
+	leaq	40(%rsp),%rsi
+	leaq	40+96(%rsp),%rdx
+	leaq	40(%rsp),%rdi
+	call	__sub_mod_384x384
+
+	leaq	(%rcx),%rbx
+
+
+	leaq	40(%rsp),%rsi
+	movq	0(%rsp),%rcx
+	movq	32(%rsp),%rdi
+	call	__mulx_by_1_mont_384
+	call	__redc_tail_mont_384
+
+
+	leaq	40+192(%rsp),%rsi
+	movq	0(%rsp),%rcx
+	leaq	48(%rdi),%rdi
+	call	__mulx_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	leaq	328(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-328-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mulx_mont_384x,.-mulx_mont_384x
+.globl	sqrx_mont_384x
+.hidden	sqrx_mont_384x
+.type	sqrx_mont_384x,@function
+.align	32
+sqrx_mont_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	136
+
+
+	movq	%rcx,0(%rsp)
+	movq	%rdx,%rcx
+
+	movq	%rdi,16(%rsp)
+	movq	%rsi,24(%rsp)
+
+
+	leaq	48(%rsi),%rdx
+	leaq	32(%rsp),%rdi
+	call	__add_mod_384
+
+
+	movq	24(%rsp),%rsi
+	leaq	48(%rsi),%rdx
+	leaq	32+48(%rsp),%rdi
+	call	__sub_mod_384
+
+
+	movq	24(%rsp),%rsi
+	leaq	48(%rsi),%rbx
+
+	movq	48(%rsi),%rdx
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rdi
+	movq	40(%rsi),%rbp
+	leaq	-128(%rsi),%rsi
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_384
+	addq	%rdx,%rdx
+	adcq	%r15,%r15
+	adcq	%rax,%rax
+	movq	%rdx,%r8
+	adcq	%r12,%r12
+	movq	%r15,%r9
+	adcq	%rdi,%rdi
+	movq	%rax,%r10
+	adcq	%rbp,%rbp
+	movq	%r12,%r11
+	sbbq	%rsi,%rsi
+
+	subq	0(%rcx),%rdx
+	sbbq	8(%rcx),%r15
+	movq	%rdi,%r13
+	sbbq	16(%rcx),%rax
+	sbbq	24(%rcx),%r12
+	sbbq	32(%rcx),%rdi
+	movq	%rbp,%r14
+	sbbq	40(%rcx),%rbp
+	sbbq	$0,%rsi
+
+	cmovcq	%r8,%rdx
+	cmovcq	%r9,%r15
+	cmovcq	%r10,%rax
+	movq	%rdx,48(%rbx)
+	cmovcq	%r11,%r12
+	movq	%r15,56(%rbx)
+	cmovcq	%r13,%rdi
+	movq	%rax,64(%rbx)
+	cmovcq	%r14,%rbp
+	movq	%r12,72(%rbx)
+	movq	%rdi,80(%rbx)
+	movq	%rbp,88(%rbx)
+
+	leaq	32(%rsp),%rsi
+	leaq	32+48(%rsp),%rbx
+
+	movq	32+48(%rsp),%rdx
+	movq	32+0(%rsp),%r14
+	movq	32+8(%rsp),%r15
+	movq	32+16(%rsp),%rax
+	movq	32+24(%rsp),%r12
+	movq	32+32(%rsp),%rdi
+	movq	32+40(%rsp),%rbp
+	leaq	-128(%rsi),%rsi
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_384
+
+	leaq	136(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-136-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_mont_384x,.-sqrx_mont_384x
+
+.globl	mulx_382x
+.hidden	mulx_382x
+.type	mulx_382x,@function
+.align	32
+mulx_382x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	136
+
+
+	leaq	96(%rdi),%rdi
+	movq	%rsi,0(%rsp)
+	movq	%rdx,8(%rsp)
+	movq	%rdi,16(%rsp)
+	movq	%rcx,24(%rsp)
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	addq	48(%rsi),%r8
+	adcq	56(%rsi),%r9
+	adcq	64(%rsi),%r10
+	adcq	72(%rsi),%r11
+	adcq	80(%rsi),%r12
+	adcq	88(%rsi),%r13
+
+	movq	%r8,32+0(%rsp)
+	movq	%r9,32+8(%rsp)
+	movq	%r10,32+16(%rsp)
+	movq	%r11,32+24(%rsp)
+	movq	%r12,32+32(%rsp)
+	movq	%r13,32+40(%rsp)
+
+
+	movq	0(%rdx),%r8
+	movq	8(%rdx),%r9
+	movq	16(%rdx),%r10
+	movq	24(%rdx),%r11
+	movq	32(%rdx),%r12
+	movq	40(%rdx),%r13
+
+	addq	48(%rdx),%r8
+	adcq	56(%rdx),%r9
+	adcq	64(%rdx),%r10
+	adcq	72(%rdx),%r11
+	adcq	80(%rdx),%r12
+	adcq	88(%rdx),%r13
+
+	movq	%r8,32+48(%rsp)
+	movq	%r9,32+56(%rsp)
+	movq	%r10,32+64(%rsp)
+	movq	%r11,32+72(%rsp)
+	movq	%r12,32+80(%rsp)
+	movq	%r13,32+88(%rsp)
+
+
+	leaq	32+0(%rsp),%rsi
+	leaq	32+48(%rsp),%rbx
+	call	__mulx_384
+
+
+	movq	0(%rsp),%rsi
+	movq	8(%rsp),%rbx
+	leaq	-96(%rdi),%rdi
+	call	__mulx_384
+
+
+	leaq	48+128(%rsi),%rsi
+	leaq	48(%rbx),%rbx
+	leaq	32(%rsp),%rdi
+	call	__mulx_384
+
+
+	movq	16(%rsp),%rsi
+	leaq	32(%rsp),%rdx
+	movq	24(%rsp),%rcx
+	movq	%rsi,%rdi
+	call	__sub_mod_384x384
+
+
+	leaq	0(%rdi),%rsi
+	leaq	-96(%rdi),%rdx
+	call	__sub_mod_384x384
+
+
+	leaq	-96(%rdi),%rsi
+	leaq	32(%rsp),%rdx
+	leaq	-96(%rdi),%rdi
+	call	__sub_mod_384x384
+
+	leaq	136(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-136-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mulx_382x,.-mulx_382x
+.globl	sqrx_382x
+.hidden	sqrx_382x
+.type	sqrx_382x,@function
+.align	32
+sqrx_382x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rcx
+
+
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	24(%rsi),%rbx
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rdx
+
+	movq	%r14,%r8
+	addq	48(%rsi),%r14
+	movq	%r15,%r9
+	adcq	56(%rsi),%r15
+	movq	%rax,%r10
+	adcq	64(%rsi),%rax
+	movq	%rbx,%r11
+	adcq	72(%rsi),%rbx
+	movq	%rbp,%r12
+	adcq	80(%rsi),%rbp
+	movq	%rdx,%r13
+	adcq	88(%rsi),%rdx
+
+	movq	%r14,0(%rdi)
+	movq	%r15,8(%rdi)
+	movq	%rax,16(%rdi)
+	movq	%rbx,24(%rdi)
+	movq	%rbp,32(%rdi)
+	movq	%rdx,40(%rdi)
+
+
+	leaq	48(%rsi),%rdx
+	leaq	48(%rdi),%rdi
+	call	__sub_mod_384_a_is_loaded
+
+
+	leaq	(%rdi),%rsi
+	leaq	-48(%rdi),%rbx
+	leaq	-48(%rdi),%rdi
+	call	__mulx_384
+
+
+	movq	(%rsp),%rsi
+	leaq	48(%rsi),%rbx
+	leaq	96(%rdi),%rdi
+	call	__mulx_384
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	64(%rdi),%rax
+	movq	72(%rdi),%rbx
+	movq	80(%rdi),%rbp
+	addq	%r8,%r8
+	movq	88(%rdi),%rdx
+	adcq	%r9,%r9
+	movq	%r8,0(%rdi)
+	adcq	%r10,%r10
+	movq	%r9,8(%rdi)
+	adcq	%r11,%r11
+	movq	%r10,16(%rdi)
+	adcq	%r12,%r12
+	movq	%r11,24(%rdi)
+	adcq	%r13,%r13
+	movq	%r12,32(%rdi)
+	adcq	%r14,%r14
+	movq	%r13,40(%rdi)
+	adcq	%r15,%r15
+	movq	%r14,48(%rdi)
+	adcq	%rax,%rax
+	movq	%r15,56(%rdi)
+	adcq	%rbx,%rbx
+	movq	%rax,64(%rdi)
+	adcq	%rbp,%rbp
+	movq	%rbx,72(%rdi)
+	adcq	%rdx,%rdx
+	movq	%rbp,80(%rdi)
+	movq	%rdx,88(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*7
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_382x,.-sqrx_382x
+.globl	mulx_384
+.hidden	mulx_384
+.type	mulx_384,@function
+.align	32
+mulx_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+
+
+	movq	%rdx,%rbx
+	call	__mulx_384
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mulx_384,.-mulx_384
+
+.type	__mulx_384,@function
+.align	32
+__mulx_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rbx),%rdx
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	leaq	-128(%rsi),%rsi
+
+	mulxq	%r14,%r9,%rcx
+	xorq	%rbp,%rbp
+
+	mulxq	%r15,%r8,%rax
+	adcxq	%rcx,%r8
+	movq	%r9,0(%rdi)
+
+	mulxq	%r10,%r9,%rcx
+	adcxq	%rax,%r9
+
+	mulxq	%r11,%r10,%rax
+	adcxq	%rcx,%r10
+
+	mulxq	%r12,%r11,%rcx
+	adcxq	%rax,%r11
+
+	mulxq	%r13,%r12,%r13
+	movq	8(%rbx),%rdx
+	adcxq	%rcx,%r12
+	adcxq	%rbp,%r13
+	mulxq	%r14,%rax,%rcx
+	adcxq	%r8,%rax
+	adoxq	%rcx,%r9
+	movq	%rax,8(%rdi)
+
+	mulxq	%r15,%r8,%rcx
+	adcxq	%r9,%r8
+	adoxq	%rcx,%r10
+
+	mulxq	128+16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+
+	mulxq	128+24(%rsi),%r10,%rcx
+	adcxq	%r11,%r10
+	adoxq	%rcx,%r12
+
+	mulxq	128+32(%rsi),%r11,%rax
+	adcxq	%r12,%r11
+	adoxq	%r13,%rax
+
+	mulxq	128+40(%rsi),%r12,%r13
+	movq	16(%rbx),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+	mulxq	%r14,%rax,%rcx
+	adcxq	%r8,%rax
+	adoxq	%rcx,%r9
+	movq	%rax,16(%rdi)
+
+	mulxq	%r15,%r8,%rcx
+	adcxq	%r9,%r8
+	adoxq	%rcx,%r10
+
+	mulxq	128+16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+
+	mulxq	128+24(%rsi),%r10,%rcx
+	adcxq	%r11,%r10
+	adoxq	%rcx,%r12
+
+	mulxq	128+32(%rsi),%r11,%rax
+	adcxq	%r12,%r11
+	adoxq	%r13,%rax
+
+	mulxq	128+40(%rsi),%r12,%r13
+	movq	24(%rbx),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+	mulxq	%r14,%rax,%rcx
+	adcxq	%r8,%rax
+	adoxq	%rcx,%r9
+	movq	%rax,24(%rdi)
+
+	mulxq	%r15,%r8,%rcx
+	adcxq	%r9,%r8
+	adoxq	%rcx,%r10
+
+	mulxq	128+16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+
+	mulxq	128+24(%rsi),%r10,%rcx
+	adcxq	%r11,%r10
+	adoxq	%rcx,%r12
+
+	mulxq	128+32(%rsi),%r11,%rax
+	adcxq	%r12,%r11
+	adoxq	%r13,%rax
+
+	mulxq	128+40(%rsi),%r12,%r13
+	movq	32(%rbx),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+	mulxq	%r14,%rax,%rcx
+	adcxq	%r8,%rax
+	adoxq	%rcx,%r9
+	movq	%rax,32(%rdi)
+
+	mulxq	%r15,%r8,%rcx
+	adcxq	%r9,%r8
+	adoxq	%rcx,%r10
+
+	mulxq	128+16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+
+	mulxq	128+24(%rsi),%r10,%rcx
+	adcxq	%r11,%r10
+	adoxq	%rcx,%r12
+
+	mulxq	128+32(%rsi),%r11,%rax
+	adcxq	%r12,%r11
+	adoxq	%r13,%rax
+
+	mulxq	128+40(%rsi),%r12,%r13
+	movq	40(%rbx),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+	mulxq	%r14,%rax,%rcx
+	adcxq	%r8,%rax
+	adoxq	%rcx,%r9
+	movq	%rax,40(%rdi)
+
+	mulxq	%r15,%r8,%rcx
+	adcxq	%r9,%r8
+	adoxq	%rcx,%r10
+
+	mulxq	128+16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+
+	mulxq	128+24(%rsi),%r10,%rcx
+	adcxq	%r11,%r10
+	adoxq	%rcx,%r12
+
+	mulxq	128+32(%rsi),%r11,%rax
+	adcxq	%r12,%r11
+	adoxq	%r13,%rax
+
+	mulxq	128+40(%rsi),%r12,%r13
+	movq	%rax,%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+	movq	%r8,48(%rdi)
+	movq	%r9,56(%rdi)
+	movq	%r10,64(%rdi)
+	movq	%r11,72(%rdi)
+	movq	%r12,80(%rdi)
+	movq	%r13,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulx_384,.-__mulx_384
+.globl	sqrx_384
+.hidden	sqrx_384
+.type	sqrx_384,@function
+.align	32
+sqrx_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	pushq	%rdi
+.cfi_adjust_cfa_offset	8
+
+
+	call	__sqrx_384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_384,.-sqrx_384
+.type	__sqrx_384,@function
+.align	32
+__sqrx_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%rcx
+	movq	32(%rsi),%rbx
+
+
+	mulxq	%r14,%r8,%rdi
+	movq	40(%rsi),%rbp
+	mulxq	%r15,%r9,%rax
+	addq	%rdi,%r9
+	mulxq	%rcx,%r10,%rdi
+	adcq	%rax,%r10
+	mulxq	%rbx,%r11,%rax
+	adcq	%rdi,%r11
+	mulxq	%rbp,%r12,%r13
+	movq	%r14,%rdx
+	adcq	%rax,%r12
+	adcq	$0,%r13
+
+
+	xorq	%r14,%r14
+	mulxq	%r15,%rdi,%rax
+	adcxq	%rdi,%r10
+	adoxq	%rax,%r11
+
+	mulxq	%rcx,%rdi,%rax
+	adcxq	%rdi,%r11
+	adoxq	%rax,%r12
+
+	mulxq	%rbx,%rdi,%rax
+	adcxq	%rdi,%r12
+	adoxq	%rax,%r13
+
+	mulxq	%rbp,%rdi,%rax
+	movq	%r15,%rdx
+	adcxq	%rdi,%r13
+	adoxq	%r14,%rax
+	adcxq	%rax,%r14
+
+
+	xorq	%r15,%r15
+	mulxq	%rcx,%rdi,%rax
+	adcxq	%rdi,%r12
+	adoxq	%rax,%r13
+
+	mulxq	%rbx,%rdi,%rax
+	adcxq	%rdi,%r13
+	adoxq	%rax,%r14
+
+	mulxq	%rbp,%rdi,%rax
+	movq	%rcx,%rdx
+	adcxq	%rdi,%r14
+	adoxq	%r15,%rax
+	adcxq	%rax,%r15
+
+
+	xorq	%rcx,%rcx
+	mulxq	%rbx,%rdi,%rax
+	adcxq	%rdi,%r14
+	adoxq	%rax,%r15
+
+	mulxq	%rbp,%rdi,%rax
+	movq	%rbx,%rdx
+	adcxq	%rdi,%r15
+	adoxq	%rcx,%rax
+	adcxq	%rax,%rcx
+
+
+	mulxq	%rbp,%rdi,%rbx
+	movq	0(%rsi),%rdx
+	addq	%rdi,%rcx
+	movq	8(%rsp),%rdi
+	adcq	$0,%rbx
+
+
+	xorq	%rbp,%rbp
+	adcxq	%r8,%r8
+	adcxq	%r9,%r9
+	adcxq	%r10,%r10
+	adcxq	%r11,%r11
+	adcxq	%r12,%r12
+
+
+	mulxq	%rdx,%rdx,%rax
+	movq	%rdx,0(%rdi)
+	movq	8(%rsi),%rdx
+	adoxq	%rax,%r8
+	movq	%r8,8(%rdi)
+
+	mulxq	%rdx,%r8,%rax
+	movq	16(%rsi),%rdx
+	adoxq	%r8,%r9
+	adoxq	%rax,%r10
+	movq	%r9,16(%rdi)
+	movq	%r10,24(%rdi)
+
+	mulxq	%rdx,%r8,%r9
+	movq	24(%rsi),%rdx
+	adoxq	%r8,%r11
+	adoxq	%r9,%r12
+	adcxq	%r13,%r13
+	adcxq	%r14,%r14
+	movq	%r11,32(%rdi)
+	movq	%r12,40(%rdi)
+
+	mulxq	%rdx,%r8,%r9
+	movq	32(%rsi),%rdx
+	adoxq	%r8,%r13
+	adoxq	%r9,%r14
+	adcxq	%r15,%r15
+	adcxq	%rcx,%rcx
+	movq	%r13,48(%rdi)
+	movq	%r14,56(%rdi)
+
+	mulxq	%rdx,%r8,%r9
+	movq	40(%rsi),%rdx
+	adoxq	%r8,%r15
+	adoxq	%r9,%rcx
+	adcxq	%rbx,%rbx
+	adcxq	%rbp,%rbp
+	movq	%r15,64(%rdi)
+	movq	%rcx,72(%rdi)
+
+	mulxq	%rdx,%r8,%r9
+	adoxq	%r8,%rbx
+	adoxq	%r9,%rbp
+
+	movq	%rbx,80(%rdi)
+	movq	%rbp,88(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__sqrx_384,.-__sqrx_384
+
+
+
+.globl	redcx_mont_384
+.hidden	redcx_mont_384
+.type	redcx_mont_384,@function
+.align	32
+redcx_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulx_by_1_mont_384
+	call	__redc_tail_mont_384
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	redcx_mont_384,.-redcx_mont_384
+
+
+
+
+.globl	fromx_mont_384
+.hidden	fromx_mont_384
+.type	fromx_mont_384,@function
+.align	32
+fromx_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rdx,%rbx
+	call	__mulx_by_1_mont_384
+
+
+
+
+	movq	%r14,%rax
+	movq	%r15,%rcx
+	movq	%r8,%rdx
+	movq	%r9,%rbp
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	movq	%r10,%r13
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	movq	%r11,%rsi
+	sbbq	40(%rbx),%r11
+
+	cmovcq	%rax,%r14
+	cmovcq	%rcx,%r15
+	cmovcq	%rdx,%r8
+	movq	%r14,0(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r15,8(%rdi)
+	cmovcq	%r13,%r10
+	movq	%r8,16(%rdi)
+	cmovcq	%rsi,%r11
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	fromx_mont_384,.-fromx_mont_384
+.type	__mulx_by_1_mont_384,@function
+.align	32
+__mulx_by_1_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	%rcx,%rdx
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+	imulq	%r8,%rdx
+
+
+	xorq	%r14,%r14
+	mulxq	0(%rbx),%rax,%rbp
+	adcxq	%rax,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	8(%rbx),%rax,%rbp
+	adcxq	%rax,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	16(%rbx),%rax,%rbp
+	adcxq	%rax,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	24(%rbx),%rax,%rbp
+	adcxq	%rax,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	32(%rbx),%rax,%rbp
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	40(%rbx),%rax,%rbp
+	movq	%rcx,%rdx
+	adcxq	%rax,%r13
+	adoxq	%r14,%rbp
+	adcxq	%rbp,%r14
+	imulq	%r9,%rdx
+
+
+	xorq	%r15,%r15
+	mulxq	0(%rbx),%rax,%rbp
+	adcxq	%rax,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8(%rbx),%rax,%rbp
+	adcxq	%rax,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16(%rbx),%rax,%rbp
+	adcxq	%rax,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24(%rbx),%rax,%rbp
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	32(%rbx),%rax,%rbp
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	40(%rbx),%rax,%rbp
+	movq	%rcx,%rdx
+	adcxq	%rax,%r14
+	adoxq	%r15,%rbp
+	adcxq	%rbp,%r15
+	imulq	%r10,%rdx
+
+
+	xorq	%r8,%r8
+	mulxq	0(%rbx),%rax,%rbp
+	adcxq	%rax,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8(%rbx),%rax,%rbp
+	adcxq	%rax,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16(%rbx),%rax,%rbp
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24(%rbx),%rax,%rbp
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	32(%rbx),%rax,%rbp
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	40(%rbx),%rax,%rbp
+	movq	%rcx,%rdx
+	adcxq	%rax,%r15
+	adoxq	%r8,%rbp
+	adcxq	%rbp,%r8
+	imulq	%r11,%rdx
+
+
+	xorq	%r9,%r9
+	mulxq	0(%rbx),%rax,%rbp
+	adcxq	%rax,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8(%rbx),%rax,%rbp
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16(%rbx),%rax,%rbp
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	24(%rbx),%rax,%rbp
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	32(%rbx),%rax,%rbp
+	adcxq	%rax,%r15
+	adoxq	%rbp,%r8
+
+	mulxq	40(%rbx),%rax,%rbp
+	movq	%rcx,%rdx
+	adcxq	%rax,%r8
+	adoxq	%r9,%rbp
+	adcxq	%rbp,%r9
+	imulq	%r12,%rdx
+
+
+	xorq	%r10,%r10
+	mulxq	0(%rbx),%rax,%rbp
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	8(%rbx),%rax,%rbp
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	16(%rbx),%rax,%rbp
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	24(%rbx),%rax,%rbp
+	adcxq	%rax,%r15
+	adoxq	%rbp,%r8
+
+	mulxq	32(%rbx),%rax,%rbp
+	adcxq	%rax,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	40(%rbx),%rax,%rbp
+	movq	%rcx,%rdx
+	adcxq	%rax,%r9
+	adoxq	%r10,%rbp
+	adcxq	%rbp,%r10
+	imulq	%r13,%rdx
+
+
+	xorq	%r11,%r11
+	mulxq	0(%rbx),%rax,%rbp
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	8(%rbx),%rax,%rbp
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	16(%rbx),%rax,%rbp
+	adcxq	%rax,%r15
+	adoxq	%rbp,%r8
+
+	mulxq	24(%rbx),%rax,%rbp
+	adcxq	%rax,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	32(%rbx),%rax,%rbp
+	adcxq	%rax,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	40(%rbx),%rax,%rbp
+	movq	%rcx,%rdx
+	adcxq	%rax,%r10
+	adoxq	%r11,%rbp
+	adcxq	%rbp,%r11
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__mulx_by_1_mont_384,.-__mulx_by_1_mont_384
+
+.type	__redc_tail_mont_384,@function
+.align	32
+__redc_tail_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	addq	48(%rsi),%r14
+	movq	%r14,%rax
+	adcq	56(%rsi),%r15
+	adcq	64(%rsi),%r8
+	adcq	72(%rsi),%r9
+	movq	%r15,%rcx
+	adcq	80(%rsi),%r10
+	adcq	88(%rsi),%r11
+	sbbq	%r12,%r12
+
+
+
+
+	movq	%r8,%rdx
+	movq	%r9,%rbp
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	movq	%r10,%r13
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	movq	%r11,%rsi
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%r12
+
+	cmovcq	%rax,%r14
+	cmovcq	%rcx,%r15
+	cmovcq	%rdx,%r8
+	movq	%r14,0(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r15,8(%rdi)
+	cmovcq	%r13,%r10
+	movq	%r8,16(%rdi)
+	cmovcq	%rsi,%r11
+	movq	%r9,24(%rdi)
+	movq	%r10,32(%rdi)
+	movq	%r11,40(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	__redc_tail_mont_384,.-__redc_tail_mont_384
+
+.globl	sgn0x_pty_mont_384
+.hidden	sgn0x_pty_mont_384
+.type	sgn0x_pty_mont_384,@function
+.align	32
+sgn0x_pty_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rsi,%rbx
+	leaq	0(%rdi),%rsi
+	movq	%rdx,%rcx
+	call	__mulx_by_1_mont_384
+
+	xorq	%rax,%rax
+	movq	%r14,%r13
+	addq	%r14,%r14
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%rax
+
+	subq	0(%rbx),%r14
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%rax
+
+	notq	%rax
+	andq	$1,%r13
+	andq	$2,%rax
+	orq	%r13,%rax
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sgn0x_pty_mont_384,.-sgn0x_pty_mont_384
+
+.globl	sgn0x_pty_mont_384x
+.hidden	sgn0x_pty_mont_384x
+.type	sgn0x_pty_mont_384x,@function
+.align	32
+sgn0x_pty_mont_384x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$8,%rsp
+.cfi_adjust_cfa_offset	8
+
+
+	movq	%rsi,%rbx
+	leaq	48(%rdi),%rsi
+	movq	%rdx,%rcx
+	call	__mulx_by_1_mont_384
+
+	movq	%r14,%r12
+	orq	%r15,%r14
+	orq	%r8,%r14
+	orq	%r9,%r14
+	orq	%r10,%r14
+	orq	%r11,%r14
+
+	leaq	0(%rdi),%rsi
+	xorq	%rdi,%rdi
+	movq	%r12,%r13
+	addq	%r12,%r12
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%rdi
+
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%rdi
+
+	movq	%r14,0(%rsp)
+	notq	%rdi
+	andq	$1,%r13
+	andq	$2,%rdi
+	orq	%r13,%rdi
+
+	call	__mulx_by_1_mont_384
+
+	movq	%r14,%r12
+	orq	%r15,%r14
+	orq	%r8,%r14
+	orq	%r9,%r14
+	orq	%r10,%r14
+	orq	%r11,%r14
+
+	xorq	%rax,%rax
+	movq	%r12,%r13
+	addq	%r12,%r12
+	adcq	%r15,%r15
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	$0,%rax
+
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r15
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	sbbq	32(%rbx),%r10
+	sbbq	40(%rbx),%r11
+	sbbq	$0,%rax
+
+	movq	0(%rsp),%r12
+
+	notq	%rax
+
+	testq	%r14,%r14
+	cmovzq	%rdi,%r13
+
+	testq	%r12,%r12
+	cmovnzq	%rdi,%rax
+
+	andq	$1,%r13
+	andq	$2,%rax
+	orq	%r13,%rax
+
+	movq	8(%rsp),%r15
+.cfi_restore	%r15
+	movq	16(%rsp),%r14
+.cfi_restore	%r14
+	movq	24(%rsp),%r13
+.cfi_restore	%r13
+	movq	32(%rsp),%r12
+.cfi_restore	%r12
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	48(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	56(%rsp),%rsp
+.cfi_adjust_cfa_offset	-56
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x
+.globl	mulx_mont_384
+.hidden	mulx_mont_384
+.type	mulx_mont_384,@function
+.align	32
+mulx_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	leaq	-24(%rsp),%rsp
+.cfi_adjust_cfa_offset	8*3
+
+
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	24(%rsi),%r12
+	movq	%rdi,16(%rsp)
+	movq	32(%rsi),%rdi
+	movq	40(%rsi),%rbp
+	leaq	-128(%rsi),%rsi
+	leaq	-128(%rcx),%rcx
+	movq	%r8,(%rsp)
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_384
+
+	movq	24(%rsp),%r15
+.cfi_restore	%r15
+	movq	32(%rsp),%r14
+.cfi_restore	%r14
+	movq	40(%rsp),%r13
+.cfi_restore	%r13
+	movq	48(%rsp),%r12
+.cfi_restore	%r12
+	movq	56(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	64(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	72(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*9
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	mulx_mont_384,.-mulx_mont_384
+.type	__mulx_mont_384,@function
+.align	32
+__mulx_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	mulxq	%r15,%r14,%r10
+	mulxq	%rax,%r15,%r11
+	addq	%r14,%r9
+	mulxq	%r12,%rax,%r12
+	adcq	%r15,%r10
+	mulxq	%rdi,%rdi,%r13
+	adcq	%rax,%r11
+	mulxq	%rbp,%rbp,%r14
+	movq	8(%rbx),%rdx
+	adcq	%rdi,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,16(%rsp)
+	imulq	8(%rsp),%r8
+
+
+	xorq	%rax,%rax
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r9
+	adcxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r10
+	adcxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r8,%rdx
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+	adoxq	%rax,%r15
+	adoxq	%rax,%rax
+
+
+	xorq	%r8,%r8
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	16(%rsp),%rdi
+	adoxq	%rbp,%r9
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	16(%rbx),%rdx
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+	adcxq	%r8,%r14
+	adoxq	%r8,%r15
+	adcxq	%r8,%r15
+	adoxq	%r8,%rax
+	adcxq	%r8,%rax
+	movq	%r9,16(%rsp)
+	imulq	8(%rsp),%r9
+
+
+	xorq	%r8,%r8
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r10
+	adcxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r9,%rdx
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+	adoxq	%r8,%rax
+	adoxq	%r8,%r8
+
+
+	xorq	%r9,%r9
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	16(%rsp),%rdi
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	24(%rbx),%rdx
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+	adcxq	%r9,%r15
+	adoxq	%r9,%rax
+	adcxq	%r9,%rax
+	adoxq	%r9,%r8
+	adcxq	%r9,%r8
+	movq	%r10,16(%rsp)
+	imulq	8(%rsp),%r10
+
+
+	xorq	%r9,%r9
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r10,%rdx
+	adoxq	%rdi,%rax
+	adcxq	%rbp,%r8
+	adoxq	%r9,%r8
+	adoxq	%r9,%r9
+
+
+	xorq	%r10,%r10
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	16(%rsp),%rdi
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	32(%rbx),%rdx
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+	adcxq	%r10,%rax
+	adoxq	%r10,%r8
+	adcxq	%r10,%r8
+	adoxq	%r10,%r9
+	adcxq	%r10,%r9
+	movq	%r11,16(%rsp)
+	imulq	8(%rsp),%r11
+
+
+	xorq	%r10,%r10
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%rax
+	adcxq	%rbp,%r8
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r11,%rdx
+	adoxq	%rdi,%r8
+	adcxq	%rbp,%r9
+	adoxq	%r10,%r9
+	adoxq	%r10,%r10
+
+
+	xorq	%r11,%r11
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	16(%rsp),%rdi
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	40(%rbx),%rdx
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r8
+	adcxq	%r11,%r8
+	adoxq	%r11,%r9
+	adcxq	%r11,%r9
+	adoxq	%r11,%r10
+	adcxq	%r11,%r10
+	movq	%r12,16(%rsp)
+	imulq	8(%rsp),%r12
+
+
+	xorq	%r11,%r11
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%rax
+	adcxq	%rbp,%r8
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r8
+	adcxq	%rbp,%r9
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r12,%rdx
+	adoxq	%rdi,%r9
+	adcxq	%rbp,%r10
+	adoxq	%r11,%r10
+	adoxq	%r11,%r11
+
+
+	xorq	%r12,%r12
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	16(%rsp),%rdi
+	adoxq	%rbp,%r13
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r8
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	%r13,%rdx
+	adcxq	%rdi,%r8
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r9
+	adoxq	%r12,%r10
+	adcxq	%r12,%r10
+	adoxq	%r12,%r11
+	adcxq	%r12,%r11
+	imulq	8(%rsp),%rdx
+	movq	24(%rsp),%rbx
+
+
+	xorq	%r12,%r12
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r8
+	movq	%r15,%r13
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r8
+	adoxq	%rbp,%r9
+	movq	%rax,%rsi
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r9
+	adoxq	%rbp,%r10
+	movq	%r14,%rdx
+	adcxq	%r12,%r10
+	adoxq	%r12,%r11
+	leaq	128(%rcx),%rcx
+	movq	%r8,%r12
+	adcq	$0,%r11
+
+
+
+
+	subq	0(%rcx),%r14
+	sbbq	8(%rcx),%r15
+	movq	%r9,%rdi
+	sbbq	16(%rcx),%rax
+	sbbq	24(%rcx),%r8
+	sbbq	32(%rcx),%r9
+	movq	%r10,%rbp
+	sbbq	40(%rcx),%r10
+	sbbq	$0,%r11
+
+	cmovncq	%r14,%rdx
+	cmovcq	%r13,%r15
+	cmovcq	%rsi,%rax
+	cmovncq	%r8,%r12
+	movq	%rdx,0(%rbx)
+	cmovncq	%r9,%rdi
+	movq	%r15,8(%rbx)
+	cmovncq	%r10,%rbp
+	movq	%rax,16(%rbx)
+	movq	%r12,24(%rbx)
+	movq	%rdi,32(%rbx)
+	movq	%rbp,40(%rbx)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__mulx_mont_384,.-__mulx_mont_384
+.globl	sqrx_mont_384
+.hidden	sqrx_mont_384
+.type	sqrx_mont_384,@function
+.align	32
+sqrx_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	leaq	-24(%rsp),%rsp
+.cfi_adjust_cfa_offset	8*3
+
+
+	movq	%rcx,%r8
+	leaq	-128(%rdx),%rcx
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	24(%rsi),%r12
+	movq	%rdi,16(%rsp)
+	movq	32(%rsi),%rdi
+	movq	40(%rsi),%rbp
+
+	leaq	(%rsi),%rbx
+	movq	%r8,(%rsp)
+	leaq	-128(%rsi),%rsi
+
+	mulxq	%rdx,%r8,%r9
+	call	__mulx_mont_384
+
+	movq	24(%rsp),%r15
+.cfi_restore	%r15
+	movq	32(%rsp),%r14
+.cfi_restore	%r14
+	movq	40(%rsp),%r13
+.cfi_restore	%r13
+	movq	48(%rsp),%r12
+.cfi_restore	%r12
+	movq	56(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	64(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	72(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*9
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_mont_384,.-sqrx_mont_384
+
+.globl	sqrx_n_mul_mont_384
+.hidden	sqrx_n_mul_mont_384
+.type	sqrx_n_mul_mont_384,@function
+.align	32
+sqrx_n_mul_mont_384:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	leaq	-40(%rsp),%rsp
+.cfi_adjust_cfa_offset	8*5
+
+
+	movq	%rdx,%r10
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	%rsi,%rbx
+	movq	24(%rsi),%r12
+	movq	%rdi,16(%rsp)
+	movq	32(%rsi),%rdi
+	movq	40(%rsi),%rbp
+
+	movq	%r8,(%rsp)
+	movq	%r9,24(%rsp)
+	movq	0(%r9),%xmm2
+
+.Loop_sqrx_384:
+	movd	%r10d,%xmm1
+	leaq	-128(%rbx),%rsi
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%rdx,%r8,%r9
+	call	__mulx_mont_384
+
+	movd	%xmm1,%r10d
+	decl	%r10d
+	jnz	.Loop_sqrx_384
+
+	movq	%rdx,%r14
+.byte	102,72,15,126,210
+	leaq	-128(%rbx),%rsi
+	movq	24(%rsp),%rbx
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_384
+
+	movq	40(%rsp),%r15
+.cfi_restore	%r15
+	movq	48(%rsp),%r14
+.cfi_restore	%r14
+	movq	56(%rsp),%r13
+.cfi_restore	%r13
+	movq	64(%rsp),%r12
+.cfi_restore	%r12
+	movq	72(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	80(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	88(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*11
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384
+
+.globl	sqrx_n_mul_mont_383
+.hidden	sqrx_n_mul_mont_383
+.type	sqrx_n_mul_mont_383,@function
+.align	32
+sqrx_n_mul_mont_383:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	leaq	-40(%rsp),%rsp
+.cfi_adjust_cfa_offset	8*5
+
+
+	movq	%rdx,%r10
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	%rsi,%rbx
+	movq	24(%rsi),%r12
+	movq	%rdi,16(%rsp)
+	movq	32(%rsi),%rdi
+	movq	40(%rsi),%rbp
+
+	movq	%r8,(%rsp)
+	movq	%r9,24(%rsp)
+	movq	0(%r9),%xmm2
+	leaq	-128(%rcx),%rcx
+
+.Loop_sqrx_383:
+	movd	%r10d,%xmm1
+	leaq	-128(%rbx),%rsi
+
+	mulxq	%rdx,%r8,%r9
+	call	__mulx_mont_383_nonred
+
+	movd	%xmm1,%r10d
+	decl	%r10d
+	jnz	.Loop_sqrx_383
+
+	movq	%rdx,%r14
+.byte	102,72,15,126,210
+	leaq	-128(%rbx),%rsi
+	movq	24(%rsp),%rbx
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_384
+
+	movq	40(%rsp),%r15
+.cfi_restore	%r15
+	movq	48(%rsp),%r14
+.cfi_restore	%r14
+	movq	56(%rsp),%r13
+.cfi_restore	%r13
+	movq	64(%rsp),%r12
+.cfi_restore	%r12
+	movq	72(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	80(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	88(%rsp),%rsp
+.cfi_adjust_cfa_offset	-8*11
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383
+.type	__mulx_mont_383_nonred,@function
+.align	32
+__mulx_mont_383_nonred:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	mulxq	%r15,%r14,%r10
+	mulxq	%rax,%r15,%r11
+	addq	%r14,%r9
+	mulxq	%r12,%rax,%r12
+	adcq	%r15,%r10
+	mulxq	%rdi,%rdi,%r13
+	adcq	%rax,%r11
+	mulxq	%rbp,%rbp,%r14
+	movq	8(%rbx),%rdx
+	adcq	%rdi,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r14
+	movq	%r8,%rax
+	imulq	8(%rsp),%r8
+
+
+	xorq	%r15,%r15
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r9
+	adcxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r10
+	adcxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r8,%rdx
+	adoxq	%rdi,%r14
+	adcxq	%r15,%rbp
+	adoxq	%rbp,%r15
+
+
+	xorq	%r8,%r8
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r9
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	16(%rbx),%rdx
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+	adcxq	%rax,%r14
+	adoxq	%rax,%r15
+	adcxq	%rax,%r15
+	movq	%r9,%r8
+	imulq	8(%rsp),%r9
+
+
+	xorq	%rax,%rax
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r10
+	adcxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r9,%rdx
+	adoxq	%rdi,%r15
+	adcxq	%rax,%rbp
+	adoxq	%rbp,%rax
+
+
+	xorq	%r9,%r9
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r8
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	24(%rbx),%rdx
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+	adcxq	%r8,%r15
+	adoxq	%r8,%rax
+	adcxq	%r8,%rax
+	movq	%r10,%r9
+	imulq	8(%rsp),%r10
+
+
+	xorq	%r8,%r8
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r11
+	adcxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r10,%rdx
+	adoxq	%rdi,%rax
+	adcxq	%r8,%rbp
+	adoxq	%rbp,%r8
+
+
+	xorq	%r10,%r10
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r9
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	32(%rbx),%rdx
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+	adcxq	%r9,%rax
+	adoxq	%r9,%r8
+	adcxq	%r9,%r8
+	movq	%r11,%r10
+	imulq	8(%rsp),%r11
+
+
+	xorq	%r9,%r9
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r12
+	adcxq	%rbp,%r13
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%rax
+	adcxq	%rbp,%r8
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r11,%rdx
+	adoxq	%rdi,%r8
+	adcxq	%r9,%rbp
+	adoxq	%rbp,%r9
+
+
+	xorq	%r11,%r11
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r10
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	40(%rbx),%rdx
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r8
+	adcxq	%r10,%r8
+	adoxq	%r10,%r9
+	adcxq	%r10,%r9
+	movq	%r12,%r11
+	imulq	8(%rsp),%r12
+
+
+	xorq	%r10,%r10
+	mulxq	0+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	8+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r14
+	adcxq	%rbp,%r15
+
+	mulxq	16+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r15
+	adcxq	%rbp,%rax
+
+	mulxq	24+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%rax
+	adcxq	%rbp,%r8
+
+	mulxq	32+128(%rsi),%rdi,%rbp
+	adoxq	%rdi,%r8
+	adcxq	%rbp,%r9
+
+	mulxq	40+128(%rsi),%rdi,%rbp
+	movq	%r12,%rdx
+	adoxq	%rdi,%r9
+	adcxq	%r10,%rbp
+	adoxq	%rbp,%r10
+
+
+	xorq	%r12,%r12
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r11
+	adoxq	%rbp,%r13
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r8
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	%r13,%rdx
+	adcxq	%rdi,%r8
+	adoxq	%rbp,%r9
+	adcxq	%r11,%r9
+	adoxq	%r11,%r10
+	adcxq	%r11,%r10
+	imulq	8(%rsp),%rdx
+	movq	24(%rsp),%rbx
+
+
+	xorq	%r12,%r12
+	mulxq	0+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r13
+	adoxq	%rbp,%r14
+
+	mulxq	8+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r14
+	adoxq	%rbp,%r15
+
+	mulxq	16+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r15
+	adoxq	%rbp,%rax
+
+	mulxq	24+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%rax
+	adoxq	%rbp,%r8
+
+	mulxq	32+128(%rcx),%rdi,%rbp
+	adcxq	%rdi,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	40+128(%rcx),%rdi,%rbp
+	movq	%r14,%rdx
+	adcxq	%rdi,%r9
+	adoxq	%rbp,%r10
+	adcq	$0,%r10
+	movq	%r8,%r12
+
+	movq	%r14,0(%rbx)
+	movq	%r15,8(%rbx)
+	movq	%rax,16(%rbx)
+	movq	%r9,%rdi
+	movq	%r8,24(%rbx)
+	movq	%r9,32(%rbx)
+	movq	%r10,40(%rbx)
+	movq	%r10,%rbp
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__mulx_mont_383_nonred,.-__mulx_mont_383_nonred
+.globl	sqrx_mont_382x
+.hidden	sqrx_mont_382x
+.type	sqrx_mont_382x,@function
+.align	32
+sqrx_mont_382x:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$136,%rsp
+.cfi_adjust_cfa_offset	136
+
+
+	movq	%rcx,0(%rsp)
+	movq	%rdx,%rcx
+	movq	%rdi,16(%rsp)
+	movq	%rsi,24(%rsp)
+
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	32(%rsi),%r12
+	movq	40(%rsi),%r13
+
+	movq	%r8,%r14
+	addq	48(%rsi),%r8
+	movq	%r9,%r15
+	adcq	56(%rsi),%r9
+	movq	%r10,%rax
+	adcq	64(%rsi),%r10
+	movq	%r11,%rdx
+	adcq	72(%rsi),%r11
+	movq	%r12,%rbx
+	adcq	80(%rsi),%r12
+	movq	%r13,%rbp
+	adcq	88(%rsi),%r13
+
+	subq	48(%rsi),%r14
+	sbbq	56(%rsi),%r15
+	sbbq	64(%rsi),%rax
+	sbbq	72(%rsi),%rdx
+	sbbq	80(%rsi),%rbx
+	sbbq	88(%rsi),%rbp
+	sbbq	%rdi,%rdi
+
+	movq	%r8,32+0(%rsp)
+	movq	%r9,32+8(%rsp)
+	movq	%r10,32+16(%rsp)
+	movq	%r11,32+24(%rsp)
+	movq	%r12,32+32(%rsp)
+	movq	%r13,32+40(%rsp)
+
+	movq	%r14,32+48(%rsp)
+	movq	%r15,32+56(%rsp)
+	movq	%rax,32+64(%rsp)
+	movq	%rdx,32+72(%rsp)
+	movq	%rbx,32+80(%rsp)
+	movq	%rbp,32+88(%rsp)
+	movq	%rdi,32+96(%rsp)
+
+
+
+	leaq	48(%rsi),%rbx
+
+	movq	48(%rsi),%rdx
+	movq	0(%rsi),%r14
+	movq	8(%rsi),%r15
+	movq	16(%rsi),%rax
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rdi
+	movq	40(%rsi),%rbp
+	leaq	-128(%rsi),%rsi
+	leaq	-128(%rcx),%rcx
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_383_nonred
+	addq	%rdx,%rdx
+	adcq	%r15,%r15
+	adcq	%rax,%rax
+	adcq	%r12,%r12
+	adcq	%rdi,%rdi
+	adcq	%rbp,%rbp
+
+	movq	%rdx,48(%rbx)
+	movq	%r15,56(%rbx)
+	movq	%rax,64(%rbx)
+	movq	%r12,72(%rbx)
+	movq	%rdi,80(%rbx)
+	movq	%rbp,88(%rbx)
+
+	leaq	32-128(%rsp),%rsi
+	leaq	32+48(%rsp),%rbx
+
+	movq	32+48(%rsp),%rdx
+	movq	32+0(%rsp),%r14
+	movq	32+8(%rsp),%r15
+	movq	32+16(%rsp),%rax
+	movq	32+24(%rsp),%r12
+	movq	32+32(%rsp),%rdi
+	movq	32+40(%rsp),%rbp
+
+
+
+	mulxq	%r14,%r8,%r9
+	call	__mulx_mont_383_nonred
+	movq	32+96(%rsp),%r14
+	leaq	128(%rcx),%rcx
+	movq	32+0(%rsp),%r8
+	andq	%r14,%r8
+	movq	32+8(%rsp),%r9
+	andq	%r14,%r9
+	movq	32+16(%rsp),%r10
+	andq	%r14,%r10
+	movq	32+24(%rsp),%r11
+	andq	%r14,%r11
+	movq	32+32(%rsp),%r13
+	andq	%r14,%r13
+	andq	32+40(%rsp),%r14
+
+	subq	%r8,%rdx
+	movq	0(%rcx),%r8
+	sbbq	%r9,%r15
+	movq	8(%rcx),%r9
+	sbbq	%r10,%rax
+	movq	16(%rcx),%r10
+	sbbq	%r11,%r12
+	movq	24(%rcx),%r11
+	sbbq	%r13,%rdi
+	movq	32(%rcx),%r13
+	sbbq	%r14,%rbp
+	sbbq	%r14,%r14
+
+	andq	%r14,%r8
+	andq	%r14,%r9
+	andq	%r14,%r10
+	andq	%r14,%r11
+	andq	%r14,%r13
+	andq	40(%rcx),%r14
+
+	addq	%r8,%rdx
+	adcq	%r9,%r15
+	adcq	%r10,%rax
+	adcq	%r11,%r12
+	adcq	%r13,%rdi
+	adcq	%r14,%rbp
+
+	movq	%rdx,0(%rbx)
+	movq	%r15,8(%rbx)
+	movq	%rax,16(%rbx)
+	movq	%r12,24(%rbx)
+	movq	%rdi,32(%rbx)
+	movq	%rbp,40(%rbx)
+	leaq	136(%rsp),%r8
+	movq	0(%r8),%r15
+.cfi_restore	%r15
+	movq	8(%r8),%r14
+.cfi_restore	%r14
+	movq	16(%r8),%r13
+.cfi_restore	%r13
+	movq	24(%r8),%r12
+.cfi_restore	%r12
+	movq	32(%r8),%rbx
+.cfi_restore	%rbx
+	movq	40(%r8),%rbp
+.cfi_restore	%rbp
+	leaq	48(%r8),%rsp
+.cfi_adjust_cfa_offset	-136-8*6
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sqrx_mont_382x,.-sqrx_mont_382x
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/sha256-armv8.S b/blst/elf/sha256-armv8.S
new file mode 100644
index 0000000..7341dec
--- /dev/null
+++ b/blst/elf/sha256-armv8.S
@@ -0,0 +1,1077 @@
+//
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// ====================================================================
+// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+// project.
+// ====================================================================
+//
+// sha256_block procedure for ARMv8.
+//
+// This module is stripped of scalar code paths, with the rationale that all
+// known processors are NEON-capable.
+//
+// See original module at CRYPTOGAMS for further details.
+
+.text
+
+.align	6
+.type	.LK256,%object
+.LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
+.size	.LK256,.-.LK256
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
+.align	2
+.align	2
+.globl	blst_sha256_block_armv8
+.type	blst_sha256_block_armv8,%function
+.align	6
+blst_sha256_block_armv8:
+.Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adr	x3,.LK256
+
+.Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,.Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+.size	blst_sha256_block_armv8,.-blst_sha256_block_armv8
+.globl	blst_sha256_block_data_order
+.type	blst_sha256_block_data_order,%function
+.align	4
+blst_sha256_block_data_order:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	sub	sp,sp,#16*4
+
+	adr	x16,.LK256
+	add	x2,x1,x2,lsl#6	// len to point at the end of inp
+
+	ld1	{v0.16b},[x1], #16
+	ld1	{v1.16b},[x1], #16
+	ld1	{v2.16b},[x1], #16
+	ld1	{v3.16b},[x1], #16
+	ld1	{v4.4s},[x16], #16
+	ld1	{v5.4s},[x16], #16
+	ld1	{v6.4s},[x16], #16
+	ld1	{v7.4s},[x16], #16
+	rev32	v0.16b,v0.16b		// yes, even on
+	rev32	v1.16b,v1.16b		// big-endian
+	rev32	v2.16b,v2.16b
+	rev32	v3.16b,v3.16b
+	mov	x17,sp
+	add	v4.4s,v4.4s,v0.4s
+	add	v5.4s,v5.4s,v1.4s
+	add	v6.4s,v6.4s,v2.4s
+	st1	{v4.4s,v5.4s},[x17], #32
+	add	v7.4s,v7.4s,v3.4s
+	st1	{v6.4s,v7.4s},[x17]
+	sub	x17,x17,#32
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#8]
+	ldp	w7,w8,[x0,#16]
+	ldp	w9,w10,[x0,#24]
+	ldr	w12,[sp,#0]
+	mov	w13,wzr
+	eor	w14,w4,w5
+	mov	w15,wzr
+	b	.L_00_48
+
+.align	4
+.L_00_48:
+	ext	v4.16b,v0.16b,v1.16b,#4
+	add	w10,w10,w12
+	add	w3,w3,w15
+	and	w12,w8,w7
+	bic	w15,w9,w7
+	ext	v7.16b,v2.16b,v3.16b,#4
+	eor	w11,w7,w7,ror#5
+	add	w3,w3,w13
+	mov	d19,v3.d[1]
+	orr	w12,w12,w15
+	eor	w11,w11,w7,ror#19
+	ushr	v6.4s,v4.4s,#7
+	eor	w15,w3,w3,ror#11
+	ushr	v5.4s,v4.4s,#3
+	add	w10,w10,w12
+	add	v0.4s,v0.4s,v7.4s
+	ror	w11,w11,#6
+	sli	v6.4s,v4.4s,#25
+	eor	w13,w3,w4
+	eor	w15,w15,w3,ror#20
+	ushr	v7.4s,v4.4s,#18
+	add	w10,w10,w11
+	ldr	w12,[sp,#4]
+	and	w14,w14,w13
+	eor	v5.16b,v5.16b,v6.16b
+	ror	w15,w15,#2
+	add	w6,w6,w10
+	sli	v7.4s,v4.4s,#14
+	eor	w14,w14,w4
+	ushr	v16.4s,v19.4s,#17
+	add	w9,w9,w12
+	add	w10,w10,w15
+	and	w12,w7,w6
+	eor	v5.16b,v5.16b,v7.16b
+	bic	w15,w8,w6
+	eor	w11,w6,w6,ror#5
+	sli	v16.4s,v19.4s,#15
+	add	w10,w10,w14
+	orr	w12,w12,w15
+	ushr	v17.4s,v19.4s,#10
+	eor	w11,w11,w6,ror#19
+	eor	w15,w10,w10,ror#11
+	ushr	v7.4s,v19.4s,#19
+	add	w9,w9,w12
+	ror	w11,w11,#6
+	add	v0.4s,v0.4s,v5.4s
+	eor	w14,w10,w3
+	eor	w15,w15,w10,ror#20
+	sli	v7.4s,v19.4s,#13
+	add	w9,w9,w11
+	ldr	w12,[sp,#8]
+	and	w13,w13,w14
+	eor	v17.16b,v17.16b,v16.16b
+	ror	w15,w15,#2
+	add	w5,w5,w9
+	eor	w13,w13,w3
+	eor	v17.16b,v17.16b,v7.16b
+	add	w8,w8,w12
+	add	w9,w9,w15
+	and	w12,w6,w5
+	add	v0.4s,v0.4s,v17.4s
+	bic	w15,w7,w5
+	eor	w11,w5,w5,ror#5
+	add	w9,w9,w13
+	ushr	v18.4s,v0.4s,#17
+	orr	w12,w12,w15
+	ushr	v19.4s,v0.4s,#10
+	eor	w11,w11,w5,ror#19
+	eor	w15,w9,w9,ror#11
+	sli	v18.4s,v0.4s,#15
+	add	w8,w8,w12
+	ushr	v17.4s,v0.4s,#19
+	ror	w11,w11,#6
+	eor	w13,w9,w10
+	eor	v19.16b,v19.16b,v18.16b
+	eor	w15,w15,w9,ror#20
+	add	w8,w8,w11
+	sli	v17.4s,v0.4s,#13
+	ldr	w12,[sp,#12]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	ld1	{v4.4s},[x16], #16
+	add	w4,w4,w8
+	eor	v19.16b,v19.16b,v17.16b
+	eor	w14,w14,w10
+	eor	v17.16b,v17.16b,v17.16b
+	add	w7,w7,w12
+	add	w8,w8,w15
+	and	w12,w5,w4
+	mov	v17.d[1],v19.d[0]
+	bic	w15,w6,w4
+	eor	w11,w4,w4,ror#5
+	add	w8,w8,w14
+	add	v0.4s,v0.4s,v17.4s
+	orr	w12,w12,w15
+	eor	w11,w11,w4,ror#19
+	eor	w15,w8,w8,ror#11
+	add	v4.4s,v4.4s,v0.4s
+	add	w7,w7,w12
+	ror	w11,w11,#6
+	eor	w14,w8,w9
+	eor	w15,w15,w8,ror#20
+	add	w7,w7,w11
+	ldr	w12,[sp,#16]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w3,w3,w7
+	eor	w13,w13,w9
+	st1	{v4.4s},[x17], #16
+	ext	v4.16b,v1.16b,v2.16b,#4
+	add	w6,w6,w12
+	add	w7,w7,w15
+	and	w12,w4,w3
+	bic	w15,w5,w3
+	ext	v7.16b,v3.16b,v0.16b,#4
+	eor	w11,w3,w3,ror#5
+	add	w7,w7,w13
+	mov	d19,v0.d[1]
+	orr	w12,w12,w15
+	eor	w11,w11,w3,ror#19
+	ushr	v6.4s,v4.4s,#7
+	eor	w15,w7,w7,ror#11
+	ushr	v5.4s,v4.4s,#3
+	add	w6,w6,w12
+	add	v1.4s,v1.4s,v7.4s
+	ror	w11,w11,#6
+	sli	v6.4s,v4.4s,#25
+	eor	w13,w7,w8
+	eor	w15,w15,w7,ror#20
+	ushr	v7.4s,v4.4s,#18
+	add	w6,w6,w11
+	ldr	w12,[sp,#20]
+	and	w14,w14,w13
+	eor	v5.16b,v5.16b,v6.16b
+	ror	w15,w15,#2
+	add	w10,w10,w6
+	sli	v7.4s,v4.4s,#14
+	eor	w14,w14,w8
+	ushr	v16.4s,v19.4s,#17
+	add	w5,w5,w12
+	add	w6,w6,w15
+	and	w12,w3,w10
+	eor	v5.16b,v5.16b,v7.16b
+	bic	w15,w4,w10
+	eor	w11,w10,w10,ror#5
+	sli	v16.4s,v19.4s,#15
+	add	w6,w6,w14
+	orr	w12,w12,w15
+	ushr	v17.4s,v19.4s,#10
+	eor	w11,w11,w10,ror#19
+	eor	w15,w6,w6,ror#11
+	ushr	v7.4s,v19.4s,#19
+	add	w5,w5,w12
+	ror	w11,w11,#6
+	add	v1.4s,v1.4s,v5.4s
+	eor	w14,w6,w7
+	eor	w15,w15,w6,ror#20
+	sli	v7.4s,v19.4s,#13
+	add	w5,w5,w11
+	ldr	w12,[sp,#24]
+	and	w13,w13,w14
+	eor	v17.16b,v17.16b,v16.16b
+	ror	w15,w15,#2
+	add	w9,w9,w5
+	eor	w13,w13,w7
+	eor	v17.16b,v17.16b,v7.16b
+	add	w4,w4,w12
+	add	w5,w5,w15
+	and	w12,w10,w9
+	add	v1.4s,v1.4s,v17.4s
+	bic	w15,w3,w9
+	eor	w11,w9,w9,ror#5
+	add	w5,w5,w13
+	ushr	v18.4s,v1.4s,#17
+	orr	w12,w12,w15
+	ushr	v19.4s,v1.4s,#10
+	eor	w11,w11,w9,ror#19
+	eor	w15,w5,w5,ror#11
+	sli	v18.4s,v1.4s,#15
+	add	w4,w4,w12
+	ushr	v17.4s,v1.4s,#19
+	ror	w11,w11,#6
+	eor	w13,w5,w6
+	eor	v19.16b,v19.16b,v18.16b
+	eor	w15,w15,w5,ror#20
+	add	w4,w4,w11
+	sli	v17.4s,v1.4s,#13
+	ldr	w12,[sp,#28]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	ld1	{v4.4s},[x16], #16
+	add	w8,w8,w4
+	eor	v19.16b,v19.16b,v17.16b
+	eor	w14,w14,w6
+	eor	v17.16b,v17.16b,v17.16b
+	add	w3,w3,w12
+	add	w4,w4,w15
+	and	w12,w9,w8
+	mov	v17.d[1],v19.d[0]
+	bic	w15,w10,w8
+	eor	w11,w8,w8,ror#5
+	add	w4,w4,w14
+	add	v1.4s,v1.4s,v17.4s
+	orr	w12,w12,w15
+	eor	w11,w11,w8,ror#19
+	eor	w15,w4,w4,ror#11
+	add	v4.4s,v4.4s,v1.4s
+	add	w3,w3,w12
+	ror	w11,w11,#6
+	eor	w14,w4,w5
+	eor	w15,w15,w4,ror#20
+	add	w3,w3,w11
+	ldr	w12,[sp,#32]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w7,w7,w3
+	eor	w13,w13,w5
+	st1	{v4.4s},[x17], #16
+	ext	v4.16b,v2.16b,v3.16b,#4
+	add	w10,w10,w12
+	add	w3,w3,w15
+	and	w12,w8,w7
+	bic	w15,w9,w7
+	ext	v7.16b,v0.16b,v1.16b,#4
+	eor	w11,w7,w7,ror#5
+	add	w3,w3,w13
+	mov	d19,v1.d[1]
+	orr	w12,w12,w15
+	eor	w11,w11,w7,ror#19
+	ushr	v6.4s,v4.4s,#7
+	eor	w15,w3,w3,ror#11
+	ushr	v5.4s,v4.4s,#3
+	add	w10,w10,w12
+	add	v2.4s,v2.4s,v7.4s
+	ror	w11,w11,#6
+	sli	v6.4s,v4.4s,#25
+	eor	w13,w3,w4
+	eor	w15,w15,w3,ror#20
+	ushr	v7.4s,v4.4s,#18
+	add	w10,w10,w11
+	ldr	w12,[sp,#36]
+	and	w14,w14,w13
+	eor	v5.16b,v5.16b,v6.16b
+	ror	w15,w15,#2
+	add	w6,w6,w10
+	sli	v7.4s,v4.4s,#14
+	eor	w14,w14,w4
+	ushr	v16.4s,v19.4s,#17
+	add	w9,w9,w12
+	add	w10,w10,w15
+	and	w12,w7,w6
+	eor	v5.16b,v5.16b,v7.16b
+	bic	w15,w8,w6
+	eor	w11,w6,w6,ror#5
+	sli	v16.4s,v19.4s,#15
+	add	w10,w10,w14
+	orr	w12,w12,w15
+	ushr	v17.4s,v19.4s,#10
+	eor	w11,w11,w6,ror#19
+	eor	w15,w10,w10,ror#11
+	ushr	v7.4s,v19.4s,#19
+	add	w9,w9,w12
+	ror	w11,w11,#6
+	add	v2.4s,v2.4s,v5.4s
+	eor	w14,w10,w3
+	eor	w15,w15,w10,ror#20
+	sli	v7.4s,v19.4s,#13
+	add	w9,w9,w11
+	ldr	w12,[sp,#40]
+	and	w13,w13,w14
+	eor	v17.16b,v17.16b,v16.16b
+	ror	w15,w15,#2
+	add	w5,w5,w9
+	eor	w13,w13,w3
+	eor	v17.16b,v17.16b,v7.16b
+	add	w8,w8,w12
+	add	w9,w9,w15
+	and	w12,w6,w5
+	add	v2.4s,v2.4s,v17.4s
+	bic	w15,w7,w5
+	eor	w11,w5,w5,ror#5
+	add	w9,w9,w13
+	ushr	v18.4s,v2.4s,#17
+	orr	w12,w12,w15
+	ushr	v19.4s,v2.4s,#10
+	eor	w11,w11,w5,ror#19
+	eor	w15,w9,w9,ror#11
+	sli	v18.4s,v2.4s,#15
+	add	w8,w8,w12
+	ushr	v17.4s,v2.4s,#19
+	ror	w11,w11,#6
+	eor	w13,w9,w10
+	eor	v19.16b,v19.16b,v18.16b
+	eor	w15,w15,w9,ror#20
+	add	w8,w8,w11
+	sli	v17.4s,v2.4s,#13
+	ldr	w12,[sp,#44]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	ld1	{v4.4s},[x16], #16
+	add	w4,w4,w8
+	eor	v19.16b,v19.16b,v17.16b
+	eor	w14,w14,w10
+	eor	v17.16b,v17.16b,v17.16b
+	add	w7,w7,w12
+	add	w8,w8,w15
+	and	w12,w5,w4
+	mov	v17.d[1],v19.d[0]
+	bic	w15,w6,w4
+	eor	w11,w4,w4,ror#5
+	add	w8,w8,w14
+	add	v2.4s,v2.4s,v17.4s
+	orr	w12,w12,w15
+	eor	w11,w11,w4,ror#19
+	eor	w15,w8,w8,ror#11
+	add	v4.4s,v4.4s,v2.4s
+	add	w7,w7,w12
+	ror	w11,w11,#6
+	eor	w14,w8,w9
+	eor	w15,w15,w8,ror#20
+	add	w7,w7,w11
+	ldr	w12,[sp,#48]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w3,w3,w7
+	eor	w13,w13,w9
+	st1	{v4.4s},[x17], #16
+	ext	v4.16b,v3.16b,v0.16b,#4
+	add	w6,w6,w12
+	add	w7,w7,w15
+	and	w12,w4,w3
+	bic	w15,w5,w3
+	ext	v7.16b,v1.16b,v2.16b,#4
+	eor	w11,w3,w3,ror#5
+	add	w7,w7,w13
+	mov	d19,v2.d[1]
+	orr	w12,w12,w15
+	eor	w11,w11,w3,ror#19
+	ushr	v6.4s,v4.4s,#7
+	eor	w15,w7,w7,ror#11
+	ushr	v5.4s,v4.4s,#3
+	add	w6,w6,w12
+	add	v3.4s,v3.4s,v7.4s
+	ror	w11,w11,#6
+	sli	v6.4s,v4.4s,#25
+	eor	w13,w7,w8
+	eor	w15,w15,w7,ror#20
+	ushr	v7.4s,v4.4s,#18
+	add	w6,w6,w11
+	ldr	w12,[sp,#52]
+	and	w14,w14,w13
+	eor	v5.16b,v5.16b,v6.16b
+	ror	w15,w15,#2
+	add	w10,w10,w6
+	sli	v7.4s,v4.4s,#14
+	eor	w14,w14,w8
+	ushr	v16.4s,v19.4s,#17
+	add	w5,w5,w12
+	add	w6,w6,w15
+	and	w12,w3,w10
+	eor	v5.16b,v5.16b,v7.16b
+	bic	w15,w4,w10
+	eor	w11,w10,w10,ror#5
+	sli	v16.4s,v19.4s,#15
+	add	w6,w6,w14
+	orr	w12,w12,w15
+	ushr	v17.4s,v19.4s,#10
+	eor	w11,w11,w10,ror#19
+	eor	w15,w6,w6,ror#11
+	ushr	v7.4s,v19.4s,#19
+	add	w5,w5,w12
+	ror	w11,w11,#6
+	add	v3.4s,v3.4s,v5.4s
+	eor	w14,w6,w7
+	eor	w15,w15,w6,ror#20
+	sli	v7.4s,v19.4s,#13
+	add	w5,w5,w11
+	ldr	w12,[sp,#56]
+	and	w13,w13,w14
+	eor	v17.16b,v17.16b,v16.16b
+	ror	w15,w15,#2
+	add	w9,w9,w5
+	eor	w13,w13,w7
+	eor	v17.16b,v17.16b,v7.16b
+	add	w4,w4,w12
+	add	w5,w5,w15
+	and	w12,w10,w9
+	add	v3.4s,v3.4s,v17.4s
+	bic	w15,w3,w9
+	eor	w11,w9,w9,ror#5
+	add	w5,w5,w13
+	ushr	v18.4s,v3.4s,#17
+	orr	w12,w12,w15
+	ushr	v19.4s,v3.4s,#10
+	eor	w11,w11,w9,ror#19
+	eor	w15,w5,w5,ror#11
+	sli	v18.4s,v3.4s,#15
+	add	w4,w4,w12
+	ushr	v17.4s,v3.4s,#19
+	ror	w11,w11,#6
+	eor	w13,w5,w6
+	eor	v19.16b,v19.16b,v18.16b
+	eor	w15,w15,w5,ror#20
+	add	w4,w4,w11
+	sli	v17.4s,v3.4s,#13
+	ldr	w12,[sp,#60]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	ld1	{v4.4s},[x16], #16
+	add	w8,w8,w4
+	eor	v19.16b,v19.16b,v17.16b
+	eor	w14,w14,w6
+	eor	v17.16b,v17.16b,v17.16b
+	add	w3,w3,w12
+	add	w4,w4,w15
+	and	w12,w9,w8
+	mov	v17.d[1],v19.d[0]
+	bic	w15,w10,w8
+	eor	w11,w8,w8,ror#5
+	add	w4,w4,w14
+	add	v3.4s,v3.4s,v17.4s
+	orr	w12,w12,w15
+	eor	w11,w11,w8,ror#19
+	eor	w15,w4,w4,ror#11
+	add	v4.4s,v4.4s,v3.4s
+	add	w3,w3,w12
+	ror	w11,w11,#6
+	eor	w14,w4,w5
+	eor	w15,w15,w4,ror#20
+	add	w3,w3,w11
+	ldr	w12,[x16]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w7,w7,w3
+	eor	w13,w13,w5
+	st1	{v4.4s},[x17], #16
+	cmp	w12,#0				// check for K256 terminator
+	ldr	w12,[sp,#0]
+	sub	x17,x17,#64
+	bne	.L_00_48
+
+	sub	x16,x16,#256		// rewind x16
+	cmp	x1,x2
+	mov	x17, #64
+	csel	x17, x17, xzr, eq
+	sub	x1,x1,x17			// avoid SEGV
+	mov	x17,sp
+	add	w10,w10,w12
+	add	w3,w3,w15
+	and	w12,w8,w7
+	ld1	{v0.16b},[x1],#16
+	bic	w15,w9,w7
+	eor	w11,w7,w7,ror#5
+	ld1	{v4.4s},[x16],#16
+	add	w3,w3,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w7,ror#19
+	eor	w15,w3,w3,ror#11
+	rev32	v0.16b,v0.16b
+	add	w10,w10,w12
+	ror	w11,w11,#6
+	eor	w13,w3,w4
+	eor	w15,w15,w3,ror#20
+	add	v4.4s,v4.4s,v0.4s
+	add	w10,w10,w11
+	ldr	w12,[sp,#4]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w6,w6,w10
+	eor	w14,w14,w4
+	add	w9,w9,w12
+	add	w10,w10,w15
+	and	w12,w7,w6
+	bic	w15,w8,w6
+	eor	w11,w6,w6,ror#5
+	add	w10,w10,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w6,ror#19
+	eor	w15,w10,w10,ror#11
+	add	w9,w9,w12
+	ror	w11,w11,#6
+	eor	w14,w10,w3
+	eor	w15,w15,w10,ror#20
+	add	w9,w9,w11
+	ldr	w12,[sp,#8]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w5,w5,w9
+	eor	w13,w13,w3
+	add	w8,w8,w12
+	add	w9,w9,w15
+	and	w12,w6,w5
+	bic	w15,w7,w5
+	eor	w11,w5,w5,ror#5
+	add	w9,w9,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w5,ror#19
+	eor	w15,w9,w9,ror#11
+	add	w8,w8,w12
+	ror	w11,w11,#6
+	eor	w13,w9,w10
+	eor	w15,w15,w9,ror#20
+	add	w8,w8,w11
+	ldr	w12,[sp,#12]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w4,w4,w8
+	eor	w14,w14,w10
+	add	w7,w7,w12
+	add	w8,w8,w15
+	and	w12,w5,w4
+	bic	w15,w6,w4
+	eor	w11,w4,w4,ror#5
+	add	w8,w8,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w4,ror#19
+	eor	w15,w8,w8,ror#11
+	add	w7,w7,w12
+	ror	w11,w11,#6
+	eor	w14,w8,w9
+	eor	w15,w15,w8,ror#20
+	add	w7,w7,w11
+	ldr	w12,[sp,#16]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w3,w3,w7
+	eor	w13,w13,w9
+	st1	{v4.4s},[x17], #16
+	add	w6,w6,w12
+	add	w7,w7,w15
+	and	w12,w4,w3
+	ld1	{v1.16b},[x1],#16
+	bic	w15,w5,w3
+	eor	w11,w3,w3,ror#5
+	ld1	{v4.4s},[x16],#16
+	add	w7,w7,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w3,ror#19
+	eor	w15,w7,w7,ror#11
+	rev32	v1.16b,v1.16b
+	add	w6,w6,w12
+	ror	w11,w11,#6
+	eor	w13,w7,w8
+	eor	w15,w15,w7,ror#20
+	add	v4.4s,v4.4s,v1.4s
+	add	w6,w6,w11
+	ldr	w12,[sp,#20]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w10,w10,w6
+	eor	w14,w14,w8
+	add	w5,w5,w12
+	add	w6,w6,w15
+	and	w12,w3,w10
+	bic	w15,w4,w10
+	eor	w11,w10,w10,ror#5
+	add	w6,w6,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w10,ror#19
+	eor	w15,w6,w6,ror#11
+	add	w5,w5,w12
+	ror	w11,w11,#6
+	eor	w14,w6,w7
+	eor	w15,w15,w6,ror#20
+	add	w5,w5,w11
+	ldr	w12,[sp,#24]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w9,w9,w5
+	eor	w13,w13,w7
+	add	w4,w4,w12
+	add	w5,w5,w15
+	and	w12,w10,w9
+	bic	w15,w3,w9
+	eor	w11,w9,w9,ror#5
+	add	w5,w5,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w9,ror#19
+	eor	w15,w5,w5,ror#11
+	add	w4,w4,w12
+	ror	w11,w11,#6
+	eor	w13,w5,w6
+	eor	w15,w15,w5,ror#20
+	add	w4,w4,w11
+	ldr	w12,[sp,#28]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w8,w8,w4
+	eor	w14,w14,w6
+	add	w3,w3,w12
+	add	w4,w4,w15
+	and	w12,w9,w8
+	bic	w15,w10,w8
+	eor	w11,w8,w8,ror#5
+	add	w4,w4,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w8,ror#19
+	eor	w15,w4,w4,ror#11
+	add	w3,w3,w12
+	ror	w11,w11,#6
+	eor	w14,w4,w5
+	eor	w15,w15,w4,ror#20
+	add	w3,w3,w11
+	ldr	w12,[sp,#32]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w7,w7,w3
+	eor	w13,w13,w5
+	st1	{v4.4s},[x17], #16
+	add	w10,w10,w12
+	add	w3,w3,w15
+	and	w12,w8,w7
+	ld1	{v2.16b},[x1],#16
+	bic	w15,w9,w7
+	eor	w11,w7,w7,ror#5
+	ld1	{v4.4s},[x16],#16
+	add	w3,w3,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w7,ror#19
+	eor	w15,w3,w3,ror#11
+	rev32	v2.16b,v2.16b
+	add	w10,w10,w12
+	ror	w11,w11,#6
+	eor	w13,w3,w4
+	eor	w15,w15,w3,ror#20
+	add	v4.4s,v4.4s,v2.4s
+	add	w10,w10,w11
+	ldr	w12,[sp,#36]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w6,w6,w10
+	eor	w14,w14,w4
+	add	w9,w9,w12
+	add	w10,w10,w15
+	and	w12,w7,w6
+	bic	w15,w8,w6
+	eor	w11,w6,w6,ror#5
+	add	w10,w10,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w6,ror#19
+	eor	w15,w10,w10,ror#11
+	add	w9,w9,w12
+	ror	w11,w11,#6
+	eor	w14,w10,w3
+	eor	w15,w15,w10,ror#20
+	add	w9,w9,w11
+	ldr	w12,[sp,#40]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w5,w5,w9
+	eor	w13,w13,w3
+	add	w8,w8,w12
+	add	w9,w9,w15
+	and	w12,w6,w5
+	bic	w15,w7,w5
+	eor	w11,w5,w5,ror#5
+	add	w9,w9,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w5,ror#19
+	eor	w15,w9,w9,ror#11
+	add	w8,w8,w12
+	ror	w11,w11,#6
+	eor	w13,w9,w10
+	eor	w15,w15,w9,ror#20
+	add	w8,w8,w11
+	ldr	w12,[sp,#44]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w4,w4,w8
+	eor	w14,w14,w10
+	add	w7,w7,w12
+	add	w8,w8,w15
+	and	w12,w5,w4
+	bic	w15,w6,w4
+	eor	w11,w4,w4,ror#5
+	add	w8,w8,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w4,ror#19
+	eor	w15,w8,w8,ror#11
+	add	w7,w7,w12
+	ror	w11,w11,#6
+	eor	w14,w8,w9
+	eor	w15,w15,w8,ror#20
+	add	w7,w7,w11
+	ldr	w12,[sp,#48]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w3,w3,w7
+	eor	w13,w13,w9
+	st1	{v4.4s},[x17], #16
+	add	w6,w6,w12
+	add	w7,w7,w15
+	and	w12,w4,w3
+	ld1	{v3.16b},[x1],#16
+	bic	w15,w5,w3
+	eor	w11,w3,w3,ror#5
+	ld1	{v4.4s},[x16],#16
+	add	w7,w7,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w3,ror#19
+	eor	w15,w7,w7,ror#11
+	rev32	v3.16b,v3.16b
+	add	w6,w6,w12
+	ror	w11,w11,#6
+	eor	w13,w7,w8
+	eor	w15,w15,w7,ror#20
+	add	v4.4s,v4.4s,v3.4s
+	add	w6,w6,w11
+	ldr	w12,[sp,#52]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w10,w10,w6
+	eor	w14,w14,w8
+	add	w5,w5,w12
+	add	w6,w6,w15
+	and	w12,w3,w10
+	bic	w15,w4,w10
+	eor	w11,w10,w10,ror#5
+	add	w6,w6,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w10,ror#19
+	eor	w15,w6,w6,ror#11
+	add	w5,w5,w12
+	ror	w11,w11,#6
+	eor	w14,w6,w7
+	eor	w15,w15,w6,ror#20
+	add	w5,w5,w11
+	ldr	w12,[sp,#56]
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w9,w9,w5
+	eor	w13,w13,w7
+	add	w4,w4,w12
+	add	w5,w5,w15
+	and	w12,w10,w9
+	bic	w15,w3,w9
+	eor	w11,w9,w9,ror#5
+	add	w5,w5,w13
+	orr	w12,w12,w15
+	eor	w11,w11,w9,ror#19
+	eor	w15,w5,w5,ror#11
+	add	w4,w4,w12
+	ror	w11,w11,#6
+	eor	w13,w5,w6
+	eor	w15,w15,w5,ror#20
+	add	w4,w4,w11
+	ldr	w12,[sp,#60]
+	and	w14,w14,w13
+	ror	w15,w15,#2
+	add	w8,w8,w4
+	eor	w14,w14,w6
+	add	w3,w3,w12
+	add	w4,w4,w15
+	and	w12,w9,w8
+	bic	w15,w10,w8
+	eor	w11,w8,w8,ror#5
+	add	w4,w4,w14
+	orr	w12,w12,w15
+	eor	w11,w11,w8,ror#19
+	eor	w15,w4,w4,ror#11
+	add	w3,w3,w12
+	ror	w11,w11,#6
+	eor	w14,w4,w5
+	eor	w15,w15,w4,ror#20
+	add	w3,w3,w11
+	and	w13,w13,w14
+	ror	w15,w15,#2
+	add	w7,w7,w3
+	eor	w13,w13,w5
+	st1	{v4.4s},[x17], #16
+	add	w3,w3,w15			// h+=Sigma0(a) from the past
+	ldp	w11,w12,[x0,#0]
+	add	w3,w3,w13			// h+=Maj(a,b,c) from the past
+	ldp	w13,w14,[x0,#8]
+	add	w3,w3,w11			// accumulate
+	add	w4,w4,w12
+	ldp	w11,w12,[x0,#16]
+	add	w5,w5,w13
+	add	w6,w6,w14
+	ldp	w13,w14,[x0,#24]
+	add	w7,w7,w11
+	add	w8,w8,w12
+	ldr	w12,[sp,#0]
+	stp	w3,w4,[x0,#0]
+	add	w9,w9,w13
+	mov	w13,wzr
+	stp	w5,w6,[x0,#8]
+	add	w10,w10,w14
+	stp	w7,w8,[x0,#16]
+	eor	w14,w4,w5
+	stp	w9,w10,[x0,#24]
+	mov	w15,wzr
+	mov	x17,sp
+	b.ne	.L_00_48
+
+	ldr	x29,[x29]
+	add	sp,sp,#16*4+16
+	ret
+.size	blst_sha256_block_data_order,.-blst_sha256_block_data_order
+.globl	blst_sha256_emit
+.hidden	blst_sha256_emit
+.type	blst_sha256_emit,%function
+.align	4
+blst_sha256_emit:
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+#ifndef	__AARCH64EB__
+	rev	x4,x4
+	rev	x5,x5
+	rev	x6,x6
+	rev	x7,x7
+#endif
+	str	w4,[x0,#4]
+	lsr	x4,x4,#32
+	str	w5,[x0,#12]
+	lsr	x5,x5,#32
+	str	w6,[x0,#20]
+	lsr	x6,x6,#32
+	str	w7,[x0,#28]
+	lsr	x7,x7,#32
+	str	w4,[x0,#0]
+	str	w5,[x0,#8]
+	str	w6,[x0,#16]
+	str	w7,[x0,#24]
+	ret
+.size	blst_sha256_emit,.-blst_sha256_emit
+
+.globl	blst_sha256_bcopy
+.hidden	blst_sha256_bcopy
+.type	blst_sha256_bcopy,%function
+.align	4
+blst_sha256_bcopy:
+.Loop_bcopy:
+	ldrb	w3,[x1],#1
+	sub	x2,x2,#1
+	strb	w3,[x0],#1
+	cbnz	x2,.Loop_bcopy
+	ret
+.size	blst_sha256_bcopy,.-blst_sha256_bcopy
+
+.globl	blst_sha256_hcopy
+.hidden	blst_sha256_hcopy
+.type	blst_sha256_hcopy,%function
+.align	4
+blst_sha256_hcopy:
+	ldp	x4,x5,[x1]
+	ldp	x6,x7,[x1,#16]
+	stp	x4,x5,[x0]
+	stp	x6,x7,[x0,#16]
+	ret
+.size	blst_sha256_hcopy,.-blst_sha256_hcopy
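Note (not part of the diff): the file above exports four low-level SHA-256 primitives — blst_sha256_block_data_order, blst_sha256_emit, blst_sha256_bcopy and blst_sha256_hcopy. The sketch below shows how they could be composed into a one-shot SHA-256. The prototypes are assumptions inferred from the register usage visible in the assembly (x0/rdi = destination or state, x1/rsi = source, x2/rdx = byte length or 64-byte block count); sha256_oneshot is a hypothetical helper, not part of blst's public API.

/* Hedged usage sketch. Prototypes below are inferred from the assembly's
 * calling convention, not copied from a header in this diff. */
#include <stdint.h>
#include <stddef.h>

void blst_sha256_block_data_order(uint32_t *h, const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char *md, const uint32_t *h);   /* big-endian digest */
void blst_sha256_bcopy(void *dst, const void *src, size_t len);
void blst_sha256_hcopy(uint32_t *dst, const uint32_t *src);    /* copies 8 state words */

/* One-shot SHA-256 built on the exported primitives (illustration only). */
static void sha256_oneshot(unsigned char md[32], const void *msg, size_t len)
{
    uint32_t h[8] = { 0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU,
                      0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U };
    unsigned char tail[128] = {0};
    size_t full = len / 64, rem = len % 64;
    size_t tblocks = (rem + 9 > 64) ? 2 : 1;     /* room for 0x80 + 64-bit length */
    uint64_t bits = (uint64_t)len * 8;

    if (full)                                    /* whole 64-byte blocks */
        blst_sha256_block_data_order(h, msg, full);

    if (rem)  /* bcopy's copy loop is do/while-style, so skip it for len == 0 */
        blst_sha256_bcopy(tail, (const unsigned char *)msg + full * 64, rem);
    tail[rem] = 0x80;                            /* padding byte */
    for (int i = 0; i < 8; i++)                  /* big-endian bit length at the end */
        tail[tblocks * 64 - 1 - i] = (unsigned char)(bits >> (8 * i));

    blst_sha256_block_data_order(h, tail, tblocks);
    blst_sha256_emit(md, h);                     /* serialize state as the digest */
}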
diff --git a/blst/elf/sha256-portable-x86_64.s b/blst/elf/sha256-portable-x86_64.s
new file mode 100644
index 0000000..20b5c41
--- /dev/null
+++ b/blst/elf/sha256-portable-x86_64.s
@@ -0,0 +1,1754 @@
+.text	
+
+.globl	blst_sha256_block_data_order
+.type	blst_sha256_block_data_order,@function
+.align	16
+blst_sha256_block_data_order:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$64+24,%rsp
+.cfi_adjust_cfa_offset	16*4+3*8
+	leaq	(%rsi,%rdx,4),%rdx
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movl	%ebx,%edi
+	leaq	K256(%rip),%rbp
+	xorl	%ecx,%edi
+	movl	0(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	0(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	addl	%r14d,%r11d
+	movl	4(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	4(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	addl	%r14d,%r10d
+	movl	8(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	8(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	addl	%r14d,%r9d
+	movl	12(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	12(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	addl	%r14d,%r8d
+	movl	16(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	16(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	addl	%r14d,%edx
+	movl	20(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	20(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	addl	%r14d,%ecx
+	movl	24(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	24(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	addl	%r14d,%ebx
+	movl	28(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	28(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	addl	%r14d,%eax
+	movl	32(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	32(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	addl	%r14d,%r11d
+	movl	36(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	36(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	addl	%r14d,%r10d
+	movl	40(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	40(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	addl	%r14d,%r9d
+	movl	44(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	44(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	addl	%r14d,%r8d
+	movl	48(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	48(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	addl	%r14d,%edx
+	movl	52(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	52(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	addl	%r14d,%ecx
+	movl	56(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	56(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	addl	%r14d,%ebx
+	movl	60(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	60(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	movl	4(%rsp),%r13d
+	movl	56(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	36(%rsp),%r12d
+
+	addl	0(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	64(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	movl	8(%rsp),%r13d
+	movl	60(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	40(%rsp),%r12d
+
+	addl	4(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	68(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	movl	12(%rsp),%r13d
+	movl	0(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	44(%rsp),%r12d
+
+	addl	8(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	72(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	movl	16(%rsp),%r13d
+	movl	4(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	48(%rsp),%r12d
+
+	addl	12(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	76(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	movl	20(%rsp),%r13d
+	movl	8(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	52(%rsp),%r12d
+
+	addl	16(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	80(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	movl	24(%rsp),%r13d
+	movl	12(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	56(%rsp),%r12d
+
+	addl	20(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	84(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	movl	28(%rsp),%r13d
+	movl	16(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	60(%rsp),%r12d
+
+	addl	24(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	88(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	movl	32(%rsp),%r13d
+	movl	20(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	0(%rsp),%r12d
+
+	addl	28(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	92(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	movl	36(%rsp),%r13d
+	movl	24(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	4(%rsp),%r12d
+
+	addl	32(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	96(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+	movl	40(%rsp),%r13d
+	movl	28(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	8(%rsp),%r12d
+
+	addl	36(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	100(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+	movl	44(%rsp),%r13d
+	movl	32(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	12(%rsp),%r12d
+
+	addl	40(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	104(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+	movl	48(%rsp),%r13d
+	movl	36(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	16(%rsp),%r12d
+
+	addl	44(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	108(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+	movl	52(%rsp),%r13d
+	movl	40(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	20(%rsp),%r12d
+
+	addl	48(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	112(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+	movl	56(%rsp),%r13d
+	movl	44(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	24(%rsp),%r12d
+
+	addl	52(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	116(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+	movl	60(%rsp),%r13d
+	movl	48(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	28(%rsp),%r12d
+
+	addl	56(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	120(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+	movl	0(%rsp),%r13d
+	movl	52(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	32(%rsp),%r12d
+
+	addl	60(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	124(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+	leaq	64(%rbp),%rbp
+	cmpb	$0x19,3(%rbp)
+	jnz	.Lrounds_16_xx
+
+	movq	64+0(%rsp),%rdi
+	addl	%r14d,%eax
+	leaq	64(%rsi),%rsi
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop
+
+	leaq	64+24+48(%rsp),%r11
+.cfi_def_cfa	%r11,8
+	movq	64+24(%rsp),%r15
+.cfi_restore	%r15
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+
+	leaq	(%r11),%rsp
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	blst_sha256_block_data_order,.-blst_sha256_block_data_order
+
+.align	64
+.type	K256,@object
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
+.globl	blst_sha256_emit
+.hidden	blst_sha256_emit
+.type	blst_sha256_emit,@function
+.align	16
+blst_sha256_emit:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	bswapq	%r8
+	movq	24(%rsi),%r11
+	bswapq	%r9
+	movl	%r8d,4(%rdi)
+	bswapq	%r10
+	movl	%r9d,12(%rdi)
+	bswapq	%r11
+	movl	%r10d,20(%rdi)
+	shrq	$32,%r8
+	movl	%r11d,28(%rdi)
+	shrq	$32,%r9
+	movl	%r8d,0(%rdi)
+	shrq	$32,%r10
+	movl	%r9d,8(%rdi)
+	shrq	$32,%r11
+	movl	%r10d,16(%rdi)
+	movl	%r11d,24(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	blst_sha256_emit,.-blst_sha256_emit
+
+.globl	blst_sha256_bcopy
+.hidden	blst_sha256_bcopy
+.type	blst_sha256_bcopy,@function
+.align	16
+blst_sha256_bcopy:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	subq	%rsi,%rdi
+.Loop_bcopy:
+	movzbl	(%rsi),%eax
+	leaq	1(%rsi),%rsi
+	movb	%al,-1(%rdi,%rsi,1)
+	decq	%rdx
+	jnz	.Loop_bcopy
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	blst_sha256_bcopy,.-blst_sha256_bcopy
+
+.globl	blst_sha256_hcopy
+.hidden	blst_sha256_hcopy
+.type	blst_sha256_hcopy,@function
+.align	16
+blst_sha256_hcopy:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	blst_sha256_hcopy,.-blst_sha256_hcopy
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/elf/sha256-x86_64.s b/blst/elf/sha256-x86_64.s
new file mode 100644
index 0000000..47fdc5b
--- /dev/null
+++ b/blst/elf/sha256-x86_64.s
@@ -0,0 +1,1446 @@
+.text	
+
+.align	64
+.type	K256,@object
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
+.globl	blst_sha256_block_data_order_shaext
+.hidden	blst_sha256_block_data_order_shaext
+.type	blst_sha256_block_data_order_shaext,@function
+.align	64
+blst_sha256_block_data_order_shaext:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	leaq	K256+128(%rip),%rcx
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	256-128(%rcx),%xmm7
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm7,%xmm8
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu	(%rsi),%xmm3
+	movdqu	16(%rsi),%xmm4
+	movdqu	32(%rsi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%rsi),%xmm6
+
+	movdqa	0-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+	movdqa	%xmm2,%xmm10
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,%xmm9
+.byte	15,56,203,202
+
+	movdqa	16-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	leaq	64(%rsi),%rsi
+.byte	15,56,204,220
+.byte	15,56,203,202
+
+	movdqa	32-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+
+	movdqa	48-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	64-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	80-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	96-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	112-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	128-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	144-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	160-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	176-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	192-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	208-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+
+	movdqa	224-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	%xmm8,%xmm7
+.byte	15,56,203,202
+
+	movdqa	240-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	decq	%rdx
+	nop
+.byte	15,56,203,202
+
+	paddd	%xmm10,%xmm2
+	paddd	%xmm9,%xmm1
+	jnz	.Loop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm7
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+
+	movdqu	%xmm1,(%rdi)
+	movdqu	%xmm2,16(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext
+.globl	blst_sha256_block_data_order
+.hidden	blst_sha256_block_data_order
+.type	blst_sha256_block_data_order,@function
+.align	64
+blst_sha256_block_data_order:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$40,%rsp
+.cfi_adjust_cfa_offset	40
+	leaq	(%rsi,%rdx,4),%rdx
+	movq	%rdi,0(%rsp)
+
+	movq	%rdx,16(%rsp)
+	movq	%rsp,%rbp
+.cfi_def_cfa_register	%rbp
+
+
+	leaq	-64(%rsp),%rsp
+	movl	0(%rdi),%eax
+	andq	$-64,%rsp
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+
+
+	jmp	.Lloop_ssse3
+.align	16
+.Lloop_ssse3:
+	movdqa	K256+256(%rip),%xmm7
+	movq	%rsi,8(%rbp)
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+.byte	102,15,56,0,199
+	movdqu	48(%rsi),%xmm3
+	leaq	K256(%rip),%rsi
+.byte	102,15,56,0,207
+	movdqa	0(%rsi),%xmm4
+	movdqa	16(%rsi),%xmm5
+.byte	102,15,56,0,215
+	paddd	%xmm0,%xmm4
+	movdqa	32(%rsi),%xmm6
+.byte	102,15,56,0,223
+	movdqa	48(%rsi),%xmm7
+	paddd	%xmm1,%xmm5
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	movdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	movdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	movdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lssse3_00_47
+
+.align	16
+.Lssse3_00_47:
+	subq	$-64,%rsi
+	rorl	$14,%r13d
+	movdqa	%xmm1,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm3,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,224,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,250,4
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm0
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm0
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	0(%rsi),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm0,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,0(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm2,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm0,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,225,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,251,4
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm0,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm1
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm1
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	16(%rsi),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm1,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,16(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm3,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm1,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,226,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,248,4
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm1,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm2
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm2
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	32(%rsi),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm2,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,32(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm0,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm2,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,227,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,249,4
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm2,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm3
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm3
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	48(%rsi),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm3,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,48(%rsp)
+	cmpb	$0,67(%rsi)
+	jne	.Lssse3_00_47
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	0(%rbp),%rdi
+	movl	%r14d,%eax
+	movq	8(%rbp),%rsi
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	leaq	64(%rsi),%rsi
+	cmpq	16(%rbp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_ssse3
+
+	xorps	%xmm0,%xmm0
+	leaq	40+48(%rbp),%r11
+.cfi_def_cfa	%r11,8
+	movaps	%xmm0,0(%rsp)
+	movaps	%xmm0,16(%rsp)
+	movaps	%xmm0,32(%rsp)
+	movaps	%xmm0,48(%rsp)
+	movq	40(%rbp),%r15
+.cfi_restore	%r15
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbx
+.cfi_restore	%rbx
+	movq	-8(%r11),%rbp
+.cfi_restore	%rbp
+
+	leaq	(%r11),%rsp
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	blst_sha256_block_data_order,.-blst_sha256_block_data_order
+.globl	blst_sha256_emit
+.hidden	blst_sha256_emit
+.type	blst_sha256_emit,@function
+.align	16
+blst_sha256_emit:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	bswapq	%r8
+	movq	24(%rsi),%r11
+	bswapq	%r9
+	movl	%r8d,4(%rdi)
+	bswapq	%r10
+	movl	%r9d,12(%rdi)
+	bswapq	%r11
+	movl	%r10d,20(%rdi)
+	shrq	$32,%r8
+	movl	%r11d,28(%rdi)
+	shrq	$32,%r9
+	movl	%r8d,0(%rdi)
+	shrq	$32,%r10
+	movl	%r9d,8(%rdi)
+	shrq	$32,%r11
+	movl	%r10d,16(%rdi)
+	movl	%r11d,24(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	blst_sha256_emit,.-blst_sha256_emit
+
+.globl	blst_sha256_bcopy
+.hidden	blst_sha256_bcopy
+.type	blst_sha256_bcopy,@function
+.align	16
+blst_sha256_bcopy:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	subq	%rsi,%rdi
+.Loop_bcopy:
+	movzbl	(%rsi),%eax
+	leaq	1(%rsi),%rsi
+	movb	%al,-1(%rdi,%rsi,1)
+	decq	%rdx
+	jnz	.Loop_bcopy
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	blst_sha256_bcopy,.-blst_sha256_bcopy
+
+.globl	blst_sha256_hcopy
+.hidden	blst_sha256_hcopy
+.type	blst_sha256_hcopy,@function
+.align	16
+blst_sha256_hcopy:
+.cfi_startproc
+	.byte	0xf3,0x0f,0x1e,0xfa
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%r9
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	blst_sha256_hcopy,.-blst_sha256_hcopy
+
+.section	.note.GNU-stack,"",@progbits
+.section	.note.gnu.property,"a",@note
+	.long	4,2f-1f,5
+	.byte	0x47,0x4E,0x55,0
+1:	.long	0xc0000002,4,3
+.align	8
+2:
diff --git a/blst/errors.h b/blst/errors.h
new file mode 100644
index 0000000..425daeb
--- /dev/null
+++ b/blst/errors.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_ERRORS_H__
+#define __BLS12_381_ASM_ERRORS_H__
+
+typedef enum {
+    BLST_SUCCESS = 0,
+    BLST_BAD_ENCODING,
+    BLST_POINT_NOT_ON_CURVE,
+    BLST_POINT_NOT_IN_GROUP,
+    BLST_AGGR_TYPE_MISMATCH,
+    BLST_VERIFY_FAIL,
+    BLST_PK_IS_INFINITY,
+} BLST_ERROR;
+
+#endif
diff --git a/blst/exp.c b/blst/exp.c
new file mode 100644
index 0000000..55c5c5a
--- /dev/null
+++ b/blst/exp.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "vect.h"
+
+/*
+ * |out| = |inp|^|pow|, small footprint, public exponent
+ */
+static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow,
+                         size_t pow_bits, const vec384 p, limb_t n0)
+{
+#if 1
+    vec384 ret;
+
+    vec_copy(ret, inp, sizeof(ret));  /* ret = inp^1 */
+    --pow_bits; /* most significant bit is set, skip over */
+    while (pow_bits--) {
+        sqr_mont_384(ret, ret, p, n0);
+        if (is_bit_set(pow, pow_bits))
+            mul_mont_384(ret, ret, inp, p, n0);
+    }
+    vec_copy(out, ret, sizeof(ret));  /* out = ret */
+#else
+    unsigned int i;
+    vec384 sqr;
+
+    vec_copy(sqr, inp, sizeof(sqr));
+    for (i = 0; !is_bit_set(pow, i++);)
+        sqr_mont_384(sqr, sqr, p, n0);
+    vec_copy(out, sqr, sizeof(sqr));
+    for (; i < pow_bits; i++) {
+        sqr_mont_384(sqr, sqr, p, n0);
+        if (is_bit_set(pow, i))
+            mul_mont_384(out, out, sqr, p, n0);
+    }
+#endif
+}
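The active #if 1 branch above is plain left-to-right square-and-multiply: keep a running result, square it once per remaining exponent bit, and multiply in the base whenever that bit is set. A minimal standalone sketch of the same loop over 32-bit integers (illustrative only, not part of blst; like the code above it assumes a non-zero exponent, i.e. a most significant set bit exists):

    #include <stdint.h>

    /* (base^exp) mod m via left-to-right square-and-multiply,
     * scanning downward from the highest set bit; exp must be non-zero */
    static uint32_t powmod_u32(uint32_t base, uint32_t exp, uint32_t m)
    {
        uint32_t ret = base % m;                /* ret = base^1 */
        int bit = 31;
        while (!((exp >> bit) & 1))
            bit--;                              /* find the top set bit */
        while (--bit >= 0) {
            ret = (uint32_t)(((uint64_t)ret * ret) % m);       /* square   */
            if ((exp >> bit) & 1)
                ret = (uint32_t)(((uint64_t)ret * base) % m);  /* multiply */
        }
        return ret;
    }

exp_mont_384 does the same thing with vec384 Montgomery squarings and multiplications in place of the machine-word products.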
+
+static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow,
+                          size_t pow_bits, const vec384 p, limb_t n0)
+{
+    vec384x ret;
+
+    vec_copy(ret, inp, sizeof(ret));  /* |ret| = |inp|^1 */
+    --pow_bits; /* most significant bit is accounted for, skip over */
+    while (pow_bits--) {
+        sqr_mont_384x(ret, ret, p, n0);
+        if (is_bit_set(pow, pow_bits))
+            mul_mont_384x(ret, ret, inp, p, n0);
+    }
+    vec_copy(out, ret, sizeof(ret));  /* |out| = |ret| */
+}
diff --git a/blst/exports.c b/blst/exports.c
new file mode 100644
index 0000000..833c18a
--- /dev/null
+++ b/blst/exports.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * Why this file? The overall goal is to ensure that all internal calls
+ * remain internal after linking the application. This is to both
+ *
+ * a) minimize possibility of external name conflicts (since all
+ *    non-blst-prefixed symbols and [assembly subroutines] remain static);
+ * b) preclude possibility of unintentional internal reference
+ *    overload in shared library context (one can achieve same
+ *    effect with -Bsymbolic, but we don't want to rely on end-user
+ *    to remember to use it);
+ */
+
+#include "fields.h"
+
+/*
+ * BLS12-381-specific Fr shortcuts to assembly.
+ */
+void blst_fr_add(vec256 ret, const vec256 a, const vec256 b)
+{   add_mod_256(ret, a, b, BLS12_381_r);   }
+
+void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b)
+{   sub_mod_256(ret, a, b, BLS12_381_r);   }
+
+void blst_fr_mul_by_3(vec256 ret, const vec256 a)
+{   mul_by_3_mod_256(ret, a, BLS12_381_r);   }
+
+void blst_fr_lshift(vec256 ret, const vec256 a, size_t count)
+{   lshift_mod_256(ret, a, count, BLS12_381_r);   }
+
+void blst_fr_rshift(vec256 ret, const vec256 a, size_t count)
+{   rshift_mod_256(ret, a, count, BLS12_381_r);   }
+
+void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b)
+{   mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0);   }
+
+void blst_fr_sqr(vec256 ret, const vec256 a)
+{   sqr_mont_sparse_256(ret, a, BLS12_381_r, r0);   }
+
+void blst_fr_cneg(vec256 ret, const vec256 a, int flag)
+{   cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r);   }
+
+void blst_fr_to(vec256 ret, const vec256 a)
+{   mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0);   }
+
+void blst_fr_from(vec256 ret, const vec256 a)
+{   from_mont_256(ret, a, BLS12_381_r, r0);   }
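blst_fr_to and blst_fr_from move scalars in and out of Montgomery form. With R = 2^256 and r the BLS12-381 group order, the bookkeeping being relied on is the standard one (a sketch, not text from the library):

    to(a)            = a*R mod r         (multiply by rRR = R^2, one Montgomery reduction)
    from(a)          = a*R^-1 mod r      (one Montgomery reduction)
    mont_mul(aR, bR) = aR*bR*R^-1 mod r  = (a*b)*R mod r

so values stay in Montgomery form across blst_fr_mul/blst_fr_sqr, and a single blst_fr_from at the end recovers the plain representative.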
+
+void blst_fr_from_scalar(vec256 ret, const pow256 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if ((uptr_t)ret == (uptr_t)a && is_endian.little) {
+        mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR,
+                                                    BLS12_381_r, r0);
+    } else {
+        vec256 out;
+        limbs_from_le_bytes(out, a, 32);
+        mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0);
+        vec_zero(out, sizeof(out));
+    }
+}
+
+void blst_scalar_from_fr(pow256 ret, const vec256 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if ((uptr_t)ret == (uptr_t)a && is_endian.little) {
+        from_mont_256((limb_t *)ret, a, BLS12_381_r, r0);
+    } else {
+        vec256 out;
+        from_mont_256(out, a, BLS12_381_r, r0);
+        le_bytes_from_limbs(ret, out, 32);
+        vec_zero(out, sizeof(out));
+    }
+}
+
+int blst_scalar_fr_check(const pow256 a)
+{   return (int)(check_mod_256(a, BLS12_381_r) |
+                 bytes_are_zero(a, sizeof(pow256)));
+}
+
+int blst_sk_check(const pow256 a)
+{   return (int)check_mod_256(a, BLS12_381_r);   }
+
+int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b)
+{   return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r);   }
+
+int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b)
+{   return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r);   }
+
+int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b)
+{
+    vec256 a_fr, b_fr;
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) {
+        limbs_from_le_bytes(a_fr, a, sizeof(a_fr));
+        limbs_from_le_bytes(b_fr, b, sizeof(a_fr));
+        a = (const byte *)a_fr;
+        b = (const byte *)b_fr;
+    }
+    mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR,
+                                                 BLS12_381_r, r0);
+    mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR,
+                                                 BLS12_381_r, r0);
+    mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0);
+    from_mont_256(a_fr, a_fr, BLS12_381_r, r0);
+    le_bytes_from_limbs(ret, a_fr, sizeof(a_fr));
+
+    return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1);
+}
+
+void blst_sk_inverse(pow256 ret, const pow256 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) {
+        limb_t *out = (limb_t *)ret;
+        mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR,
+                                                    BLS12_381_r, r0);
+        reciprocal_fr(out, out);
+        from_mont_256(out, out, BLS12_381_r, r0);
+    } else {
+        vec256 out;
+        limbs_from_le_bytes(out, a, 32);
+        mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0);
+        reciprocal_fr(out, out);
+        from_mont_256(out, out, BLS12_381_r, r0);
+        le_bytes_from_limbs(ret, out, 32);
+        vec_zero(out, sizeof(out));
+    }
+}
+
+/*
+ * BLS12-381-specific Fp shortcuts to assembly.
+ */
+void blst_fp_add(vec384 ret, const vec384 a, const vec384 b)
+{   add_fp(ret, a, b);   }
+
+void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b)
+{   sub_fp(ret, a, b);   }
+
+void blst_fp_mul_by_3(vec384 ret, const vec384 a)
+{   mul_by_3_fp(ret, a);   }
+
+void blst_fp_mul_by_8(vec384 ret, const vec384 a)
+{   mul_by_8_fp(ret, a);   }
+
+void blst_fp_lshift(vec384 ret, const vec384 a, size_t count)
+{   lshift_fp(ret, a, count);   }
+
+void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b)
+{   mul_fp(ret, a, b);   }
+
+void blst_fp_sqr(vec384 ret, const vec384 a)
+{   sqr_fp(ret, a);   }
+
+void blst_fp_cneg(vec384 ret, const vec384 a, int flag)
+{   cneg_fp(ret, a, is_zero(flag) ^ 1);   }
+
+void blst_fp_to(vec384 ret, const vec384 a)
+{   mul_fp(ret, a, BLS12_381_RR);   }
+
+void blst_fp_from(vec384 ret, const vec384 a)
+{   from_fp(ret, a);   }
+
+/*
+ * Fp serialization/deserialization.
+ */
+void blst_fp_from_uint32(vec384 ret, const unsigned int a[12])
+{
+    if (sizeof(limb_t) == 8) {
+        int i;
+        for (i = 0; i < 6; i++)
+            ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1)));
+        a = (const unsigned int *)ret;
+    }
+    mul_fp(ret, (const limb_t *)a, BLS12_381_RR);
+}
+
+void blst_uint32_from_fp(unsigned int ret[12], const vec384 a)
+{
+    if (sizeof(limb_t) == 4) {
+        from_fp((limb_t *)ret, a);
+    } else {
+        vec384 out;
+        int i;
+
+        from_fp(out, a);
+        for (i = 0; i < 6; i++) {
+            limb_t limb = out[i];
+            ret[2*i]   = (unsigned int)limb;
+            ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1)));
+        }
+    }
+}
+
+void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6])
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if (sizeof(limb_t) == 4 && !is_endian.little) {
+        int i;
+        for (i = 0; i < 6; i++) {
+            unsigned long long limb = a[i];
+            ret[2*i]   = (limb_t)limb;
+            ret[2*i+1] = (limb_t)(limb >> 32);
+        }
+        a = (const unsigned long long *)ret;
+    }
+    mul_fp(ret, (const limb_t *)a, BLS12_381_RR);
+}
+
+void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if (sizeof(limb_t) == 8 || is_endian.little) {
+        from_fp((limb_t *)ret, a);
+    } else {
+        vec384 out;
+        int i;
+
+        from_fp(out, a);
+        for (i = 0; i < 6; i++)
+            ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32);
+    }
+}
+
+void blst_fp_from_bendian(vec384 ret, const unsigned char a[48])
+{
+    vec384 out;
+
+    limbs_from_be_bytes(out, a, sizeof(vec384));
+    mul_fp(ret, out, BLS12_381_RR);
+}
+
+void blst_bendian_from_fp(unsigned char ret[48], const vec384 a)
+{
+    vec384 out;
+
+    from_fp(out, a);
+    be_bytes_from_limbs(ret, out, sizeof(vec384));
+}
+
+void blst_fp_from_lendian(vec384 ret, const unsigned char a[48])
+{
+    vec384 out;
+
+    limbs_from_le_bytes(out, a, sizeof(vec384));
+    mul_fp(ret, out, BLS12_381_RR);
+}
+
+void blst_lendian_from_fp(unsigned char ret[48], const vec384 a)
+{
+    vec384 out;
+
+    from_fp(out, a);
+    le_bytes_from_limbs(ret, out, sizeof(vec384));
+}
+
+/*
+ * BLS12-381-specific Fp2 shortcuts to assembly.
+ */
+void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b)
+{   add_fp2(ret, a, b);   }
+
+void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b)
+{   sub_fp2(ret, a, b);   }
+
+void blst_fp2_mul_by_3(vec384x ret, const vec384x a)
+{   mul_by_3_fp2(ret, a);   }
+
+void blst_fp2_mul_by_8(vec384x ret, const vec384x a)
+{   mul_by_8_fp2(ret, a);   }
+
+void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count)
+{   lshift_fp2(ret, a, count);    }
+
+void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b)
+{   mul_fp2(ret, a, b);   }
+
+void blst_fp2_sqr(vec384x ret, const vec384x a)
+{   sqr_fp2(ret, a);   }
+
+void blst_fp2_cneg(vec384x ret, const vec384x a, int flag)
+{   cneg_fp2(ret, a, is_zero(flag) ^ 1);   }
+
+/*
+ * Scalar serialization/deserialization
+ */
+void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8])
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+    size_t i;
+
+    if ((uptr_t)ret==(uptr_t)a && is_endian.little)
+        return;
+
+    for(i = 0; i < 8; i++) {
+        unsigned int w = a[i];
+        *ret++ = (byte)w;
+        *ret++ = (byte)(w >> 8);
+        *ret++ = (byte)(w >> 16);
+        *ret++ = (byte)(w >> 24);
+    }
+}
+
+void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+    size_t i;
+
+    if ((uptr_t)ret==(uptr_t)a && is_endian.little)
+        return;
+
+    for(i = 0; i < 8; i++) {
+        unsigned int w = (unsigned int)(*a++);
+        w |= (unsigned int)(*a++) << 8;
+        w |= (unsigned int)(*a++) << 16;
+        w |= (unsigned int)(*a++) << 24;
+        ret[i] = w;
+    }
+}
+
+void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4])
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+    size_t i;
+
+    if ((uptr_t)ret==(uptr_t)a && is_endian.little)
+        return;
+
+    for(i = 0; i < 4; i++) {
+        unsigned long long w = a[i];
+        *ret++ = (byte)w;
+        *ret++ = (byte)(w >> 8);
+        *ret++ = (byte)(w >> 16);
+        *ret++ = (byte)(w >> 24);
+        *ret++ = (byte)(w >> 32);
+        *ret++ = (byte)(w >> 40);
+        *ret++ = (byte)(w >> 48);
+        *ret++ = (byte)(w >> 56);
+    }
+}
+
+void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+    size_t i;
+
+    if ((uptr_t)ret==(uptr_t)a && is_endian.little)
+        return;
+
+    for(i = 0; i < 4; i++) {
+        unsigned long long w = (unsigned long long)(*a++);
+        w |= (unsigned long long)(*a++) << 8;
+        w |= (unsigned long long)(*a++) << 16;
+        w |= (unsigned long long)(*a++) << 24;
+        w |= (unsigned long long)(*a++) << 32;
+        w |= (unsigned long long)(*a++) << 40;
+        w |= (unsigned long long)(*a++) << 48;
+        w |= (unsigned long long)(*a++) << 56;
+        ret[i] = w;
+    }
+}
+
+void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32])
+{
+    vec256 out;
+    limbs_from_be_bytes(out, a, sizeof(out));
+    le_bytes_from_limbs(ret, out, sizeof(out));
+    vec_zero(out, sizeof(out));
+}
+
+void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a)
+{
+    vec256 out;
+    limbs_from_le_bytes(out, a, sizeof(out));
+    be_bytes_from_limbs(ret, out, sizeof(out));
+    vec_zero(out, sizeof(out));
+}
+
+void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32])
+{
+    size_t i;
+
+    if ((uptr_t)ret==(uptr_t)a)
+        return;
+
+    for (i = 0; i < 32; i++)
+        ret[i] = a[i];
+}
+
+void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a)
+{
+    size_t i;
+
+    if ((uptr_t)ret==(uptr_t)a)
+        return;
+
+    for (i = 0; i < 32; i++)
+        ret[i] = a[i];
+}
+
+void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4])
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if (sizeof(limb_t) == 4 && !is_endian.little) {
+        int i;
+        for (i = 0; i < 4; i++) {
+            unsigned long long limb = a[i];
+            ret[2*i]   = (limb_t)limb;
+            ret[2*i+1] = (limb_t)(limb >> 32);
+        }
+        a = (const unsigned long long *)ret;
+    }
+    mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0);
+}
+
+void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+
+    if (sizeof(limb_t) == 8 || is_endian.little) {
+        from_mont_256((limb_t *)ret, a, BLS12_381_r, r0);
+    } else {
+        vec256 out;
+        int i;
+
+        from_mont_256(out, a, BLS12_381_r, r0);
+        for (i = 0; i < 4; i++)
+            ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32);
+        vec_zero(out, sizeof(out));
+    }
+}
+
+int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n)
+{
+    struct { vec256 out, digit, radix; } t;
+    limb_t ret;
+
+    vec_zero(t.out, sizeof(t.out));
+    vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix));
+
+    while (n > 32) {
+        limbs_from_le_bytes(t.digit, bytes, 32);
+        from_mont_256(t.digit, t.digit, BLS12_381_r, r0);
+        mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0);
+        add_mod_256(t.out, t.out, t.digit, BLS12_381_r);
+        mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0);
+        bytes += 32;
+        n -= 32;
+    }
+
+    vec_zero(t.digit, sizeof(t.digit));
+    limbs_from_le_bytes(t.digit, bytes, n);
+    from_mont_256(t.digit, t.digit, BLS12_381_r, r0);
+    mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0);
+    add_mod_256(t.out, t.out, t.digit, BLS12_381_r);
+
+    ret = vec_is_zero(t.out, sizeof(t.out));
+    le_bytes_from_limbs(out, t.out, 32);
+    vec_zero(t.out, 2*sizeof(t.out));
+
+    return (int)(ret^1);
+}
+
+int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n)
+{
+    struct { vec256 out, digit, radix; } t;
+    limb_t ret;
+
+    vec_zero(t.out, sizeof(t.out));
+    vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix));
+
+    bytes += n;
+    while (n > 32) {
+        limbs_from_be_bytes(t.digit, bytes -= 32, 32);
+        from_mont_256(t.digit, t.digit, BLS12_381_r, r0);
+        mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0);
+        add_mod_256(t.out, t.out, t.digit, BLS12_381_r);
+        mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0);
+        n -= 32;
+    }
+
+    vec_zero(t.digit, sizeof(t.digit));
+    limbs_from_be_bytes(t.digit, bytes -= n, n);
+    from_mont_256(t.digit, t.digit, BLS12_381_r, r0);
+    mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0);
+    add_mod_256(t.out, t.out, t.digit, BLS12_381_r);
+
+    ret = vec_is_zero(t.out, sizeof(t.out));
+    le_bytes_from_limbs(out, t.out, 32);
+    vec_zero(t.out, 2*sizeof(t.out));
+
+    return (int)(ret^1);
+}
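Both byte-reduction helpers above fold the input 32 bytes at a time: the running radix starts at rRR = R^2 and gains one factor of R = 2^256 per chunk, so after the from_mont/mul_mont pair the i-th chunk c_i (counted from the least-significant end of the number, with the short final chunk handled the same way) ends up scaled by (2^256)^i. The net effect, stated as an identity rather than library text, is

    out  =  c_0 + c_1*2^256 + c_2*2^512 + ...   mod r

i.e. the whole byte string interpreted as one large integer and reduced mod r; the return value is 1 for a non-zero result and 0 otherwise.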
+
+/*
+ * Test facilitator
+ */
+static unsigned char nibble(char c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    else if (c >= 'a' && c <= 'f')
+        return 10 + c - 'a';
+    else if (c >= 'A' && c <= 'F')
+        return 10 + c - 'A';
+    else
+        return 16;
+}
+
+static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex)
+{
+    size_t len;
+    limb_t limb = 0;
+
+    if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X'))
+        hex += 2;
+
+    for (len = 0; len<2*sz && nibble(hex[len])<16; len++) ;
+
+    vec_zero(ret, sz);
+
+    while(len--) {
+        limb <<= 4;
+        limb |= nibble(*hex++);
+        if (len % (2*sizeof(limb_t)) == 0)
+            ret[len / (2*sizeof(limb_t))] = limb;
+    }
+}
+
+void blst_scalar_from_hexascii(vec256 ret, const char *hex)
+{   limbs_from_hexascii(ret, sizeof(vec256), hex);   }
+
+void blst_fp_from_hexascii(vec384 ret, const char *hex)
+{
+    limbs_from_hexascii(ret, sizeof(vec384), hex);
+    mul_fp(ret, ret, BLS12_381_RR);
+}
diff --git a/blst/fields.h b/blst/fields.h
new file mode 100644
index 0000000..3e451c4
--- /dev/null
+++ b/blst/fields.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_FIELDS_H__
+#define __BLS12_381_ASM_FIELDS_H__
+
+#include "vect.h"
+#include "consts.h"
+
+#ifndef __CUDA_ARCH__
+/*
+ * BLS12-381-specific Fp shortcuts to assembly.
+ */
+static inline void add_fp(vec384 ret, const vec384 a, const vec384 b)
+{   add_mod_384(ret, a, b, BLS12_381_P);   }
+
+static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b)
+{   sub_mod_384(ret, a, b, BLS12_381_P);   }
+
+static inline void mul_by_3_fp(vec384 ret, const vec384 a)
+{   mul_by_3_mod_384(ret, a, BLS12_381_P);   }
+
+static inline void mul_by_8_fp(vec384 ret, const vec384 a)
+{   mul_by_8_mod_384(ret, a, BLS12_381_P);   }
+
+static inline void lshift_fp(vec384 ret, const vec384 a, size_t count)
+{   lshift_mod_384(ret, a, count, BLS12_381_P);   }
+
+static inline void rshift_fp(vec384 ret, const vec384 a, size_t count)
+{   rshift_mod_384(ret, a, count, BLS12_381_P);   }
+
+static inline void div_by_2_fp(vec384 ret, const vec384 a)
+{   div_by_2_mod_384(ret, a, BLS12_381_P);   }
+
+static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b)
+{   mul_mont_384(ret, a, b, BLS12_381_P, p0);   }
+
+static inline void sqr_fp(vec384 ret, const vec384 a)
+{   sqr_mont_384(ret, a, BLS12_381_P, p0);   }
+
+static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag)
+{   cneg_mod_384(ret, a, flag, BLS12_381_P);   }
+
+static inline void from_fp(vec384 ret, const vec384 a)
+{   from_mont_384(ret, a, BLS12_381_P, p0);   }
+
+static inline void redc_fp(vec384 ret, const vec768 a)
+{   redc_mont_384(ret, a, BLS12_381_P, p0);   }
+
+/*
+ * BLS12-381-specific Fp2 shortcuts to assembly.
+ */
+static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b)
+{   add_mod_384x(ret, a, b, BLS12_381_P);   }
+
+static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b)
+{   sub_mod_384x(ret, a, b, BLS12_381_P);   }
+
+static inline void mul_by_3_fp2(vec384x ret, const vec384x a)
+{   mul_by_3_mod_384x(ret, a, BLS12_381_P);   }
+
+static inline void mul_by_8_fp2(vec384x ret, const vec384x a)
+{   mul_by_8_mod_384x(ret, a, BLS12_381_P);   }
+
+static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count)
+{
+    lshift_mod_384(ret[0], a[0], count, BLS12_381_P);
+    lshift_mod_384(ret[1], a[1], count, BLS12_381_P);
+}
+
+static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b)
+{   mul_mont_384x(ret, a, b, BLS12_381_P, p0);   }
+
+static inline void sqr_fp2(vec384x ret, const vec384x a)
+{   sqr_mont_384x(ret, a, BLS12_381_P, p0);   }
+
+static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag)
+{
+    cneg_mod_384(ret[0], a[0], flag, BLS12_381_P);
+    cneg_mod_384(ret[1], a[1], flag, BLS12_381_P);
+}
+
+#define vec_load_global vec_copy
+
+static void reciprocal_fp(vec384 out, const vec384 inp);
+static void flt_reciprocal_fp(vec384 out, const vec384 inp);
+static bool_t recip_sqrt_fp(vec384 out, const vec384 inp);
+static bool_t sqrt_fp(vec384 out, const vec384 inp);
+
+static void reciprocal_fp2(vec384x out, const vec384x inp);
+static void flt_reciprocal_fp2(vec384x out, const vec384x inp);
+static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp,
+                             const vec384x recip_ZZZ, const vec384x magic_ZZZ);
+static bool_t sqrt_fp2(vec384x out, const vec384x inp);
+static bool_t sqrt_align_fp2(vec384x out, const vec384x ret,
+                             const vec384x sqrt, const vec384x inp);
+
+typedef vec384x   vec384fp2;
+typedef vec384fp2 vec384fp6[3];
+typedef vec384fp6 vec384fp12[2];
+
+static void sqr_fp12(vec384fp12 ret, const vec384fp12 a);
+static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a);
+static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b);
+static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a,
+                                               const vec384fp6 xy00z0);
+static void conjugate_fp12(vec384fp12 a);
+static void inverse_fp12(vec384fp12 ret, const vec384fp12 a);
+/* caveat lector! |n| has to be non-zero and not more than 3! */
+static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n);
+
+#else
+
+extern "C" {
+__device__ void mul_fp(vec384 ret, const vec384 a, const vec384 b);
+__device__ void sqr_fp(vec384 ret, const vec384 a);
+__device__ void add_fp(vec384 ret, const vec384 a, const vec384 b);
+__device__ void sub_fp(vec384 ret, const vec384 a, const vec384 b);
+__device__ void cneg_fp(vec384 ret, const vec384 ap, unsigned int flag);
+__device__ void rshift_fp(vec384 ret, const vec384 a, unsigned int cnt);
+__device__ void lshift_fp(vec384 ret, const vec384 a, unsigned int cnt);
+__device__ void mul_by_3_fp(vec384 ret, const vec384 a);
+__device__ void from_fp(vec384 ret, const vec384 a);
+
+#pragma diag_suppress 3151
+__device__ void mul_384(vec768 ret, const vec384 a, const vec384 b);
+__device__ void sqr_384(vec768 ret, const vec384 a);
+#pragma diag_default 3151
+__device__ void redc_fp(vec384 ret, const vec768 a);
+__device__ void add_fpx2(vec768 ret, const vec768 a, const vec768 b);
+__device__ void sub_fpx2(vec768 ret, const vec768 a, const vec768 b);
+
+__device__ void vec_load_global(limb_t *ret, const limb_t *a,
+                                unsigned int sz = 48);
+}
+
+static inline void mul_by_8_fp(vec384 ret, const vec384 a)
+{   lshift_fp(ret, a, 3);   }
+
+static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b)
+{
+    add_fp(ret[0], a[0], b[0]);
+    add_fp(ret[1], a[1], b[1]);
+}
+
+static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b)
+{
+    sub_fp(ret[0], a[0], b[0]);
+    sub_fp(ret[1], a[1], b[1]);
+}
+
+static inline void mul_by_3_fp2(vec384x ret, const vec384x a)
+{
+    mul_by_3_fp(ret[0], a[0]);
+    mul_by_3_fp(ret[1], a[1]);
+}
+
+static inline void mul_by_8_fp2(vec384x ret, const vec384x a)
+{
+    lshift_fp(ret[0], a[0], 3);
+    lshift_fp(ret[1], a[1], 3);
+}
+
+static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count)
+{
+    lshift_fp(ret[0], a[0], count);
+    lshift_fp(ret[1], a[1], count);
+}
+
+static inline void cneg_fp2(vec384x ret, const vec384x a, limb_t flag)
+{
+    cneg_fp(ret[0], a[0], flag);
+    cneg_fp(ret[1], a[1], flag);
+}
+
+static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b)
+{
+    vec384 aa, bb, cc;
+
+    add_fp(aa, a[0], a[1]);
+    add_fp(bb, b[0], b[1]);
+    mul_fp(bb, bb, aa);
+
+    mul_fp(aa, a[0], b[0]);
+    mul_fp(cc, a[1], b[1]);
+
+    sub_fp(ret[0], aa, cc);
+    sub_fp(ret[1], bb, aa);
+    sub_fp(ret[1], ret[1], cc);
+}
+
+static inline void sqr_fp2(vec384x ret, const vec384x a)
+{
+    vec384 t0, t1;
+
+    add_fp(t0, a[0], a[1]);
+    sub_fp(t1, a[0], a[1]);
+
+    mul_fp(ret[1], a[0], a[1]);
+    add_fp(ret[1], ret[1], ret[1]);
+
+    mul_fp(ret[0], t0, t1);
+}
+#endif
+
+#define neg_fp(r,a) cneg_fp((r),(a),1)
+#define neg_fp2(r,a) cneg_fp2((r),(a),1)
+
+#endif /* __BLS12_381_ASM_FIELDS_H__ */
diff --git a/blst/fp12_tower.c b/blst/fp12_tower.c
new file mode 100644
index 0000000..037b7db
--- /dev/null
+++ b/blst/fp12_tower.c
@@ -0,0 +1,771 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "fields.h"
+
+/*
+ * Fp2  = Fp[u]  / (u^2 + 1)
+ * Fp6  = Fp2[v] / (v^3 - u - 1)
+ * Fp12 = Fp6[w] / (w^2 - v)
+ */
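Spelled out, the identities behind the Karatsuba-style routines in this file follow from u^2 = -1 (a sketch, not library text):

    (a0 + a1*u)*(b0 + b1*u) = (a0*b0 - a1*b1) + ((a0+a1)*(b0+b1) - a0*b0 - a1*b1)*u
    (a0 + a1*u)*(1 + u)     = (a0 - a1) + (a0 + a1)*u

The second line is exactly what the mul_by_u_plus_1 helpers compute with one subtraction and one addition per component.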
+
+static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a)
+{   mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P);   }
+
+#if 1 && !defined(__BLST_NO_ASM__)
+#define __FP2x2__
+/*
+ * Fp2x2 is a "widened" version of Fp2, which allows to consolidate
+ * reductions from several multiplications. In other words instead of
+ * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter
+ * addition is double-width... To be more specific this gives ~7-10%
+ * faster pairing depending on platform...
+ */
+typedef vec768 vec768x[2];
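The saving comes from deferring the Montgomery reduction. REDC is linear modulo p, so two (or more) double-width products can be accumulated first and reduced once,

    redc(a*b) + redc(c*d)  ==  redc(a*b + c*d)   (mod p)

which is what the vec768x add/sub helpers below enable, provided the unreduced sums stay within the range redc_mont_384 accepts.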
+
+static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b)
+{
+    add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P);
+    add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P);
+}
+
+static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b)
+{
+    sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P);
+    sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P);
+}
+
+static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a)
+{
+    /* caveat lector! |ret| may not be same as |a| */
+    sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P);
+    add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P);
+}
+
+static inline void redc_fp2x2(vec384x ret, const vec768x a)
+{
+    redc_mont_384(ret[0], a[0], BLS12_381_P, p0);
+    redc_mont_384(ret[1], a[1], BLS12_381_P, p0);
+}
+
+static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b)
+{
+#if 1
+    mul_382x(ret, a, b, BLS12_381_P);   /* +~6% in Miller loop */
+#else
+    union { vec384 x[2]; vec768 x2; } t;
+
+    add_mod_384(t.x[0], a[0], a[1], BLS12_381_P);
+    add_mod_384(t.x[1], b[0], b[1], BLS12_381_P);
+    mul_384(ret[1], t.x[0], t.x[1]);
+
+    mul_384(ret[0], a[0], b[0]);
+    mul_384(t.x2,   a[1], b[1]);
+
+    sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P);
+    sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P);
+
+    sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P);
+#endif
+}
+
+static void sqr_fp2x2(vec768x ret, const vec384x a)
+{
+#if 1
+    sqr_382x(ret, a, BLS12_381_P);      /* +~5% in final exponentiation */
+#else
+    vec384 t0, t1;
+
+    add_mod_384(t0, a[0], a[1], BLS12_381_P);
+    sub_mod_384(t1, a[0], a[1], BLS12_381_P);
+
+    mul_384(ret[1], a[0], a[1]);
+    add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P);
+
+    mul_384(ret[0], t0, t1);
+#endif
+}
+#endif  /* __FP2x2__ */
+
+/*
+ * Fp6 extension
+ */
+#if defined(__FP2x2__)  /* ~10-13% improvement for mul_fp12 and sqr_fp12 */
+typedef vec768x vec768fp6[3];
+
+static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a,
+                                            const vec768fp6 b)
+{
+    sub_fp2x2(ret[0], a[0], b[0]);
+    sub_fp2x2(ret[1], a[1], b[1]);
+    sub_fp2x2(ret[2], a[2], b[2]);
+}
+
+static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b)
+{
+    vec768x t0, t1, t2;
+    vec384x aa, bb;
+
+    mul_fp2x2(t0, a[0], b[0]);
+    mul_fp2x2(t1, a[1], b[1]);
+    mul_fp2x2(t2, a[2], b[2]);
+
+    /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0
+              = (a1*b2 + a2*b1)*(u+1) + a0*b0 */
+    add_fp2(aa, a[1], a[2]);
+    add_fp2(bb, b[1], b[2]);
+    mul_fp2x2(ret[0], aa, bb);
+    sub_fp2x2(ret[0], ret[0], t1);
+    sub_fp2x2(ret[0], ret[0], t2);
+    mul_by_u_plus_1_fp2x2(ret[1], ret[0]);  /* borrow ret[1] for a moment */
+    add_fp2x2(ret[0], ret[1], t0);
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1)
+              = a0*b1 + a1*b0 + a2*b2*(u+1) */
+    add_fp2(aa, a[0], a[1]);
+    add_fp2(bb, b[0], b[1]);
+    mul_fp2x2(ret[1], aa, bb);
+    sub_fp2x2(ret[1], ret[1], t0);
+    sub_fp2x2(ret[1], ret[1], t1);
+    mul_by_u_plus_1_fp2x2(ret[2], t2);      /* borrow ret[2] for a moment */
+    add_fp2x2(ret[1], ret[1], ret[2]);
+
+    /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1
+              = a0*b2 + a2*b0 + a1*b1 */
+    add_fp2(aa, a[0], a[2]);
+    add_fp2(bb, b[0], b[2]);
+    mul_fp2x2(ret[2], aa, bb);
+    sub_fp2x2(ret[2], ret[2], t0);
+    sub_fp2x2(ret[2], ret[2], t2);
+    add_fp2x2(ret[2], ret[2], t1);
+}
+
+static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a)
+{
+    redc_fp2x2(ret[0], a[0]);
+    redc_fp2x2(ret[1], a[1]);
+    redc_fp2x2(ret[2], a[2]);
+}
+
+static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b)
+{
+    vec768fp6 r;
+
+    mul_fp6x2(r, a, b);
+    redc_fp6x2(ret, r); /* narrow to normal width */
+}
+
+static void sqr_fp6(vec384fp6 ret, const vec384fp6 a)
+{
+    vec768x s0, m01, m12, s2, rx;
+
+    sqr_fp2x2(s0, a[0]);
+
+    mul_fp2x2(m01, a[0], a[1]);
+    add_fp2x2(m01, m01, m01);
+
+    mul_fp2x2(m12, a[1], a[2]);
+    add_fp2x2(m12, m12, m12);
+
+    sqr_fp2x2(s2, a[2]);
+
+    /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2)
+              = a1^2 + 2*(a0*a2) */
+    add_fp2(ret[2], a[2], a[1]);
+    add_fp2(ret[2], ret[2], a[0]);
+    sqr_fp2x2(rx, ret[2]);
+    sub_fp2x2(rx, rx, s0);
+    sub_fp2x2(rx, rx, s2);
+    sub_fp2x2(rx, rx, m01);
+    sub_fp2x2(rx, rx, m12);
+    redc_fp2x2(ret[2], rx);
+
+    /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */
+    mul_by_u_plus_1_fp2x2(rx, m12);
+    add_fp2x2(rx, rx, s0);
+    redc_fp2x2(ret[0], rx);
+
+    /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */
+    mul_by_u_plus_1_fp2x2(rx, s2);
+    add_fp2x2(rx, rx, m01);
+    redc_fp2x2(ret[1], rx);
+}
+#else
+static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b)
+{
+    vec384x t0, t1, t2, t3, t4, t5;
+
+    mul_fp2(t0, a[0], b[0]);
+    mul_fp2(t1, a[1], b[1]);
+    mul_fp2(t2, a[2], b[2]);
+
+    /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0
+              = (a1*b2 + a2*b1)*(u+1) + a0*b0 */
+    add_fp2(t4, a[1], a[2]);
+    add_fp2(t5, b[1], b[2]);
+    mul_fp2(t3, t4, t5);
+    sub_fp2(t3, t3, t1);
+    sub_fp2(t3, t3, t2);
+    mul_by_u_plus_1_fp2(t3, t3);
+    /* add_fp2(ret[0], t3, t0); considering possible aliasing... */
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1)
+              = a0*b1 + a1*b0 + a2*b2*(u+1) */
+    add_fp2(t4, a[0], a[1]);
+    add_fp2(t5, b[0], b[1]);
+    mul_fp2(ret[1], t4, t5);
+    sub_fp2(ret[1], ret[1], t0);
+    sub_fp2(ret[1], ret[1], t1);
+    mul_by_u_plus_1_fp2(t4, t2);
+    add_fp2(ret[1], ret[1], t4);
+
+    /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1
+              = a0*b2 + a2*b0 + a1*b1 */
+    add_fp2(t4, a[0], a[2]);
+    add_fp2(t5, b[0], b[2]);
+    mul_fp2(ret[2], t4, t5);
+    sub_fp2(ret[2], ret[2], t0);
+    sub_fp2(ret[2], ret[2], t2);
+    add_fp2(ret[2], ret[2], t1);
+
+    add_fp2(ret[0], t3, t0);    /* ... moved from above */
+}
+
+static void sqr_fp6(vec384fp6 ret, const vec384fp6 a)
+{
+    vec384x s0, m01, m12, s2;
+
+    sqr_fp2(s0, a[0]);
+
+    mul_fp2(m01, a[0], a[1]);
+    add_fp2(m01, m01, m01);
+
+    mul_fp2(m12, a[1], a[2]);
+    add_fp2(m12, m12, m12);
+
+    sqr_fp2(s2, a[2]);
+
+    /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2)
+              = a1^2 + 2*(a0*a2) */
+    add_fp2(ret[2], a[2], a[1]);
+    add_fp2(ret[2], ret[2], a[0]);
+    sqr_fp2(ret[2], ret[2]);
+    sub_fp2(ret[2], ret[2], s0);
+    sub_fp2(ret[2], ret[2], s2);
+    sub_fp2(ret[2], ret[2], m01);
+    sub_fp2(ret[2], ret[2], m12);
+
+    /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */
+    mul_by_u_plus_1_fp2(ret[0], m12);
+    add_fp2(ret[0], ret[0], s0);
+
+    /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */
+    mul_by_u_plus_1_fp2(ret[1], s2);
+    add_fp2(ret[1], ret[1], m01);
+}
+#endif
+
+static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b)
+{
+    add_fp2(ret[0], a[0], b[0]);
+    add_fp2(ret[1], a[1], b[1]);
+    add_fp2(ret[2], a[2], b[2]);
+}
+
+static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b)
+{
+    sub_fp2(ret[0], a[0], b[0]);
+    sub_fp2(ret[1], a[1], b[1]);
+    sub_fp2(ret[2], a[2], b[2]);
+}
+
+static void neg_fp6(vec384fp6 ret, const vec384fp6 a)
+{
+    neg_fp2(ret[0], a[0]);
+    neg_fp2(ret[1], a[1]);
+    neg_fp2(ret[2], a[2]);
+}
+
+#if 0
+#define mul_by_v_fp6 mul_by_v_fp6
+static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a)
+{
+    vec384x t;
+
+    mul_by_u_plus_1_fp2(t, a[2]);
+    vec_copy(ret[2], a[1], sizeof(a[1]));
+    vec_copy(ret[1], a[0], sizeof(a[0]));
+    vec_copy(ret[0], t, sizeof(t));
+}
+#endif
+
+/*
+ * Fp12 extension
+ */
+#if defined(__FP2x2__)
+static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b)
+{
+    vec768fp6 t0, t1, rx;
+    vec384fp6 t2;
+
+    mul_fp6x2(t0, a[0], b[0]);
+    mul_fp6x2(t1, a[1], b[1]);
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1
+              = a0*b1 + a1*b0 */
+    add_fp6(t2, a[0], a[1]);
+    add_fp6(ret[1], b[0], b[1]);
+    mul_fp6x2(rx, ret[1], t2);
+    sub_fp6x2(rx, rx, t0);
+    sub_fp6x2(rx, rx, t1);
+    redc_fp6x2(ret[1], rx);
+
+    /* ret[0] = a0*b0 + a1*b1*v */
+    mul_by_u_plus_1_fp2x2(rx[0], t1[2]);
+    add_fp2x2(rx[0], t0[0], rx[0]);
+    add_fp2x2(rx[1], t0[1], t1[0]);
+    add_fp2x2(rx[2], t0[2], t1[1]);
+    redc_fp6x2(ret[0], rx);
+}
+
+static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a,
+                                                   const vec384fp2 b)
+{
+    mul_fp2x2(ret[1], a[2], b);     /* borrow ret[1] for a moment */
+    mul_by_u_plus_1_fp2x2(ret[0], ret[1]);
+    mul_fp2x2(ret[1], a[0], b);
+    mul_fp2x2(ret[2], a[1], b);
+}
+
+static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a,
+                                            const vec384fp6 b)
+{
+    vec768x t0, t1;
+    vec384x aa, bb;
+
+    mul_fp2x2(t0, a[0], b[0]);
+    mul_fp2x2(t1, a[1], b[1]);
+
+    /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0
+              = (a1*0 + a2*b1)*(u+1) + a0*b0 */
+    mul_fp2x2(ret[1], a[2], b[1]);  /* borrow ret[1] for a moment */
+    mul_by_u_plus_1_fp2x2(ret[0], ret[1]);
+    add_fp2x2(ret[0], ret[0], t0);
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1)
+              = a0*b1 + a1*b0 + a2*0*(u+1) */
+    add_fp2(aa, a[0], a[1]);
+    add_fp2(bb, b[0], b[1]);
+    mul_fp2x2(ret[1], aa, bb);
+    sub_fp2x2(ret[1], ret[1], t0);
+    sub_fp2x2(ret[1], ret[1], t1);
+
+    /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1
+              = a0*0 + a2*b0 + a1*b1 */
+    mul_fp2x2(ret[2], a[2], b[0]);
+    add_fp2x2(ret[2], ret[2], t1);
+}
+
+static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a,
+                                               const vec384fp6 xy00z0)
+{
+    vec768fp6 t0, t1, rr;
+    vec384fp6 t2;
+
+    mul_by_xy0_fp6x2(t0, a[0], xy00z0);
+    mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]);
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1
+              = a0*b1 + a1*b0 */
+    vec_copy(t2[0], xy00z0[0], sizeof(t2[0]));
+    add_fp2(t2[1], xy00z0[1], xy00z0[2]);
+    add_fp6(ret[1], a[0], a[1]);
+    mul_by_xy0_fp6x2(rr, ret[1], t2);
+    sub_fp6x2(rr, rr, t0);
+    sub_fp6x2(rr, rr, t1);
+    redc_fp6x2(ret[1], rr);
+
+    /* ret[0] = a0*b0 + a1*b1*v */
+    mul_by_u_plus_1_fp2x2(rr[0], t1[2]);
+    add_fp2x2(rr[0], t0[0], rr[0]);
+    add_fp2x2(rr[1], t0[1], t1[0]);
+    add_fp2x2(rr[2], t0[2], t1[1]);
+    redc_fp6x2(ret[0], rr);
+}
+#else
+static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b)
+{
+    vec384fp6 t0, t1, t2;
+
+    mul_fp6(t0, a[0], b[0]);
+    mul_fp6(t1, a[1], b[1]);
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1
+              = a0*b1 + a1*b0 */
+    add_fp6(t2, a[0], a[1]);
+    add_fp6(ret[1], b[0], b[1]);
+    mul_fp6(ret[1], ret[1], t2);
+    sub_fp6(ret[1], ret[1], t0);
+    sub_fp6(ret[1], ret[1], t1);
+
+    /* ret[0] = a0*b0 + a1*b1*v */
+#ifdef mul_by_v_fp6
+    mul_by_v_fp6(t1, t1);
+    add_fp6(ret[0], t0, t1);
+#else
+    mul_by_u_plus_1_fp2(t1[2], t1[2]);
+    add_fp2(ret[0][0], t0[0], t1[2]);
+    add_fp2(ret[0][1], t0[1], t1[0]);
+    add_fp2(ret[0][2], t0[2], t1[1]);
+#endif
+}
+
+static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a,
+                                                 const vec384fp2 b)
+{
+    vec384x t;
+
+    mul_fp2(t,      a[2], b);
+    mul_fp2(ret[2], a[1], b);
+    mul_fp2(ret[1], a[0], b);
+    mul_by_u_plus_1_fp2(ret[0], t);
+}
+
+static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b)
+{
+    vec384x t0, t1, /*t2,*/ t3, t4, t5;
+
+    mul_fp2(t0, a[0], b[0]);
+    mul_fp2(t1, a[1], b[1]);
+
+    /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0
+              = (a1*0 + a2*b1)*(u+1) + a0*b0 */
+    mul_fp2(t3, a[2], b[1]);
+    mul_by_u_plus_1_fp2(t3, t3);
+    /* add_fp2(ret[0], t3, t0); considering possible aliasing... */
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1)
+              = a0*b1 + a1*b0 + a2*0*(u+1) */
+    add_fp2(t4, a[0], a[1]);
+    add_fp2(t5, b[0], b[1]);
+    mul_fp2(ret[1], t4, t5);
+    sub_fp2(ret[1], ret[1], t0);
+    sub_fp2(ret[1], ret[1], t1);
+
+    /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1
+              = a0*0 + a2*b0 + a1*b1 */
+    mul_fp2(ret[2], a[2], b[0]);
+    add_fp2(ret[2], ret[2], t1);
+
+    add_fp2(ret[0], t3, t0);    /* ... moved from above */
+}
+
+static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a,
+                                               const vec384fp6 xy00z0)
+{
+    vec384fp6 t0, t1, t2;
+
+    mul_by_xy0_fp6(t0, a[0], xy00z0);
+    mul_by_0y0_fp6(t1, a[1], xy00z0[2]);
+
+    /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1
+              = a0*b1 + a1*b0 */
+    vec_copy(t2[0], xy00z0[0], sizeof(t2[0]));
+    add_fp2(t2[1], xy00z0[1], xy00z0[2]);
+    add_fp6(ret[1], a[0], a[1]);
+    mul_by_xy0_fp6(ret[1], ret[1], t2);
+    sub_fp6(ret[1], ret[1], t0);
+    sub_fp6(ret[1], ret[1], t1);
+
+    /* ret[0] = a0*b0 + a1*b1*v */
+#ifdef mul_by_v_fp6
+    mul_by_v_fp6(t1, t1);
+    add_fp6(ret[0], t0, t1);
+#else
+    mul_by_u_plus_1_fp2(t1[2], t1[2]);
+    add_fp2(ret[0][0], t0[0], t1[2]);
+    add_fp2(ret[0][1], t0[1], t1[0]);
+    add_fp2(ret[0][2], t0[2], t1[1]);
+#endif
+}
+#endif
+
+static void sqr_fp12(vec384fp12 ret, const vec384fp12 a)
+{
+    vec384fp6 t0, t1;
+
+    add_fp6(t0, a[0], a[1]);
+#ifdef mul_by_v_fp6
+    mul_by_v_fp6(t1, a[1]);
+    add_fp6(t1, a[0], t1);
+#else
+    mul_by_u_plus_1_fp2(t1[2], a[1][2]);
+    add_fp2(t1[0], a[0][0], t1[2]);
+    add_fp2(t1[1], a[0][1], a[1][0]);
+    add_fp2(t1[2], a[0][2], a[1][1]);
+#endif
+    mul_fp6(t0, t0, t1);
+    mul_fp6(t1, a[0], a[1]);
+
+    /* ret[1] = 2*(a0*a1) */
+    add_fp6(ret[1], t1, t1);
+
+    /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v
+              = a0^2 + a1^2*v */
+    sub_fp6(ret[0], t0, t1);
+#ifdef mul_by_v_fp6
+    mul_by_v_fp6(t1, t1);
+    sub_fp6(ret[0], ret[0], t1);
+#else
+    mul_by_u_plus_1_fp2(t1[2], t1[2]);
+    sub_fp2(ret[0][0], ret[0][0], t1[2]);
+    sub_fp2(ret[0][1], ret[0][1], t1[0]);
+    sub_fp2(ret[0][2], ret[0][2], t1[1]);
+#endif
+}
+
+static void conjugate_fp12(vec384fp12 a)
+{   neg_fp6(a[1], a[1]);   }
+
+static void inverse_fp6(vec384fp6 ret, const vec384fp6 a)
+{
+    vec384x c0, c1, c2, t0, t1;
+
+    /* c0 = a0^2 - (a1*a2)*(u+1) */
+    sqr_fp2(c0, a[0]);
+    mul_fp2(t0, a[1], a[2]);
+    mul_by_u_plus_1_fp2(t0, t0);
+    sub_fp2(c0, c0, t0);
+
+    /* c1 = a2^2*(u+1) - (a0*a1) */
+    sqr_fp2(c1, a[2]);
+    mul_by_u_plus_1_fp2(c1, c1);
+    mul_fp2(t0, a[0], a[1]);
+    sub_fp2(c1, c1, t0);
+
+    /* c2 = a1^2 - a0*a2 */
+    sqr_fp2(c2, a[1]);
+    mul_fp2(t0, a[0], a[2]);
+    sub_fp2(c2, c2, t0);
+
+    /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */
+    mul_fp2(t0, c1, a[2]);
+    mul_fp2(t1, c2, a[1]);
+    add_fp2(t0, t0, t1);
+    mul_by_u_plus_1_fp2(t0, t0);
+    mul_fp2(t1, c0, a[0]);
+    add_fp2(t0, t0, t1);
+
+    reciprocal_fp2(t1, t0);
+
+    mul_fp2(ret[0], c0, t1);
+    mul_fp2(ret[1], c1, t1);
+    mul_fp2(ret[2], c2, t1);
+}
+
+static void inverse_fp12(vec384fp12 ret, const vec384fp12 a)
+{
+    vec384fp6 t0, t1;
+
+    sqr_fp6(t0, a[0]);
+    sqr_fp6(t1, a[1]);
+#ifdef mul_by_v_fp6
+    mul_by_v_fp6(t1, t1);
+    sub_fp6(t0, t0, t1);
+#else
+    mul_by_u_plus_1_fp2(t1[2], t1[2]);
+    sub_fp2(t0[0], t0[0], t1[2]);
+    sub_fp2(t0[1], t0[1], t1[0]);
+    sub_fp2(t0[2], t0[2], t1[1]);
+#endif
+
+    inverse_fp6(t1, t0);
+
+    mul_fp6(ret[0], a[0], t1);
+    mul_fp6(ret[1], a[1], t1);
+    neg_fp6(ret[1], ret[1]);
+}
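inverse_fp12 is the usual conjugate/norm trick for a quadratic extension: with a = a0 + a1*w and w^2 = v, multiplying by the conjugate lands in Fp6, where inverse_fp6 takes over (a reading of the code above, not library text):

    (a0 + a1*w)*(a0 - a1*w) = a0^2 - a1^2*v          (an Fp6 element)
    (a0 + a1*w)^-1          = (a0 - a1*w)*(a0^2 - a1^2*v)^-1

t0 ends up holding a0^2 - a1^2*v, t1 its Fp6 inverse, and the final neg_fp6 supplies the conjugate's minus sign.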
+
+typedef vec384x vec384fp4[2];
+
+#if defined(__FP2x2__)
+static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1)
+{
+    vec768x t0, t1, t2;
+
+    sqr_fp2x2(t0, a0);
+    sqr_fp2x2(t1, a1);
+    add_fp2(ret[1], a0, a1);
+
+    mul_by_u_plus_1_fp2x2(t2, t1);
+    add_fp2x2(t2, t2, t0);
+    redc_fp2x2(ret[0], t2);
+
+    sqr_fp2x2(t2, ret[1]);
+    sub_fp2x2(t2, t2, t0);
+    sub_fp2x2(t2, t2, t1);
+    redc_fp2x2(ret[1], t2);
+}
+#else
+static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1)
+{
+    vec384x t0, t1;
+
+    sqr_fp2(t0, a0);
+    sqr_fp2(t1, a1);
+    add_fp2(ret[1], a0, a1);
+
+    mul_by_u_plus_1_fp2(ret[0], t1);
+    add_fp2(ret[0], ret[0], t0);
+
+    sqr_fp2(ret[1], ret[1]);
+    sub_fp2(ret[1], ret[1], t0);
+    sub_fp2(ret[1], ret[1], t1);
+}
+#endif
+
+static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a)
+{
+    vec384fp4 t0, t1, t2;
+
+    sqr_fp4(t0, a[0][0], a[1][1]);
+    sqr_fp4(t1, a[1][0], a[0][2]);
+    sqr_fp4(t2, a[0][1], a[1][2]);
+
+    sub_fp2(ret[0][0], t0[0],     a[0][0]);
+    add_fp2(ret[0][0], ret[0][0], ret[0][0]);
+    add_fp2(ret[0][0], ret[0][0], t0[0]);
+
+    sub_fp2(ret[0][1], t1[0],     a[0][1]);
+    add_fp2(ret[0][1], ret[0][1], ret[0][1]);
+    add_fp2(ret[0][1], ret[0][1], t1[0]);
+
+    sub_fp2(ret[0][2], t2[0],     a[0][2]);
+    add_fp2(ret[0][2], ret[0][2], ret[0][2]);
+    add_fp2(ret[0][2], ret[0][2], t2[0]);
+
+    mul_by_u_plus_1_fp2(t2[1], t2[1]);
+    add_fp2(ret[1][0], t2[1],     a[1][0]);
+    add_fp2(ret[1][0], ret[1][0], ret[1][0]);
+    add_fp2(ret[1][0], ret[1][0], t2[1]);
+
+    add_fp2(ret[1][1], t0[1],     a[1][1]);
+    add_fp2(ret[1][1], ret[1][1], ret[1][1]);
+    add_fp2(ret[1][1], ret[1][1], t0[1]);
+
+    add_fp2(ret[1][2], t1[1],     a[1][2]);
+    add_fp2(ret[1][2], ret[1][2], ret[1][2]);
+    add_fp2(ret[1][2], ret[1][2], t1[1]);
+}
+
+/*
+ * caveat lector! |n| has to be non-zero and not more than 3!
+ */
+static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n)
+{
+    vec_copy(ret[0], a[0], sizeof(ret[0]));
+    cneg_fp(ret[1], a[1], n & 1);
+}
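frobenius_map_fp2 is just conjugation applied when n is odd: elements of Fp are fixed by the Frobenius, and because the BLS12-381 prime satisfies p ≡ 3 (mod 4),

    u^p = (u^2)^((p-1)/2) * u = (-1)^((p-1)/2) * u = -u
    (a0 + a1*u)^(p^n) = a0 + (-1)^n * a1*u

which is exactly the cneg_fp on the second component keyed off n & 1 (a worked identity, not library text).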
+
+static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n)
+{
+    static const vec384x coeffs1[] = {  /* (u + 1)^((P^n - 1) / 3) */
+      { { 0 },
+        { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2),
+          TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e),
+          TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } },
+      { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a),
+          TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b),
+          TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } },
+      { { 0 }, { ONE_MONT_P } }
+    };
+    static const vec384 coeffs2[] = {  /* (u + 1)^((2P^n - 2) / 3) */
+      {   TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5),
+          TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024),
+          TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a)   },
+      {   TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2),
+          TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e),
+          TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741)   },
+      {   TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd),
+          TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a),
+          TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206)   }
+    };
+
+    frobenius_map_fp2(ret[0], a[0], n);
+    frobenius_map_fp2(ret[1], a[1], n);
+    frobenius_map_fp2(ret[2], a[2], n);
+    --n;    /* implied ONE_MONT_P at index 0 */
+    mul_fp2(ret[1], ret[1], coeffs1[n]);
+    mul_fp(ret[2][0], ret[2][0], coeffs2[n]);
+    mul_fp(ret[2][1], ret[2][1], coeffs2[n]);
+}
+
+static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n)
+{
+    static const vec384x coeffs[] = {  /* (u + 1)^((P^n - 1) / 6) */
+      { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313),
+          TO_LIMB_T(0x97e83cccd117228f), TO_LIMB_T(0xa35baecab2dc29ee),
+          TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) },
+	{ TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec),
+          TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0),
+          TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } },
+      { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c),
+          TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721),
+          TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } },
+      { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183),
+          TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18),
+          TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) },
+	{ TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
+          TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
+          TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } },
+    };
+
+    frobenius_map_fp6(ret[0], a[0], n);
+    frobenius_map_fp6(ret[1], a[1], n);
+    --n;    /* implied ONE_MONT_P at index 0 */
+    mul_fp2(ret[1][0], ret[1][0], coeffs[n]);
+    mul_fp2(ret[1][1], ret[1][1], coeffs[n]);
+    mul_fp2(ret[1][2], ret[1][2], coeffs[n]);
+}
+
+
+/*
+ * BLS12-381-specific Fp12 shortcuts.
+ */
+void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a)
+{   sqr_fp12(ret, a);   }
+
+void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a)
+{   cyclotomic_sqr_fp12(ret, a);   }
+
+void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b)
+{   mul_fp12(ret, a, b);   }
+
+void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a,
+                                             const vec384fp6 xy00z0)
+{   mul_by_xy00z0_fp12(ret, a, xy00z0);   }
+
+void blst_fp12_conjugate(vec384fp12 a)
+{   conjugate_fp12(a);   }
+
+void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a)
+{   inverse_fp12(ret, a);   }
+
+/* caveat lector! |n| has to be non-zero and not more than 3! */
+void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n)
+{   frobenius_map_fp12(ret, a, n);   }
+
+int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b)
+{   return (int)vec_is_equal(a, b, sizeof(vec384fp12));   }
+
+int blst_fp12_is_one(const vec384fp12 a)
+{
+    return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) &
+                 vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0])));
+}
+
+const vec384fp12 *blst_fp12_one(void)
+{   return (const vec384fp12 *)BLS12_381_Rx.p12;   }
diff --git a/blst/hash_to_field.c b/blst/hash_to_field.c
new file mode 100644
index 0000000..42733b1
--- /dev/null
+++ b/blst/hash_to_field.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "consts.h"
+#include "sha256.h"
+
+static const vec384 BLS12_381_RRRR = {  /* RR^2 */
+    TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8),
+    TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761),
+    TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d)
+};
+
+#ifdef expand_message_xmd
+void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes,
+                        const unsigned char *aug, size_t aug_len,
+                        const unsigned char *msg, size_t msg_len,
+                        const unsigned char *DST, size_t DST_len);
+#else
+static void sha256_init_Zpad(SHA256_CTX *ctx)
+{
+    ctx->h[0] = 0xda5698beU;
+    ctx->h[1] = 0x17b9b469U;
+    ctx->h[2] = 0x62335799U;
+    ctx->h[3] = 0x779fbecaU;
+    ctx->h[4] = 0x8ce5d491U;
+    ctx->h[5] = 0xc0d26243U;
+    ctx->h[6] = 0xbafef9eaU;
+    ctx->h[7] = 0x1837a9d8U;
+    ctx->N = 64;
+    vec_zero(ctx->buf, sizeof(ctx->buf));
+    ctx->off = 0;
+}
+
+static void vec_xor(void *restrict ret, const void *restrict a,
+                                        const void *restrict b, size_t num)
+{
+    limb_t *rp = (limb_t *)ret;
+    const limb_t *ap = (const limb_t *)a;
+    const limb_t *bp = (const limb_t *)b;
+    size_t i;
+
+    num /= sizeof(limb_t);
+
+    for (i = 0; i < num; i++)
+        rp[i] = ap[i] ^ bp[i];
+}
+
+static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes,
+                               const unsigned char *aug, size_t aug_len,
+                               const unsigned char *msg, size_t msg_len,
+                               const unsigned char *DST, size_t DST_len)
+{
+    union { limb_t align; unsigned char c[32]; } b_0;
+    union { limb_t align; unsigned char c[33+256+31]; } b_i;
+    unsigned char *p;
+    size_t i, b_i_bits, b_i_blocks;
+    SHA256_CTX ctx;
+
+    /*
+     * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime'
+     */
+    if (DST_len > 255) {
+        sha256_init(&ctx);
+        sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17);
+        sha256_update(&ctx, DST, DST_len);
+        sha256_final(b_0.c, &ctx);
+        DST = b_0.c, DST_len = 32;
+    }
+    b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64;
+    vec_zero(b_i.c + b_i_blocks - 64, 64);
+
+    p = b_i.c + 33;
+    for (i = 0; i < DST_len; i++)
+        p[i] = DST[i];
+    p[i++] = (unsigned char)DST_len;
+    p[i++] = 0x80;
+    p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0;
+    b_i_bits = (33 + DST_len + 1) * 8;
+    p = b_i.c + b_i_blocks;
+    p[-2] = (unsigned char)(b_i_bits >> 8);
+    p[-1] = (unsigned char)(b_i_bits);
+
+    sha256_init_Zpad(&ctx);                         /* Z_pad | */
+    sha256_update(&ctx, aug, aug_len);              /* | aug | */
+    sha256_update(&ctx, msg, msg_len);              /* | msg | */
+    /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime    */
+    b_i.c[30] = (unsigned char)(len_in_bytes >> 8);
+    b_i.c[31] = (unsigned char)(len_in_bytes);
+    b_i.c[32] = 0;
+    sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1);
+    sha256_final(b_0.c, &ctx);
+
+    sha256_init_h(ctx.h);
+    vec_copy(b_i.c, b_0.c, 32);
+    ++b_i.c[32];
+    sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64);
+    sha256_emit(bytes, ctx.h);
+
+    len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */
+    len_in_bytes /= 32; /* caller being responsible for accordingly large
+                         * buffer. hash_to_field passes one with length
+                         * divisible by 64, remember? which works... */
+    while (--len_in_bytes) {
+        sha256_init_h(ctx.h);
+        vec_xor(b_i.c, b_0.c, bytes, 32);
+        bytes += 32;
+        ++b_i.c[32];
+        sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64);
+        sha256_emit(bytes, ctx.h);
+    }
+}
+#endif
+
+/*
+ * |nelems| is 'count * m' from spec
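+ * With SHA-256 the BLS12-381 ciphersuites use k = 128 and ceil(log2(p)) =
+ * 381, so L below works out to 64 bytes per field element: 128 bytes of
+ * expand_message_xmd output for hash-to-G1 (two elements) and 256 bytes
+ * for hash-to-G2 (four).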
+ */
+static void hash_to_field(vec384 elems[], size_t nelems,
+                          const unsigned char *aug, size_t aug_len,
+                          const unsigned char *msg, size_t msg_len,
+                          const unsigned char *DST, size_t DST_len)
+{
+    size_t L = sizeof(vec384) + 128/8;  /* ceil((ceil(log2(p)) + k) / 8) */
+    size_t len_in_bytes = L * nelems;   /* divisible by 64, hurray!      */
+#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
+    limb_t *pseudo_random = alloca(len_in_bytes);
+#else
+    limb_t pseudo_random[len_in_bytes/sizeof(limb_t)];
+#endif
+    unsigned char *bytes;
+    vec768 elem;
+
+    aug_len = aug!=NULL ? aug_len : 0;
+    DST_len = DST!=NULL ? DST_len : 0;
+
+    expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes,
+                       aug, aug_len, msg, msg_len, DST, DST_len);
+
+    vec_zero(elem, sizeof(elem));
+    bytes = (unsigned char *)pseudo_random;
+    while (nelems--) {
+        limbs_from_be_bytes(elem, bytes, L);
+        bytes += L;
+        /*
+         * L-bytes block % P, output is in Montgomery domain...
+         */
+        redc_mont_384(elems[0], elem, BLS12_381_P, p0);
+        mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0);
+        elems++;
+    }
+}
+
+void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes,
+                             const unsigned char *msg, size_t msg_len,
+                             const unsigned char *DST, size_t DST_len)
+{
+    size_t buf_len = (len_in_bytes+31) & ((size_t)0-32);
+    unsigned char *buf_ptr = bytes;
+
+    if (buf_len > 255*32)
+        return;
+
+    if (buf_len != len_in_bytes)
+        buf_ptr = alloca(buf_len);
+
+    expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len,
+                                              DST, DST_len);
+    if (buf_ptr != bytes) {
+        unsigned char *ptr = buf_ptr;
+        while (len_in_bytes--)
+            *bytes++ = *ptr++;
+        vec_zero(buf_ptr, buf_len);
+    }
+}
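As a reference point, the exported wrapper above can be exercised on its own. A minimal sketch, assuming the blst/blst.h include path used elsewhere in this repository; the DST string is an arbitrary placeholder:

    #include <stdio.h>
    #include "blst/blst.h"

    int main(void)
    {
        unsigned char out[96];
        const unsigned char msg[] = "sample message";
        const unsigned char dst[] = "EXAMPLE-EXPANDER-V01";  /* placeholder DST */

        /* expand 96 uniform bytes from msg under the given DST */
        blst_expand_message_xmd(out, sizeof(out), msg, sizeof(msg) - 1,
                                dst, sizeof(dst) - 1);
        for (size_t i = 0; i < sizeof(out); i++)
            printf("%02x", out[i]);
        printf("\n");
        return 0;
    }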
diff --git a/blst/keygen.c b/blst/keygen.c
new file mode 100644
index 0000000..de749ac
--- /dev/null
+++ b/blst/keygen.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "consts.h"
+#include "sha256.h"
+
+typedef struct {
+    SHA256_CTX ctx;
+    unsigned int h_ipad[8];
+    unsigned int h_opad[8];
+    union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail;
+} HMAC_SHA256_CTX;
+
+static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len)
+{
+    size_t i;
+
+    if (K == NULL) {            /* reuse h_ipad and h_opad */
+        sha256_hcopy(ctx->ctx.h, ctx->h_ipad);
+        ctx->ctx.N = 64;
+        vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf));
+        ctx->ctx.off = 0;
+
+        return;
+    }
+
+    vec_zero(ctx->tail.c, sizeof(ctx->tail));
+    if (K_len > 64) {
+        sha256_init(&ctx->ctx);
+        sha256_update(&ctx->ctx, K, K_len);
+        sha256_final(ctx->tail.c, &ctx->ctx);
+    } else {
+        sha256_bcopy(ctx->tail.c, K, K_len);
+    }
+
+    for (i = 0; i < 64/sizeof(limb_t); i++)
+        ctx->tail.l[i] ^= (limb_t)0x3636363636363636;
+
+    sha256_init(&ctx->ctx);
+    sha256_update(&ctx->ctx, ctx->tail.c, 64);
+    sha256_hcopy(ctx->h_ipad, ctx->ctx.h);
+
+    for (i = 0; i < 64/sizeof(limb_t); i++)
+        ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c);
+
+    sha256_init_h(ctx->h_opad);
+    sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1);
+
+    vec_zero(ctx->tail.c, sizeof(ctx->tail));
+    ctx->tail.c[32] = 0x80;
+    ctx->tail.c[62] = 3;        /* (64+32)*8 in big endian */
+    ctx->tail.c[63] = 0;
+}
+
+static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp,
+                                              size_t len)
+{   sha256_update(&ctx->ctx, inp, len);   }
+
+static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx)
+{
+    sha256_final(ctx->tail.c, &ctx->ctx);
+    sha256_hcopy(ctx->ctx.h, ctx->h_opad);
+    sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1);
+    sha256_emit(md, ctx->ctx.h);
+}
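The three HMAC_* primitives above are only used through HKDF below, but they compose into a conventional one-shot HMAC-SHA256(K, m) = H((K xor opad) || H((K xor ipad) || m)). A sketch of a hypothetical file-local helper, not part of the library:

    static void hmac_sha256_oneshot(unsigned char md[32],
                                    const void *key, size_t key_len,
                                    const unsigned char *msg, size_t msg_len)
    {
        HMAC_SHA256_CTX ctx;

        HMAC_init(&ctx, key, key_len);    /* keys the inner and outer hashes  */
        HMAC_update(&ctx, msg, msg_len);  /* absorbs msg into the inner hash  */
        HMAC_final(md, &ctx);             /* finishes inner, then outer hash  */
        vec_zero(&ctx, sizeof(ctx));      /* scrub key material when done     */
    }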
+
+static void HKDF_Extract(unsigned char PRK[32],
+                         const void *salt, size_t salt_len,
+                         const void *IKM,  size_t IKM_len,
+                         HMAC_SHA256_CTX *ctx)
+{
+    unsigned char zero[1] = { 0 };
+
+    HMAC_init(ctx, salt != NULL ? salt : zero, salt_len);
+    HMAC_update(ctx, IKM, IKM_len);
+#ifndef __BLST_HKDF_TESTMODE__
+    /* Section 2.3 KeyGen in BLS-signature draft */
+    HMAC_update(ctx, zero, 1);
+#endif
+    HMAC_final(PRK, ctx);
+}
+
+static void HKDF_Expand(unsigned char *OKM, size_t L,
+                        const unsigned char PRK[32],
+                        const void *info, size_t info_len,
+                        HMAC_SHA256_CTX *ctx)
+{
+#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
+    unsigned char *info_prime = alloca(info_len + 2 + 1);
+#else
+    unsigned char info_prime[info_len + 2 + 1];
+#endif
+
+    HMAC_init(ctx, PRK, 32);
+
+    if (info_len != 0)
+        sha256_bcopy(info_prime, info, info_len);
+#ifndef __BLST_HKDF_TESTMODE__
+    /* Section 2.3 KeyGen in BLS-signature draft */
+    info_prime[info_len + 0] = (unsigned char)(L >> 8);
+    info_prime[info_len + 1] = (unsigned char)(L);
+    info_len += 2;
+#endif
+    info_prime[info_len] = 1;   /* counter */
+    HMAC_update(ctx, info_prime, info_len + 1);
+    HMAC_final(ctx->tail.c, ctx);
+    while (L > 32) {
+        sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c);
+        OKM += 32; L -= 32;
+        ++info_prime[info_len]; /* counter */
+        HMAC_init(ctx, NULL, 0);
+        HMAC_update(ctx, ctx->tail.c, 32);
+        HMAC_update(ctx, info_prime, info_len + 1);
+        HMAC_final(ctx->tail.c, ctx);
+    }
+    sha256_bcopy(OKM, ctx->tail.c, L);
+}
+
+#ifndef __BLST_HKDF_TESTMODE__
+void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len,
+                            const void *info, size_t info_len)
+{
+    struct {
+        HMAC_SHA256_CTX ctx;
+        unsigned char PRK[32], OKM[48];
+        vec512 key;
+    } scratch;
+    unsigned char salt[32] = "BLS-SIG-KEYGEN-SALT-";
+    size_t salt_len = 20;
+
+    if (IKM_len < 32) {
+        vec_zero(SK, sizeof(pow256));
+        return;
+    }
+
+    /*
+     * Vet |info| since some callers have been caught being sloppy, e.g.
+     * the SWIG-4.0-generated Python wrapper...
+     */
+    info_len = info==NULL ? 0 : info_len;
+
+    do {
+        /* salt = H(salt) */
+        sha256_init(&scratch.ctx.ctx);
+        sha256_update(&scratch.ctx.ctx, salt, salt_len);
+        sha256_final(salt, &scratch.ctx.ctx);
+        salt_len = sizeof(salt);
+
+        /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */
+        HKDF_Extract(scratch.PRK, salt, salt_len,
+                                  IKM, IKM_len, &scratch.ctx);
+
+        /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */
+        HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK,
+                    info, info_len, &scratch.ctx);
+
+        /* SK = OS2IP(OKM) mod r */
+        vec_zero(scratch.key, sizeof(scratch.key));
+        limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM));
+        redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0);
+        /*
+         * Given that mul_mont_sparse_256 has special boundary conditions,
+         * it's worth noting that the redc_mont_256 output is fully reduced
+         * at this point, because we started with a 384-bit input whose
+         * most significant half is smaller than the modulus.
+         */
+        mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR,
+                            BLS12_381_r, r0);
+    } while (vec_is_zero(scratch.key, sizeof(vec256)));
+
+    le_bytes_from_limbs(SK, scratch.key, sizeof(pow256));
+
+    /*
+     * scrub the stack just in case next callee inadvertently flashes
+     * a fragment across application boundary...
+     */
+    vec_zero(&scratch, sizeof(scratch));
+}
+#endif
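blst_keygen deliberately returns an all-zero scalar when IKM is shorter than 32 bytes, so callers should treat a zero output as failure. A sketch against the public blst.h API; example_keygen_checked and the plain memcmp are illustrative only (a constant-time compare would be preferable for real key material):

    #include <string.h>
    #include "blst/blst.h"

    int example_keygen_checked(blst_scalar *sk,
                               const unsigned char *ikm, size_t ikm_len)
    {
        unsigned char sk_bytes[32];
        static const unsigned char zeros[32] = {0};

        blst_keygen(sk, ikm, ikm_len, NULL, 0);
        blst_bendian_from_scalar(sk_bytes, sk);
        return memcmp(sk_bytes, zeros, sizeof(zeros)) != 0;  /* 1 = usable key */
    }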
diff --git a/blst/map_to_g1.c b/blst/map_to_g1.c
new file mode 100644
index 0000000..6613d68
--- /dev/null
+++ b/blst/map_to_g1.c
@@ -0,0 +1,559 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "point.h"
+#include "fields.h"
+
+/*
+ * y^2 = x^3 + A'*x + B', isogenous one
+ */
+static const vec384 Aprime_E1 = {
+    /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8
+          d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */
+    TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3),
+    TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c),
+    TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85)
+};
+static const vec384 Bprime_E1 = {
+    /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070
+          a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */
+    TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f),
+    TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571),
+    TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b)
+};
+
+static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[],
+                            const vec384 Zz_powers[], size_t n)
+{
+    while (n--)
+        mul_fp(map[n], isogeny_map[n], Zz_powers[n]);
+}
+
+static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n)
+{
+    while (n--) {
+        mul_fp(acc, acc, x);
+        add_fp(acc, acc, map[n]);
+    }
+}
+
+static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p)
+{
+    /*
+     * x = x_num / x_den, where
+     * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 +
+     *         ... + k_(1,0)
+     * ...
+     */
+    static const vec384 isogeny_map_x_num[] = { /*  (k_(1,*)<<384) % P  */
+      { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c),
+        TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304),
+        TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4)  },
+      { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa),
+        TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad),
+        TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad)  },
+      { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6),
+        TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7),
+        TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524)  },
+      { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021),
+        TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60),
+        TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb)  },
+      { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185),
+        TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c),
+        TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6)  },
+      { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd),
+        TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db),
+        TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929)  },
+      { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9),
+        TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f),
+        TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe)  },
+      { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81),
+        TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65),
+        TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028)  },
+      { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475),
+        TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb),
+        TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac)  },
+      { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2),
+        TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f),
+        TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375)  },
+      { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375),
+        TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb),
+        TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c)  },
+      { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5),
+        TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb),
+        TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d)  }
+    };
+    /* ...
+     * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... + k_(2,0)
+     */
+    static const vec384 isogeny_map_x_den[] = { /*  (k_(2,*)<<384) % P  */
+      { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0),
+        TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e),
+        TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f)  },
+      { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a),
+        TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93),
+        TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce)  },
+      { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0),
+        TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd),
+        TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2)  },
+      { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb),
+        TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df),
+        TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c)  },
+      { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f),
+        TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42),
+        TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015)  },
+      { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4),
+        TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2),
+        TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb)  },
+      { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3),
+        TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4),
+        TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606)  },
+      { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0),
+        TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23),
+        TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122)  },
+      { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7),
+        TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c),
+        TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34)  },
+      { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0),
+        TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f),
+        TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8)  }
+    };
+    /*
+     * y = y' * y_num / y_den, where
+     * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 +
+     *         ... + k_(3,0)
+     * ...
+     */
+    static const vec384 isogeny_map_y_num[] = { /*  (k_(3,*)<<384) % P  */
+      { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767),
+        TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df),
+        TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310)  },
+      { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878),
+        TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8),
+        TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555)  },
+      { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d),
+        TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0),
+        TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905)  },
+      { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707),
+        TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53),
+        TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257)  },
+      { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd),
+        TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f),
+        TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d)  },
+      { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982),
+        TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7),
+        TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793)  },
+      { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19),
+        TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0),
+        TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f)  },
+      { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091),
+        TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f),
+        TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79)  },
+      { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233),
+        TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482),
+        TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393)  },
+      { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17),
+        TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46),
+        TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb)  },
+      { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f),
+        TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d),
+        TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5)  },
+      { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4),
+        TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d),
+        TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4)  },
+      { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5),
+        TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef),
+        TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2)  },
+      { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a),
+        TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f),
+        TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49)  },
+      { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f),
+        TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee),
+        TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f)  },
+      { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a),
+        TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838),
+        TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9)  }
+    };
+    /* ...
+     * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... + k_(4,0)
+     */
+    static const vec384 isogeny_map_y_den[] = { /*  (k_(4,*)<<384) % P  */
+      { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60),
+        TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323),
+        TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1)  },
+      { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911),
+        TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658),
+        TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e)  },
+      { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742),
+        TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d),
+        TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41)  },
+      { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f),
+        TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806),
+        TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd)  },
+      { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700),
+        TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4),
+        TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb)  },
+      { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55),
+        TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc),
+        TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4)  },
+      { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0),
+        TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3),
+        TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed)  },
+      { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930),
+        TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6),
+        TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406)  },
+      { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497),
+        TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb),
+        TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5)  },
+      { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30),
+        TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023),
+        TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff)  },
+      { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b),
+        TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8),
+        TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e)  },
+      { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03),
+        TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45),
+        TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff)  },
+      { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154),
+        TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153),
+        TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c)  },
+      { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55),
+        TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc),
+        TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6)  },
+      { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21),
+        TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467),
+        TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e)  }
+    };
+    vec384 Zz_powers[15], map[15], xn, xd, yn, yd;
+
+    /* lay down Z^2 powers in descending order                          */
+    sqr_fp(Zz_powers[14], p->Z);                        /* ZZ^1         */
+#ifdef __OPTIMIZE_SIZE__
+    for (size_t i = 14; i > 0; i--)
+        mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]);
+#else
+    sqr_fp(Zz_powers[13], Zz_powers[14]);               /* ZZ^2  1+1    */
+    mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3  2+1    */
+    sqr_fp(Zz_powers[11], Zz_powers[13]);               /* ZZ^4  2+2    */
+    mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5  2+3    */
+    sqr_fp(Zz_powers[9],  Zz_powers[12]);               /* ZZ^6  3+3    */
+    mul_fp(Zz_powers[8],  Zz_powers[12], Zz_powers[11]);/* ZZ^7  3+4    */
+    sqr_fp(Zz_powers[7],  Zz_powers[11]);               /* ZZ^8  4+4    */
+    mul_fp(Zz_powers[6],  Zz_powers[11], Zz_powers[10]);/* ZZ^9  4+5    */
+    sqr_fp(Zz_powers[5],  Zz_powers[10]);               /* ZZ^10 5+5    */
+    mul_fp(Zz_powers[4],  Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6    */
+    sqr_fp(Zz_powers[3],  Zz_powers[9]);                /* ZZ^12 6+6    */
+    mul_fp(Zz_powers[2],  Zz_powers[9],  Zz_powers[8]); /* ZZ^13 6+7    */
+    sqr_fp(Zz_powers[1],  Zz_powers[8]);                /* ZZ^14 7+7    */
+    mul_fp(Zz_powers[0],  Zz_powers[8],  Zz_powers[7]); /* ZZ^15 7+8    */
+#endif
+
+    map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11);
+    mul_fp(xn, p->X, isogeny_map_x_num[11]);
+    add_fp(xn, xn, map[10]);
+    map_fp(xn, p->X, map, 10);
+
+    map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10);
+    add_fp(xd, p->X, map[9]);
+    map_fp(xd, p->X, map, 9);
+    mul_fp(xd, xd, Zz_powers[14]);      /* xd *= Z^2                    */
+
+    map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15);
+    mul_fp(yn, p->X, isogeny_map_y_num[15]);
+    add_fp(yn, yn, map[14]);
+    map_fp(yn, p->X, map, 14);
+    mul_fp(yn, yn, p->Y);               /* yn *= Y                      */
+
+    map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15);
+    add_fp(yd, p->X, map[14]);
+    map_fp(yd, p->X, map, 14);
+    mul_fp(Zz_powers[14], Zz_powers[14], p->Z);
+    mul_fp(yd, yd, Zz_powers[14]);      /* yd *= Z^3                    */
+
+    /* convert (xn, xd, yn, yd) to Jacobian coordinates                 */
+    mul_fp(out->Z, xd, yd);             /* Z = xd * yd                  */
+    mul_fp(out->X, xn, yd);
+    mul_fp(out->X, out->X, out->Z);     /* X = xn * xd * yd^2           */
+    sqr_fp(out->Y, out->Z);
+    mul_fp(out->Y, out->Y, xd);
+    mul_fp(out->Y, out->Y, yn);         /* Y = yn * xd^3 * yd^2         */
+}
+
+static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u)
+{
+    static const vec384 minus_A = { /* P - A */
+        TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c),
+        TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442),
+        TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915)
+    };
+    static const vec384 Z = {       /* (11<<384) % P */
+        TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d),
+        TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740),
+        TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8)
+    };
+    static const vec384 sqrt_minus_ZZZ = {
+        TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2),
+        TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b),
+        TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff)
+    };
+    static const vec384 ZxA = {
+        TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65),
+        TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1),
+        TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c)
+    };
+    vec384 uu, tv2, x2n, gx1, gxd, y2;
+#if 0
+    vec384 xn, x1n, xd, y, y1, Zuu, tv4;
+#else
+# define xn     p->X
+# define y      p->Y
+# define xd     p->Z
+# define x1n    xn
+# define y1     y
+# define Zuu    x2n
+# define tv4    y1
+#endif
+#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1)
+    bool_t e1, e2;
+
+    /*
+     * as per map_to_curve() from poc/sswu_opt.sage at
+     * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve
+     */
+    /* x numerator variants                                             */
+    sqr_fp(uu, u);                      /* uu = u^2                     */
+    mul_fp(Zuu, Z, uu);                 /* Zuu = Z * uu                 */
+    sqr_fp(tv2, Zuu);                   /* tv2 = Zuu^2                  */
+    add_fp(tv2, tv2, Zuu);              /* tv2 = tv2 + Zuu              */
+    add_fp(x1n, tv2, BLS12_381_Rx.p);   /* x1n = tv2 + 1                */
+    mul_fp(x1n, x1n, Bprime_E1);        /* x1n = x1n * B                */
+    mul_fp(x2n, Zuu, x1n);              /* x2n = Zuu * x1n              */
+
+    /* x denominator                                                    */
+    mul_fp(xd, minus_A, tv2);           /* xd = -A * tv2                */
+    e1 = vec_is_zero(xd, sizeof(xd));   /* e1 = xd == 0                 */
+    vec_select(xd, ZxA, xd, sizeof(xd), e1);    /*              # If xd == 0, set xd = Z*A */
+
+    /* y numerator variants                                             */
+    sqr_fp(tv2, xd);                    /* tv2 = xd^2                   */
+    mul_fp(gxd, xd, tv2);               /* gxd = xd^3                   */
+    mul_fp(tv2, Aprime_E1, tv2);        /* tv2 = A * tv2                */
+    sqr_fp(gx1, x1n);                   /* gx1 = x1n^2                  */
+    add_fp(gx1, gx1, tv2);              /* gx1 = gx1 + tv2      # x1n^2 + A*xd^2 */
+    mul_fp(gx1, gx1, x1n);              /* gx1 = gx1 * x1n      # x1n^3 + A*x1n*xd^2 */
+    mul_fp(tv2, Bprime_E1, gxd);        /* tv2 = B * gxd                */
+    add_fp(gx1, gx1, tv2);              /* gx1 = gx1 + tv2      # x1^3 + A*x1*xd^2 + B*xd^3 */
+    sqr_fp(tv4, gxd);                   /* tv4 = gxd^2                  */
+    mul_fp(tv2, gx1, gxd);              /* tv2 = gx1 * gxd              */
+    mul_fp(tv4, tv4, tv2);              /* tv4 = tv4 * tv2      # gx1*gxd^3 */
+    e2 = recip_sqrt_fp(y1, tv4);        /* y1 = tv4^c1          # (gx1*gxd^3)^((p-3)/4) */
+    mul_fp(y1, y1, tv2);                /* y1 = y1 * tv2        # gx1*gxd*y1 */
+    mul_fp(y2, y1, sqrt_minus_ZZZ);     /* y2 = y1 * c2         # y2 = y1*sqrt(-Z^3) */
+    mul_fp(y2, y2, uu);                 /* y2 = y2 * uu                 */
+    mul_fp(y2, y2, u);                  /* y2 = y2 * u                  */
+
+    /* choose numerators                                                */
+    vec_select(xn, x1n, x2n, sizeof(xn), e2);   /* xn = e2 ? x1n : x2n  */
+    vec_select(y, y1, y2, sizeof(y), e2);       /* y  = e2 ? y1 : y2    */
+
+    e1 = sgn0_fp(u);
+    e2 = sgn0_fp(y);
+    cneg_fp(y, y, e1^e2);               /* fix sign of y                */
+                                        /* return (xn, xd, y, 1)        */
+
+    /* convert (xn, xd, y, 1) to Jacobian projective coordinates        */
+    mul_fp(p->X, xn, xd);               /* X = xn * xd                  */
+    mul_fp(p->Y, y, gxd);               /* Y = y * xd^3                 */
+#ifndef xd
+    vec_copy(p->Z, xd, sizeof(xd));     /* Z = xd                       */
+#else
+# undef xn
+# undef y
+# undef xd
+# undef x1n
+# undef y1
+# undef Zuu
+# undef tv4
+#endif
+#undef sgn0_fp
+}
+
+static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n)
+{
+    POINTonE1_dadd(out, out, p, NULL);
+    while(n--)
+        POINTonE1_double(out, out);
+}
+
+static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in)
+{
+    POINTonE1_double(out, in);          /*      1: 0x2                  */
+    POINTonE1_add_n_dbl(out, in, 2);    /*   2..4: 0x3..0xc             */
+    POINTonE1_add_n_dbl(out, in, 3);    /*   5..8: 0xd..0x68            */
+    POINTonE1_add_n_dbl(out, in, 9);    /*  9..18: 0x69..0xd200         */
+    POINTonE1_add_n_dbl(out, in, 32);   /* 19..51: ..0xd20100000000     */
+    POINTonE1_add_n_dbl(out, in, 16);   /* 52..68: ..0xd201000000010000 */
+}
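As a cross-check on the add/double schedule above: replaying it on a plain 64-bit integer reproduces |z| = 0xd201000000010000, the absolute value of the BLS12-381 curve parameter z. A standalone sketch, not part of the library:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t add_n_dbl_u64(uint64_t acc, unsigned n)
    {
        acc += 1;               /* mirrors POINTonE1_dadd(out, out, in, NULL) */
        while (n--)
            acc <<= 1;          /* mirrors POINTonE1_double(out, out)         */
        return acc;
    }

    static void check_times_minus_z_schedule(void)
    {
        uint64_t acc = 2;                   /* initial double of |in| = 1 */
        acc = add_n_dbl_u64(acc, 2);        /* 0x3  .. 0xc                */
        acc = add_n_dbl_u64(acc, 3);        /* 0xd  .. 0x68               */
        acc = add_n_dbl_u64(acc, 9);        /* 0x69 .. 0xd200             */
        acc = add_n_dbl_u64(acc, 32);       /* ..0xd20100000000           */
        acc = add_n_dbl_u64(acc, 16);       /* ..0xd201000000010000       */
        assert(acc == 0xd201000000010000ULL);
    }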
+
+/*
+ * |u|, |v| are expected to be in Montgomery representation
+ */
+static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v)
+{
+    POINTonE1 p;
+
+    map_to_isogenous_E1(&p, u);
+
+    if (v != NULL) {
+        map_to_isogenous_E1(out, v);    /* borrow |out|                 */
+        POINTonE1_dadd(&p, &p, out, Aprime_E1);
+    }
+
+    isogeny_map_to_E1(&p, &p);          /* sprinkle isogenous powder    */
+
+    /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */
+    POINTonE1_times_minus_z(out, &p);
+    POINTonE1_dadd(out, out, &p, NULL);
+}
+
+void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v)
+{   map_to_g1(out, u, v);   }
+
+static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len,
+                                       const unsigned char *DST, size_t DST_len,
+                                       const unsigned char *aug, size_t aug_len)
+{
+    vec384 u[1];
+
+    hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len);
+    map_to_g1(p, u[0], NULL);
+}
+
+void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len,
+                                     const unsigned char *DST, size_t DST_len,
+                                     const unsigned char *aug, size_t aug_len)
+{   Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len);   }
+
+static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len,
+                                     const unsigned char *DST, size_t DST_len,
+                                     const unsigned char *aug, size_t aug_len)
+{
+    vec384 u[2];
+
+    hash_to_field(u, 2, aug, aug_len, msg, msg_len, DST, DST_len);
+    map_to_g1(p, u[0], u[1]);
+}
+
+void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len,
+                                   const unsigned char *DST, size_t DST_len,
+                                   const unsigned char *aug, size_t aug_len)
+{   Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len);   }
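This exported entry point is what a signer in the minimal-signature-size setting (signatures in G1) calls per message. A sketch using the public blst_p1 type that corresponds to POINTonE1 here; the DST is a hypothetical placeholder:

    #include <stddef.h>
    #include "blst/blst.h"

    void example_hash_to_g1(unsigned char out[48],
                            const unsigned char *msg, size_t msg_len)
    {
        static const unsigned char dst[] = "EXAMPLE-SIG-V01-placeholder-DST";
        blst_p1 pt;

        /* hash the message to a G1 point, then serialize it compressed */
        blst_hash_to_g1(&pt, msg, msg_len, dst, sizeof(dst) - 1, NULL, 0);
        blst_p1_compress(out, &pt);
    }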
+
+static void sigma(POINTonE1 *out, const POINTonE1 *in);
+
+#if 0
+#ifdef __OPTIMIZE_SIZE__
+static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out,
+                                                const POINTonE1 *in)
+{
+    static const byte zz_minus_1_div_by_3[] = {
+        TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156)
+    };
+    size_t n = 126-1;
+    const POINTonE1 *dblin = in;
+
+    while(n--) {
+        POINTonE1_double(out, dblin);   dblin = out;
+        if (is_bit_set(zz_minus_1_div_by_3, n))
+            POINTonE1_dadd(out, out, in, NULL);
+    }
+}
+#else
+static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p)
+{
+    while(n--)
+        POINTonE1_double(out, out);
+    POINTonE1_dadd(out, out, p, NULL);
+}
+
+static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out,
+                                                const POINTonE1 *in)
+{
+    POINTonE1 t3, t5, t7, t11, t85;
+
+    POINTonE1_double(&t7, in);              /* 2P */
+    POINTonE1_dadd(&t3, &t7, in, NULL);     /* 3P */
+    POINTonE1_dadd(&t5, &t3, &t7, NULL);    /* 5P */
+    POINTonE1_dadd(&t7, &t5, &t7, NULL);    /* 7P */
+    POINTonE1_double(&t85, &t5);            /* 10P */
+    POINTonE1_dadd(&t11, &t85, in, NULL);   /* 11P */
+    POINTonE1_dbl_n_add(&t85, 3, &t5);      /* 0x55P */
+                                            /* (-0xd201000000010000^2 - 1) / 3 */
+    POINTonE1_double(out, &t7);             /* 0xe */
+    POINTonE1_dbl_n_add(out, 5,  &t11);     /* 0x1cb */
+    POINTonE1_dbl_n_add(out, 3,  &t3);      /* 0xe5b */
+    POINTonE1_dbl_n_add(out, 3,  in);       /* 0x72d9 */
+    POINTonE1_dbl_n_add(out, 5,  &t3);      /* 0xe5b23 */
+    POINTonE1_dbl_n_add(out, 18, &t85);     /* 0x396c8c0055 */
+    POINTonE1_dbl_n_add(out, 8,  &t85);     /* 0x396c8c005555 */
+    POINTonE1_dbl_n_add(out, 3,  &t7);      /* 0x1cb646002aaaf */
+    POINTonE1_dbl_n_add(out, 7,  &t5);      /* 0xe5b23001555785 */
+    POINTonE1_dbl_n_add(out, 5,  &t11);     /* 0x1cb646002aaaf0ab */
+    POINTonE1_dbl_n_add(out, 41, &t85);     /* 0x396c8c005555e1560000000055 */
+    POINTonE1_dbl_n_add(out, 8,  &t85);     /* 0x396c8c005555e156000000005555 */
+    POINTonE1_dbl_n_add(out, 8,  &t85);     /* 0x396c8c005555e15600000000555555 */
+    POINTonE1_dbl_n_add(out, 8,  &t85);     /* 0x396c8c005555e1560000000055555555 */
+}
+#endif
+
+static bool_t POINTonE1_in_G1(const POINTonE1 *P)
+{
+    POINTonE1 t0, t1, t2;
+
+    /* Bowe, S., "Faster subgroup checks for BLS12-381"                   */
+    sigma(&t0, P);                        /* σ(P)                         */
+    sigma(&t1, &t0);                      /* σ²(P)                        */
+
+    POINTonE1_double(&t0, &t0);           /* 2σ(P)                        */
+    POINTonE1_dadd(&t2, &t1, P, NULL);    /* P +  σ²(P)                   */
+    POINTonE1_cneg(&t2, 1);               /* - P - σ²(P)                  */
+    POINTonE1_dadd(&t2, &t2, &t0, NULL);  /* 2σ(P) - P - σ²(P)            */
+    POINTonE1_times_zz_minus_1_div_by_3(  &t0, &t2);
+    POINTonE1_cneg(&t1, 1);
+    POINTonE1_dadd(&t0, &t0, &t1, NULL);  /* [(z²-1)/3](2σ(P) - P - σ²(P)) */
+                                          /* - σ²(P) */
+    return vec_is_zero(t0.Z, sizeof(t0.Z));
+}
+#else
+static bool_t POINTonE1_in_G1(const POINTonE1 *P)
+{
+    POINTonE1 t0, t1;
+
+    /* Scott, M., https://eprint.iacr.org/2021/1130 */
+    POINTonE1_times_minus_z(&t0, P);
+    POINTonE1_times_minus_z(&t1, &t0);
+    POINTonE1_cneg(&t1, 1);             /* [-z²]P   */
+
+    sigma(&t0, P);                      /* σ(P)     */
+    sigma(&t0, &t0);                    /* σ²(P)    */
+
+    return POINTonE1_is_equal(&t0, &t1);
+}
+#endif
+
+int blst_p1_in_g1(const POINTonE1 *p)
+{   return (int)POINTonE1_in_G1(p);   }
+
+int blst_p1_affine_in_g1(const POINTonE1_affine *p)
+{
+    POINTonE1 P;
+
+    vec_copy(P.X, p->X, 2*sizeof(P.X));
+    vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z),
+                     vec_is_zero(p, sizeof(*p)));
+
+    return (int)POINTonE1_in_G1(&P);
+}
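Untrusted, deserialized points should be subgroup-checked before use. A sketch combining uncompression with the affine check exported above; example_load_checked_p1 is illustrative only:

    #include "blst/blst.h"

    int example_load_checked_p1(blst_p1_affine *out, const unsigned char in[48])
    {
        if (blst_p1_uncompress(out, in) != BLST_SUCCESS)
            return 0;                           /* not a valid encoding      */
        return (int)blst_p1_affine_in_g1(out);  /* 1 iff the point is in G1  */
    }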
diff --git a/blst/map_to_g2.c b/blst/map_to_g2.c
new file mode 100644
index 0000000..90fd86e
--- /dev/null
+++ b/blst/map_to_g2.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "point.h"
+#include "fields.h"
+
+/*
+ * y^2 = x^3 + A'*x + B', isogenous one
+ */
+static const vec384x Aprime_E2 = {      /* 240*i */
+  { 0 },
+  { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285),
+    TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601),
+    TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }
+};
+static const vec384x Bprime_E2 = {      /* 1012 + 1012*i */
+  { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4),
+    TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba),
+    TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) },
+  { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4),
+    TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba),
+    TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }
+};
+
+static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[],
+                             const vec384x Zz_powers[], size_t n)
+{
+    while (n--)
+        mul_fp2(map[n], isogeny_map[n], Zz_powers[n]);
+}
+
+static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n)
+{
+    while (n--) {
+        mul_fp2(acc, acc, x);
+        add_fp2(acc, acc, map[n]);
+    }
+}
+
+static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p)
+{
+    /*
+     * x = x_num / x_den, where
+     * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0)
+     * ...
+     */
+    static const vec384x isogeny_map_x_num[] = {    /* (k_(1,*)<<384) % P   */
+     {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e),
+        TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062),
+        TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) },
+      { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e),
+        TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062),
+        TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }},
+     {{ 0 },
+      { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3),
+        TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945),
+        TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }},
+     {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae),
+        TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c),
+        TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) },
+      { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551),
+        TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2),
+        TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }},
+     {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e),
+        TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1),
+        TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) },
+      { 0 }}
+    };
+    /* ...
+     * x_den = x'^2 + k_(2,1) * x' + k_(2,0)
+     */
+    static const vec384x isogeny_map_x_den[] = {    /* (k_(2,*)<<384) % P   */
+     {{ 0 },
+      { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e),
+        TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18),
+        TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }},
+     {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020),
+        TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6),
+        TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) },
+      { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf),
+        TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8),
+        TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }}
+    };
+    /*
+     * y = y' * y_num / y_den, where
+     * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0)
+     * ...
+     */
+    static const vec384x isogeny_map_y_num[] = {    /* (k_(3,*)<<384) % P   */
+     {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2),
+        TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1),
+        TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) },
+      { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2),
+        TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1),
+        TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }},
+     {{ 0 },
+      { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd),
+        TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5),
+        TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }},
+     {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8),
+        TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251),
+        TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) },
+      { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556),
+        TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e),
+        TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }},
+     {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425),
+        TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e),
+        TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) },
+      { 0 }}
+    };
+    /* ...
+     * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0)
+     */
+    static const vec384x isogeny_map_y_den[] = {    /* (k_(4,*)<<384) % P   */
+     {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75),
+        TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5),
+        TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) },
+      { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75),
+        TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5),
+        TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }},
+     {{ 0 },
+      { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba),
+        TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a),
+        TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }},
+     {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030),
+        TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9),
+        TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) },
+      { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf),
+        TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915),
+        TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }}
+    };
+    vec384x Zz_powers[3], map[3], xn, xd, yn, yd;
+
+    /* lay down Z^2 powers in descending order                          */
+    sqr_fp2(Zz_powers[2], p->Z);                       /* ZZ^1          */
+    sqr_fp2(Zz_powers[1], Zz_powers[2]);               /* ZZ^2  1+1     */
+    mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3  2+1     */
+
+    map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3);
+    mul_fp2(xn, p->X, isogeny_map_x_num[3]);
+    add_fp2(xn, xn, map[2]);
+    map_fp2(xn, p->X, map, 2);
+
+    map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2);
+    add_fp2(xd, p->X, map[1]);
+    map_fp2(xd, p->X, map, 1);
+    mul_fp2(xd, xd, Zz_powers[2]);      /* xd *= Z^2                    */
+
+    map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3);
+    mul_fp2(yn, p->X, isogeny_map_y_num[3]);
+    add_fp2(yn, yn, map[2]);
+    map_fp2(yn, p->X, map, 2);
+    mul_fp2(yn, yn, p->Y);              /* yn *= Y                      */
+
+    map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3);
+    add_fp2(yd, p->X, map[2]);
+    map_fp2(yd, p->X, map, 2);
+    mul_fp2(Zz_powers[2], Zz_powers[2], p->Z);
+    mul_fp2(yd, yd, Zz_powers[2]);      /* yd *= Z^3                    */
+
+    /* convert (xn, xd, yn, yd) to Jacobian coordinates                 */
+    mul_fp2(out->Z, xd, yd);            /* Z = xd * yd                  */
+    mul_fp2(out->X, xn, yd);
+    mul_fp2(out->X, out->X, out->Z);    /* X = xn * xd * yd^2           */
+    sqr_fp2(out->Y, out->Z);
+    mul_fp2(out->Y, out->Y, xd);
+    mul_fp2(out->Y, out->Y, yn);        /* Y = yn * xd^3 * yd^2         */
+}
+
+static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u)
+{
+    static const vec384x minus_A = {
+      { 0 },
+      { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79),
+        TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd),
+        TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) }
+    };
+    static const vec384x Z = {              /* -2 - i */
+      { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa),
+        TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4),
+        TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) },
+      { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd),
+        TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a),
+        TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) }
+    };
+    static const vec384x recip_ZZZ = {      /* 1/(Z^3) */
+      { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916),
+        TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f),
+        TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) },
+      { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604),
+        TO_LIMB_T(0x67e495a909e7a18e), TO_LIMB_T(0xdf2da23b8145b8f7),
+        TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) }
+    };
+    static const vec384x magic_ZZZ = {      /* 1/Z^3 = a + b*i */
+                                            /* a^2 + b^2 */
+      { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374),
+        TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7),
+        TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) },
+                                            /* (a^2 + b^2)^((P-3)/4) */
+      { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19),
+        TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff),
+        TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) }
+    };
+    static const vec384x ZxA = {            /* 240 - 480*i */
+      { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285),
+        TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601),
+        TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) },
+      { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3),
+        TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a),
+        TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) }
+    };
+    vec384x uu, tv2, tv4, x2n, gx1, gxd, y2;
+#if 0
+    vec384x xn, x1n, xd, y, y1, Zuu;
+#else
+# define xn     p->X
+# define y      p->Y
+# define xd     p->Z
+# define x1n    xn
+# define y1     y
+# define Zuu    x2n
+#endif
+#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1)
+    bool_t e1, e2;
+
+    /*
+     * as per map_to_curve() from poc/sswu_opt.sage at
+     * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve
+     * with 9mod16 twists...
+     */
+    /* x numerator variants                                             */
+    sqr_fp2(uu, u);                     /* uu = u^2                     */
+    mul_fp2(Zuu, Z, uu);                /* Zuu = Z * uu                 */
+    sqr_fp2(tv2, Zuu);                  /* tv2 = Zuu^2                  */
+    add_fp2(tv2, tv2, Zuu);             /* tv2 = tv2 + Zuu              */
+    add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1                */
+    mul_fp2(x1n, x1n, Bprime_E2);       /* x1n = x1n * B                */
+    mul_fp2(x2n, Zuu, x1n);             /* x2n = Zuu * x1n              */
+
+    /* x denominator                                                    */
+    mul_fp2(xd, minus_A, tv2);          /* xd = -A * tv2                */
+    e1 = vec_is_zero(xd, sizeof(xd));   /* e1 = xd == 0                 */
+    vec_select(xd, ZxA, xd, sizeof(xd), e1);    /*              # If xd == 0, set xd = Z*A */
+
+    /* y numerator variants                                             */
+    sqr_fp2(tv2, xd);                   /* tv2 = xd^2                   */
+    mul_fp2(gxd, xd, tv2);              /* gxd = xd^3                   */
+    mul_fp2(tv2, Aprime_E2, tv2);       /* tv2 = A * tv2                */
+    sqr_fp2(gx1, x1n);                  /* gx1 = x1n^2                  */
+    add_fp2(gx1, gx1, tv2);             /* gx1 = gx1 + tv2      # x1n^2 + A*xd^2 */
+    mul_fp2(gx1, gx1, x1n);             /* gx1 = gx1 * x1n      # x1n^3 + A*x1n*xd^2 */
+    mul_fp2(tv2, Bprime_E2, gxd);       /* tv2 = B * gxd                */
+    add_fp2(gx1, gx1, tv2);             /* gx1 = gx1 + tv2      # x1^3 + A*x1*xd^2 + B*xd^3 */
+    sqr_fp2(tv4, gxd);                  /* tv4 = gxd^2                  */
+    mul_fp2(tv2, gx1, gxd);             /* tv2 = gx1 * gxd              */
+    mul_fp2(tv4, tv4, tv2);             /* tv4 = tv4 * tv2      # gx1*gxd^3 */
+    e2 = recip_sqrt_fp2(y1, tv4,        /* y1 = tv4^c1          # (gx1*gxd^3)^((p^2-9)/16) */
+                        recip_ZZZ, magic_ZZZ);
+    mul_fp2(y1, y1, tv2);               /* y1 = y1 * tv2        # gx1*gxd*y1 */
+    mul_fp2(y2, y1, uu);                /* y2 = y1 * uu                 */
+    mul_fp2(y2, y2, u);                 /* y2 = y2 * u                  */
+
+    /* choose numerators                                                */
+    vec_select(xn, x1n, x2n, sizeof(xn), e2);   /* xn = e2 ? x1n : x2n  */
+    vec_select(y, y1, y2, sizeof(y), e2);       /* y  = e2 ? y1 : y2    */
+
+    e1 = sgn0_fp2(u);
+    e2 = sgn0_fp2(y);
+    cneg_fp2(y, y, e1^e2);              /* fix sign of y                */
+                                        /* return (xn, xd, y, 1)        */
+
+    /* convert (xn, xd, y, 1) to Jacobian projective coordinates        */
+    mul_fp2(p->X, xn, xd);              /* X = xn * xd                  */
+    mul_fp2(p->Y, y, gxd);              /* Y = y * xd^3                 */
+#ifndef xd
+    vec_copy(p->Z, xd, sizeof(xd));     /* Z = xd                       */
+#else
+# undef xn
+# undef y
+# undef xd
+# undef x1n
+# undef y1
+# undef Zuu
+# undef tv4
+#endif
+#undef sgn0_fp2
+}
+
+#if 0
+static const byte h_eff[] = {
+    TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4),
+    TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a),
+    TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95),
+    TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768),
+    TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3)
+};
+
+static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p)
+{    POINTonE2_mult_w5(out, p, h_eff, 636);   }
+#else
+/*
+ * As per suggestions in "7. Clearing the cofactor" at
+ * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06
+ */
+static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n)
+{
+    POINTonE2_dadd(out, out, p, NULL);
+    while(n--)
+        POINTonE2_double(out, out);
+}
+
+static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in)
+{
+    POINTonE2_double(out, in);          /*      1: 0x2                  */
+    POINTonE2_add_n_dbl(out, in, 2);    /*   2..4: 0x3..0xc             */
+    POINTonE2_add_n_dbl(out, in, 3);    /*   5..8: 0xd..0x68            */
+    POINTonE2_add_n_dbl(out, in, 9);    /*  9..18: 0x69..0xd200         */
+    POINTonE2_add_n_dbl(out, in, 32);   /* 19..51: ..0xd20100000000     */
+    POINTonE2_add_n_dbl(out, in, 16);   /* 52..68: ..0xd201000000010000 */
+}
+
+static void psi(POINTonE2 *out, const POINTonE2 *in);
+
+static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p)
+{
+    POINTonE2 t0, t1;
+
+    /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves"  */
+    POINTonE2_double(out, p);           /* out = 2P                     */
+    psi(out, out);                      /* out = Ψ(2P)                  */
+    psi(out, out);                      /* out = Ψ²(2P)                 */
+
+    vec_copy(&t0, p, sizeof(t0));
+    POINTonE2_cneg(&t0, 1);             /* t0 = -P                      */
+    psi(&t1, &t0);                      /* t1 = -Ψ(P)                   */
+    POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P             */
+    POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P)      */
+
+    POINTonE2_times_minus_z(&t0, p);    /* t0 = [-z]P                   */
+    POINTonE2_dadd(&t0, &t0, p, NULL);  /* t0 = [-z + 1]P               */
+    POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P)        */
+    POINTonE2_times_minus_z(&t1, &t0);  /* t1 = [z² - z]P + [z]Ψ(P)     */
+    POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P          */
+                                        /*     + [z - 1]Ψ(P)            */
+                                        /*     + Ψ²(2P)                 */
+}
+#endif
+
+/*
+ * |u|, |v| are expected to be in Montgomery representation
+ */
+static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v)
+{
+    POINTonE2 p;
+
+    map_to_isogenous_E2(&p, u);
+
+    if (v != NULL) {
+        map_to_isogenous_E2(out, v);    /* borrow |out|                 */
+        POINTonE2_dadd(&p, &p, out, Aprime_E2);
+    }
+
+    isogeny_map_to_E2(&p, &p);          /* sprinkle isogenous powder    */
+    clear_cofactor(out, &p);
+}
+
+void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v)
+{   map_to_g2(out, u, v);   }
+
+static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len,
+                                       const unsigned char *DST, size_t DST_len,
+                                       const unsigned char *aug, size_t aug_len)
+{
+    vec384x u[1];
+
+    hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len);
+    map_to_g2(p, u[0], NULL);
+}
+
+void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len,
+                                     const unsigned char *DST, size_t DST_len,
+                                     const unsigned char *aug, size_t aug_len)
+{   Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len);   }
+
+static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len,
+                                     const unsigned char *DST, size_t DST_len,
+                                     const unsigned char *aug, size_t aug_len)
+{
+    vec384x u[2];
+
+    hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len);
+    map_to_g2(p, u[0], u[1]);
+}
+
+void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len,
+                                   const unsigned char *DST, size_t DST_len,
+                                   const unsigned char *aug, size_t aug_len)
+{   Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len);   }
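The G2 entry point mirrors the G1 case and is what the minimal-pubkey-size setting (signatures in G2) uses for message points. A sketch assuming blst_p2_compress from the public header; the DST handling is left to the caller:

    #include <stddef.h>
    #include "blst/blst.h"

    void example_hash_to_g2(unsigned char out[96],
                            const unsigned char *msg, size_t msg_len,
                            const unsigned char *dst, size_t dst_len)
    {
        blst_p2 pt;

        /* hash the message to a G2 point, then serialize it compressed */
        blst_hash_to_g2(&pt, msg, msg_len, dst, dst_len, NULL, 0);
        blst_p2_compress(out, &pt);
    }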
+
+static bool_t POINTonE2_in_G2(const POINTonE2 *P)
+{
+#if 0
+    POINTonE2 t0, t1, t2;
+
+    /* Bowe, S., "Faster subgroup checks for BLS12-381"                 */
+    psi(&t0, P);                        /* Ψ(P)                         */
+    psi(&t0, &t0);                      /* Ψ²(P)                        */
+    psi(&t1, &t0);                      /* Ψ³(P)                        */
+
+    POINTonE2_times_minus_z(&t2, &t1);
+    POINTonE2_dadd(&t0, &t0, &t2, NULL);
+    POINTonE2_cneg(&t0, 1);
+    POINTonE2_dadd(&t0, &t0, P, NULL);  /* [z]Ψ³(P) - Ψ²(P) + P         */
+
+    return vec_is_zero(t0.Z, sizeof(t0.Z));
+#else
+    POINTonE2 t0, t1;
+
+    /* Scott, M., https://eprint.iacr.org/2021/1130 */
+    psi(&t0, P);                            /* Ψ(P) */
+
+    POINTonE2_times_minus_z(&t1, P);
+    POINTonE2_cneg(&t1, 1);                 /* [z]P */
+
+    return POINTonE2_is_equal(&t0, &t1);
+#endif
+}
+
+int blst_p2_in_g2(const POINTonE2 *p)
+{   return (int)POINTonE2_in_G2(p);   }
+
+int blst_p2_affine_in_g2(const POINTonE2_affine *p)
+{
+    POINTonE2 P;
+
+    vec_copy(P.X, p->X, 2*sizeof(P.X));
+    vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z),
+                     vec_is_zero(p, sizeof(*p)));
+
+    return (int)POINTonE2_in_G2(&P);
+}
diff --git a/blst/multi_scalar.c b/blst/multi_scalar.c
new file mode 100644
index 0000000..d0b3dee
--- /dev/null
+++ b/blst/multi_scalar.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "fields.h"
+#include "point.h"
+
+/*
+ * An infinite point among the inputs would be devastating. Should we
+ * change that?
+ */
+#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \
+static void ptype##s_to_affine(ptype##_affine dst[], \
+                               const ptype *const points[], size_t npoints) \
+{ \
+    size_t i; \
+    vec##bits *acc, ZZ, ZZZ; \
+    const ptype *point = NULL; \
+    const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \
+\
+    while (npoints) { \
+        const ptype *p, *const *walkback; \
+        size_t delta = stride<npoints ? stride : npoints; \
+\
+        point = *points ? *points++ : point+1; \
+        acc = (vec##bits *)dst; \
+        vec_copy(acc++, point->Z, sizeof(vec##bits)); \
+        for (i = 1; i < delta; i++, acc++) \
+            point = *points ? *points++ : point+1, \
+            mul_##field(acc[0], acc[-1], point->Z); \
+\
+        --acc; reciprocal_##field(acc[0], acc[0]); \
+\
+        walkback = points-1, p = point, --delta, dst += delta; \
+        for (i = 0; i < delta; i++, acc--, dst--) { \
+            mul_##field(acc[-1], acc[-1], acc[0]);  /* 1/Z        */\
+            sqr_##field(ZZ, acc[-1]);               /* 1/Z^2      */\
+            mul_##field(ZZZ, ZZ, acc[-1]);          /* 1/Z^3      */\
+            mul_##field(acc[-1], p->Z, acc[0]);     \
+            mul_##field(dst->X,  p->X, ZZ);         /* X = X'/Z^2 */\
+            mul_##field(dst->Y,  p->Y, ZZZ);        /* Y = Y'/Z^3 */\
+            p = (p == *walkback) ? *--walkback : p-1; \
+        } \
+        sqr_##field(ZZ, acc[0]);                    /* 1/Z^2      */\
+        mul_##field(ZZZ, ZZ, acc[0]);               /* 1/Z^3      */\
+        mul_##field(dst->X, p->X, ZZ);              /* X = X'/Z^2 */\
+        mul_##field(dst->Y, p->Y, ZZZ);             /* Y = Y'/Z^3 */\
+        ++delta, dst += delta, npoints -= delta; \
+    } \
+} \
+\
+void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \
+                         size_t npoints) \
+{   ptype##s_to_affine(dst, points, npoints);   }
+
+POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp)
+POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2)
+
+/*
+ * This is a two-step multi-scalar multiplication procedure. First, given
+ * a set of points you pre-compute a table for a chosen windowing factor
+ * [expressed in bits with a value between 2 and 14], and then you pass
+ * this table to the actual multiplication procedure along with scalars.
+ * The idea is that the pre-computed table will be reused multiple times,
+ * in which case multiplication runs faster than the Pippenger algorithm
+ * implementation below for up to ~16K points with wbits=8, naturally at
+ * the expense of a multi-megabyte table. One can trade even more memory
+ * for performance, but each wbits increment doubles the memory
+ * requirement, so at some point it gets prohibitively large... For
+ * reference, without reusing the table it's faster than the Pippenger
+ * algorithm for up to ~32 points [with wbits=5]...
+ */
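+
+/* A worked example of the trade-off (illustrative numbers only, derived
+ * from the _precompute_sizeof formula below): the table holds
+ * npoints << (wbits-1) affine points, so for P1 (96-byte affine points)
+ * with wbits=8 that is 128 * 96 = 12288 bytes per input point, i.e.
+ * roughly 12 MB for 1000 points, doubling with every wbits increment.
+ */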
+
+#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096)
+
+#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \
+static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \
+                                         const ptype##_affine *point) \
+{ \
+    size_t i, j, n = (size_t)1 << (wbits-1); \
+                                          /* row[-1] is implicit infinity */\
+    vec_copy(&row[0], point, sizeof(*point));           /* row[0]=p*1     */\
+    vec_copy(&row[0].Z, one, sizeof(row[0].Z));                             \
+    ptype##_double(&row[1],  &row[0]);                  /* row[1]=p*(1+1) */\
+    for (i = 2, j = 1; i < n; i += 2, j++) \
+        ptype##_add_affine(&row[i], &row[i-1], point),  /* row[2]=p*(2+1) */\
+        ptype##_double(&row[i+1], &row[j]);             /* row[3]=p*(2+2) */\
+}                                                       /* row[4] ...     */\
+\
+static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \
+                                         size_t wbits, size_t npoints) \
+{ \
+    size_t total = npoints << (wbits-1); \
+    size_t nwin = (size_t)1 << (wbits-1); \
+    size_t i, j; \
+    vec##bits *acc, ZZ, ZZZ; \
+\
+    src += total; \
+    acc = (vec##bits *)src; \
+    vec_copy(acc++, one, sizeof(vec##bits)); \
+    for (i = 0; i < npoints; i++) \
+        for (j = nwin; --src, --j; acc++)    \
+            mul_##field(acc[0], acc[-1], src->Z); \
+\
+    --acc; reciprocal_##field(acc[0], acc[0]); \
+\
+    for (i = 0; i < npoints; i++) { \
+        vec_copy(dst++, src++, sizeof(ptype##_affine)); \
+        for (j = 1; j < nwin; j++, acc--, src++, dst++) { \
+            mul_##field(acc[-1], acc[-1], acc[0]);  /* 1/Z        */\
+            sqr_##field(ZZ, acc[-1]);               /* 1/Z^2      */\
+            mul_##field(ZZZ, ZZ, acc[-1]);          /* 1/Z^3      */\
+            mul_##field(acc[-1], src->Z, acc[0]);                   \
+            mul_##field(dst->X, src->X, ZZ);        /* X = X'/Z^2 */\
+            mul_##field(dst->Y, src->Y, ZZZ);       /* Y = Y'/Z^3 */\
+        } \
+    } \
+} \
+\
+/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\
+static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \
+                                      const ptype##_affine *const points[], \
+                                      size_t npoints) \
+{ \
+    size_t total = npoints << (wbits-1); \
+    size_t nwin = (size_t)1 << (wbits-1); \
+    size_t nmin = wbits>9 ? (size_t)1: (size_t)1 << (9-wbits); \
+    size_t i, top = 0; \
+    ptype *rows, *row; \
+    const ptype##_affine *point = NULL; \
+    size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \
+    if (stride == 0) stride = 1; \
+\
+    while (npoints >= nmin) { \
+        size_t limit = total - npoints; \
+\
+        if (top + (stride << wbits) > limit) { \
+            stride = (limit - top) >> wbits;   \
+            if (stride == 0) break;            \
+        } \
+        rows = row = (ptype *)(&table[top]); \
+        for (i = 0; i < stride; i++, row += nwin) \
+            point = *points ? *points++ : point+1, \
+            ptype##_precompute_row_wbits(row, wbits, point); \
+        ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \
+        top += stride << (wbits-1); \
+        npoints -= stride; \
+    } \
+    rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \
+    for (i = 0; i < npoints; i++, row += nwin) \
+        point = *points ? *points++ : point+1, \
+        ptype##_precompute_row_wbits(row, wbits, point); \
+    ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \
+} \
+\
+size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \
+{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \
+void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \
+                                     const ptype##_affine *const points[], \
+                                     size_t npoints) \
+{ ptype##s_precompute_wbits(table, wbits, points, npoints); }
+
+#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \
+static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \
+                                       size_t wbits, limb_t booth_idx) \
+{ \
+    bool_t booth_sign = (booth_idx >> wbits) & 1; \
+    bool_t idx_is_zero; \
+    static const ptype##_affine infinity = { 0 }; \
+\
+    booth_idx &= ((limb_t)1 << wbits) - 1; \
+    idx_is_zero = is_zero(booth_idx); \
+    booth_idx -= 1 ^ idx_is_zero; \
+    vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \
+    ptype##_cneg(p, booth_sign); \
+} \
+\
+static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \
+                                size_t wbits, size_t npoints, \
+                                const byte *const scalars[], size_t nbits, \
+                                ptype scratch[]) \
+{ \
+    limb_t wmask, wval; \
+    size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \
+    const byte *scalar, *const *scalar_s = scalars; \
+    const ptype##_affine *row = table; \
+\
+    size_t scratch_sz = SCRATCH_SZ(ptype); \
+    if (scratch == NULL) { \
+        scratch_sz /= 4; /* limit to 288K */ \
+        scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \
+        scratch = alloca(sizeof(ptype) * scratch_sz); \
+    } \
+\
+    nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \
+    scalar = *scalar_s++; \
+\
+    /* top excess bits modulo target window size */ \
+    window = nbits % wbits; /* yes, it may be zero */ \
+    wmask = ((limb_t)1 << (window + 1)) - 1; \
+\
+    nbits -= window; \
+    z = is_zero(nbits); \
+    wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \
+    wval = booth_encode(wval, wbits); \
+    ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \
+    row += nwin; \
+\
+    i = 1; vec_zero(ret, sizeof(*ret)); \
+    while (nbits > 0) { \
+        for (j = i; i < npoints; i++, j++, row += nwin) { \
+            if (j == scratch_sz) \
+                ptype##s_accumulate(ret, scratch, j), j = 0; \
+            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
+            wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \
+            wval = booth_encode(wval, wbits); \
+            ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \
+        } \
+        ptype##s_accumulate(ret, scratch, j); \
+\
+        for (j = 0; j < wbits; j++) \
+            ptype##_double(ret, ret); \
+\
+        window = wbits; \
+        wmask = ((limb_t)1 << (window + 1)) - 1; \
+        nbits -= window; \
+        i = 0; row = table; scalar_s = scalars; \
+    } \
+\
+    for (j = i; i < npoints; i++, j++, row += nwin) { \
+        if (j == scratch_sz) \
+            ptype##s_accumulate(ret, scratch, j), j = 0; \
+        scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
+        wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \
+        wval = booth_encode(wval, wbits); \
+        ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \
+    } \
+    ptype##s_accumulate(ret, scratch, j); \
+} \
+\
+size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \
+{ \
+    const size_t scratch_sz = SCRATCH_SZ(ptype); \
+    return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \
+} \
+void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \
+                          size_t wbits, size_t npoints, \
+                          const byte *const scalars[], size_t nbits, \
+                          ptype scratch[]) \
+{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); }
+
+PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p)
+POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p)
+
+PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
+POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
+
+/*
+ * Pippenger algorithm implementation, the fastest option for larger
+ * numbers of points...
+ */
+
+static size_t pippenger_window_size(size_t npoints)
+{
+    size_t wbits;
+
+    for (wbits=0; npoints>>=1; wbits++) ;
+
+    return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1));
+}
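+
+/* For orientation, a few sample values of the heuristic above, where
+ * the intermediate wbits is floor(log2(npoints)):
+ * npoints=10 -> 2, npoints=100 -> 4, npoints=1024 -> 8,
+ * npoints=65536 -> 13.
+ */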
+
+#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \
+typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz;
+
+#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \
+static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \
+                                                  size_t wbits) \
+{ \
+    ptype##xyzz ret[1], acc[1]; \
+    size_t n = (size_t)1 << wbits; \
+\
+    /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. */\
+    vec_copy(acc, &buckets[--n], sizeof(acc)); \
+    vec_copy(ret, &buckets[n], sizeof(ret)); \
+    vec_zero(&buckets[n], sizeof(buckets[n])); \
+    while (n--) { \
+        ptype##xyzz_dadd(acc, acc, &buckets[n]); \
+        ptype##xyzz_dadd(ret, ret, acc); \
+        vec_zero(&buckets[n], sizeof(buckets[n])); \
+    } \
+    ptype##xyzz_to_Jacobian(out, ret); \
+} \
+\
+static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \
+                           size_t wbits, const ptype##_affine *p) \
+{ \
+    bool_t booth_sign = (booth_idx >> wbits) & 1; \
+\
+    booth_idx &= (1<<wbits) - 1; \
+    if (booth_idx--) \
+        ptype##xyzz_dadd_affine(&buckets[booth_idx], &buckets[booth_idx], \
+                                                     p, booth_sign); \
+} \
+\
+static void ptype##_prefetch(const ptype##xyzz buckets[], limb_t booth_idx, \
+                             size_t wbits) \
+{ \
+    booth_idx &= (1<<wbits) - 1; \
+    if (booth_idx--) \
+        vec_prefetch(&buckets[booth_idx], sizeof(buckets[booth_idx])); \
+} \
+\
+static void ptype##s_tile_pippenger(ptype *ret, \
+                                    const ptype##_affine *const points[], \
+                                    size_t npoints, \
+                                    const byte *const scalars[], size_t nbits, \
+                                    ptype##xyzz buckets[], \
+                                    size_t bit0, size_t wbits, size_t cbits) \
+{ \
+    limb_t wmask, wval, wnxt; \
+    size_t i, z, nbytes; \
+    const byte *scalar = *scalars++; \
+    const ptype##_affine *point = *points++; \
+\
+    nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \
+    wmask = ((limb_t)1 << (wbits+1)) - 1; \
+    z = is_zero(bit0); \
+    bit0 -= z^1; wbits += z^1; \
+    wval = (get_wval_limb(scalar, bit0, wbits) << z) & wmask; \
+    wval = booth_encode(wval, cbits); \
+    scalar = *scalars ? *scalars++ : scalar+nbytes; \
+    wnxt = (get_wval_limb(scalar, bit0, wbits) << z) & wmask; \
+    wnxt = booth_encode(wnxt, cbits); \
+    npoints--;  /* account for prefetch */ \
+\
+    ptype##_bucket(buckets, wval, cbits, point); \
+    for (i = 1; i < npoints; i++) { \
+        wval = wnxt; \
+        scalar = *scalars ? *scalars++ : scalar+nbytes; \
+        wnxt = (get_wval_limb(scalar, bit0, wbits) << z) & wmask; \
+        wnxt = booth_encode(wnxt, cbits); \
+        ptype##_prefetch(buckets, wnxt, cbits); \
+        point = *points ? *points++ : point+1; \
+        ptype##_bucket(buckets, wval, cbits, point); \
+    } \
+    point = *points ? *points++ : point+1; \
+    ptype##_bucket(buckets, wnxt, cbits, point); \
+    ptype##_integrate_buckets(ret, buckets, cbits - 1); \
+} \
+\
+static void ptype##s_mult_pippenger(ptype *ret, \
+                                    const ptype##_affine *const points[], \
+                                    size_t npoints, \
+                                    const byte *const scalars[], size_t nbits, \
+                                    ptype##xyzz buckets[], size_t window) \
+{ \
+    size_t i, wbits, cbits, bit0 = nbits; \
+    ptype tile[1]; \
+\
+    window = window ? window : pippenger_window_size(npoints); \
+    vec_zero(buckets, sizeof(buckets[0]) << (window-1)); \
+    vec_zero(ret, sizeof(*ret)); \
+\
+    /* top excess bits modulo target window size */ \
+    wbits = nbits % window; /* yes, it may be zero */ \
+    cbits = wbits + 1; \
+    while (bit0 -= wbits) { \
+        ptype##s_tile_pippenger(tile, points, npoints, scalars, nbits, \
+                                      buckets, bit0, wbits, cbits); \
+        ptype##_dadd(ret, ret, tile, NULL); \
+        for (i = 0; i < window; i++) \
+            ptype##_double(ret, ret); \
+        cbits = wbits = window; \
+    } \
+    ptype##s_tile_pippenger(tile, points, npoints, scalars, nbits, \
+                                  buckets, 0, wbits, cbits); \
+    ptype##_dadd(ret, ret, tile, NULL); \
+} \
+\
+size_t prefix##s_mult_pippenger_scratch_sizeof(size_t npoints) \
+{   return sizeof(ptype##xyzz) << (pippenger_window_size(npoints)-1);   } \
+void prefix##s_tile_pippenger(ptype *ret, \
+                              const ptype##_affine *const points[], \
+                              size_t npoints, \
+                              const byte *const scalars[], size_t nbits, \
+                              ptype##xyzz scratch[], \
+                              size_t bit0, size_t window) \
+{ \
+    size_t wbits, cbits; \
+\
+    if (bit0 + window > nbits)  wbits = nbits - bit0, cbits = wbits + 1; \
+    else                        wbits = cbits = window; \
+    ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \
+                                 bit0, wbits, cbits); \
+} \
+void prefix##s_mult_pippenger(ptype *ret, \
+                              const ptype##_affine *const points[], \
+                              size_t npoints, \
+                              const byte *const scalars[], size_t nbits, \
+                              ptype##xyzz scratch[]) \
+{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); }
+
+DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384)
+POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp)
+POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp)
+POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p)
+POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1)
+
+DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x)
+POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2)
+POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2)
+POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
+POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2)
diff --git a/blst/no_asm.h b/blst/no_asm.h
new file mode 100644
index 0000000..4f12f53
--- /dev/null
+++ b/blst/no_asm.h
@@ -0,0 +1,1287 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#if LIMB_T_BITS==32
+typedef unsigned long long llimb_t;
+#endif
+
+#if defined(__clang__)
+# pragma GCC diagnostic ignored "-Wstatic-in-inline"
+#endif
+
+static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[],
+                       const limb_t p[], limb_t n0, size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, borrow, mx, hi, tmp[n+1], carry;
+    size_t i, j;
+
+    for (mx=b[0], hi=0, i=0; i<n; i++) {
+        limbx = (mx * (llimb_t)a[i]) + hi;
+        tmp[i] = (limb_t)limbx;
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+    mx = n0*tmp[0];
+    tmp[i] = hi;
+
+    for (carry=0, j=0; ; ) {
+        limbx = (mx * (llimb_t)p[0]) + tmp[0];
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+        for (i=1; i<n; i++) {
+            limbx = (mx * (llimb_t)p[i] + hi) + tmp[i];
+            tmp[i-1] = (limb_t)limbx;
+            hi = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+        limbx = tmp[i] + (hi + (llimb_t)carry);
+        tmp[i-1] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+
+        if (++j==n)
+            break;
+
+        for (mx=b[j], hi=0, i=0; i<n; i++) {
+            limbx = (mx * (llimb_t)a[i] + hi) + tmp[i];
+            tmp[i] = (limb_t)limbx;
+            hi = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+        mx = n0*tmp[0];
+        limbx = hi + (llimb_t)carry;
+        tmp[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        ret[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = carry - borrow;
+
+    for(i=0; i<n; i++)
+        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
+}
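+
+/* A summary of the routine above: this is word-by-word (CIOS-style)
+ * Montgomery multiplication, i.e. ret = a * b * 2^(-n*LIMB_T_BITS) mod p,
+ * with n0 being the usual Montgomery constant -p^(-1) mod 2^LIMB_T_BITS;
+ * the masked copy at the end is the constant-time conditional
+ * subtraction of p.
+ */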
+
+#define MUL_MONT_IMPL(bits) \
+inline void mul_mont_##bits(vec##bits ret, const vec##bits a, \
+                            const vec##bits b, const vec##bits p, limb_t n0) \
+{   mul_mont_n(ret, a, b, p, n0, NLIMBS(bits));   } \
+\
+inline void sqr_mont_##bits(vec##bits ret, const vec##bits a, \
+                            const vec##bits p, limb_t n0) \
+{   mul_mont_n(ret, a, a, p, n0, NLIMBS(bits));   }
+
+/*
+ * The 256-bit subroutines can handle an arbitrary modulus, even a
+ * non-"sparse" one, but we have to harmonize the naming with the
+ * assembly implementations.
+ */
+#define mul_mont_256 mul_mont_sparse_256
+#define sqr_mont_256 sqr_mont_sparse_256
+MUL_MONT_IMPL(256)
+#undef mul_mont_256
+#undef sqr_mont_256
+MUL_MONT_IMPL(384)
+
+static void add_mod_n(limb_t ret[], const limb_t a[], const limb_t b[],
+                      const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, carry, borrow, tmp[n];
+    size_t i;
+
+    for (carry=0, i=0; i<n; i++) {
+        limbx = a[i] + (b[i] + (llimb_t)carry);
+        tmp[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        ret[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = carry - borrow;
+
+    for(i=0; i<n; i++)
+        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
+}
+
+#define ADD_MOD_IMPL(bits) \
+inline void add_mod_##bits(vec##bits ret, const vec##bits a, \
+                           const vec##bits b, const vec##bits p) \
+{   add_mod_n(ret, a, b, p, NLIMBS(bits));   }
+
+ADD_MOD_IMPL(256)
+ADD_MOD_IMPL(384)
+
+static void sub_mod_n(limb_t ret[], const limb_t a[], const limb_t b[],
+                      const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, carry, borrow;
+    size_t i;
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = a[i] - (b[i] + (llimb_t)borrow);
+        ret[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = 0 - borrow;
+
+    for (carry=0, i=0; i<n; i++) {
+        limbx = ret[i] + ((p[i] & mask) + (llimb_t)carry);
+        ret[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+}
+
+#define SUB_MOD_IMPL(bits) \
+inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \
+                           const vec##bits b, const vec##bits p) \
+{   sub_mod_n(ret, a, b, p, NLIMBS(bits));   }
+
+SUB_MOD_IMPL(256)
+SUB_MOD_IMPL(384)
+
+static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[],
+                           size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, carry, borrow, tmp[n], two_a[n];
+    size_t i;
+
+    for (carry=0, i=0; i<n; i++) {
+        limb_t a_i = a[i];
+        tmp[i] = a_i<<1 | carry;
+        carry = a_i>>(LIMB_T_BITS-1);
+    }
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        two_a[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = carry - borrow;
+
+    for(i=0; i<n; i++)
+        two_a[i] = (two_a[i] & ~mask) | (tmp[i] & mask);
+
+    for (carry=0, i=0; i<n; i++) {
+        limbx = a[i] + (two_a[i] + (llimb_t)carry);
+        tmp[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        ret[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = carry - borrow;
+
+    for(i=0; i<n; i++)
+        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
+}
+
+#define MUL_BY_3_MOD_IMPL(bits) \
+inline void mul_by_3_mod_##bits(vec##bits ret, const vec##bits a, \
+                                const vec##bits p) \
+{   mul_by_3_mod_n(ret, a, p, NLIMBS(bits));   }
+
+MUL_BY_3_MOD_IMPL(256)
+MUL_BY_3_MOD_IMPL(384)
+
+static void lshift_mod_n(limb_t ret[], const limb_t a[], size_t count,
+                         const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, carry, borrow, tmp[n];
+    size_t i;
+
+    while (count--) {
+        for (carry=0, i=0; i<n; i++) {
+            limb_t a_i = a[i];
+            tmp[i] = a_i<<1 | carry;
+            carry = a_i>>(LIMB_T_BITS-1);
+        }
+
+        for (borrow=0, i=0; i<n; i++) {
+            limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+            ret[i] = (limb_t)limbx;
+            borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+        }
+
+        mask = carry - borrow;
+
+        for(i=0; i<n; i++)
+            ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
+
+        a = ret;
+    }
+}
+
+#define LSHIFT_MOD_IMPL(bits) \
+inline void lshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \
+                              const vec##bits p) \
+{   lshift_mod_n(ret, a, count, p, NLIMBS(bits));   }
+
+LSHIFT_MOD_IMPL(256)
+LSHIFT_MOD_IMPL(384)
+
+static void cneg_mod_n(limb_t ret[], const limb_t a[], bool_t flag,
+                       const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t borrow, mask, tmp[n];
+    size_t i;
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = p[i] - (a[i] + (llimb_t)borrow);
+        tmp[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    flag &= vec_is_zero(a, sizeof(tmp)) ^ 1;
+    mask = (limb_t)0 - flag;
+
+    for(i=0; i<n; i++)
+        ret[i] = (a[i] & ~mask) | (tmp[i] & mask);
+}
+
+#define CNEG_MOD_IMPL(bits) \
+inline void cneg_mod_##bits(vec##bits ret, const vec##bits a, bool_t flag, \
+                            const vec##bits p) \
+{   cneg_mod_n(ret, a, flag, p, NLIMBS(bits));   }
+
+CNEG_MOD_IMPL(256)
+CNEG_MOD_IMPL(384)
+
+static limb_t check_mod_n(const byte a[], const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t borrow, ai, acc;
+    size_t i, j;
+
+    for (acc=borrow=0, i=0; i<n; i++) {
+        for (ai=0, j=0; j<8*sizeof(limb_t); j+=8)
+            ai |= (limb_t)(*a++) << j;
+        acc |= ai;
+        limbx = ai - (p[i] + (llimb_t)borrow);
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    return borrow & (is_zero(acc) ^ 1);
+}
+
+#define CHECK_MOD_IMPL(bits) \
+inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \
+{   return check_mod_n(a, p, NLIMBS(bits));   }
+
+CHECK_MOD_IMPL(256)
+
+static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[],
+                                            const limb_t p[], size_t n)
+{
+    limb_t ret_[n], a_[n], b_[n], zero;
+
+    limbs_from_le_bytes(a_, a, sizeof(a_));
+    limbs_from_le_bytes(b_, b, sizeof(b_));
+
+    add_mod_n(ret_, a_, b_, p, n);
+    zero = vec_is_zero(ret_, sizeof(ret_));
+
+    le_bytes_from_limbs(ret, ret_, sizeof(ret_));
+
+    return zero^1;
+}
+
+#define ADD_N_CHECK_MOD_IMPL(bits) \
+inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \
+                                     const pow##bits b, const vec##bits p) \
+{   return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits));   }
+
+ADD_N_CHECK_MOD_IMPL(256)
+
+static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[],
+                                            const limb_t p[], size_t n)
+{
+    limb_t ret_[n], a_[n], b_[n], zero;
+
+    limbs_from_le_bytes(a_, a, sizeof(a_));
+    limbs_from_le_bytes(b_, b, sizeof(b_));
+
+    sub_mod_n(ret_, a_, b_, p, n);
+    zero = vec_is_zero(ret_, sizeof(ret_));
+
+    le_bytes_from_limbs(ret, ret_, sizeof(ret_));
+
+    return zero^1;
+}
+
+#define SUB_N_CHECK_MOD_IMPL(bits) \
+inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \
+                                     const pow##bits b, const vec##bits p) \
+{   return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits));   }
+
+SUB_N_CHECK_MOD_IMPL(256)
+
+static void from_mont_n(limb_t ret[], const limb_t a[],
+                        const limb_t p[], limb_t n0, size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, borrow, mx, hi, tmp[n];
+    size_t i, j;
+
+    for (j=0; j<n; j++) {
+        mx = n0*a[0];
+        limbx = (mx * (llimb_t)p[0]) + a[0];
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+        for (i=1; i<n; i++) {
+            limbx = (mx * (llimb_t)p[i] + hi) + a[i];
+            tmp[i-1] = (limb_t)limbx;
+            hi = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+        tmp[i-1] = hi;
+        a = tmp;
+    }
+
+    /* this is needed only if input can be non-fully-reduced */
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        ret[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = 0 - borrow;
+
+    for(i=0; i<n; i++)
+        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
+}
+
+#define FROM_MONT_IMPL(bits) \
+inline void from_mont_##bits(vec##bits ret, const vec##bits a, \
+                             const vec##bits p, limb_t n0) \
+{   from_mont_n(ret, a, p, n0, NLIMBS(bits));   }
+
+FROM_MONT_IMPL(256)
+FROM_MONT_IMPL(384)
+
+static void redc_mont_n(limb_t ret[], const limb_t a[],
+                        const limb_t p[], limb_t n0, size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, carry, borrow, mx, hi, tmp[n];
+    const limb_t *b = a;
+    size_t i, j;
+
+    for (j=0; j<n; j++) {
+        mx = n0*b[0];
+        limbx = (mx * (llimb_t)p[0]) + b[0];
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+        for (i=1; i<n; i++) {
+            limbx = (mx * (llimb_t)p[i] + hi) + b[i];
+            tmp[i-1] = (limb_t)limbx;
+            hi = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+        tmp[i-1] = hi;
+        b = tmp;
+    }
+
+    for (carry=0, i=0; i<n; i++) {
+        limbx = a[n+i] + (tmp[i] + (llimb_t)carry);
+        tmp[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        ret[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = carry - borrow;
+
+    for(i=0; i<n; i++)
+        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
+}
+
+#define REDC_MONT_IMPL(bits, bits2) \
+inline void redc_mont_##bits(vec##bits ret, const vec##bits2 a, \
+                             const vec##bits p, limb_t n0) \
+{   redc_mont_n(ret, a, p, n0, NLIMBS(bits));   }
+
+REDC_MONT_IMPL(256, 512)
+REDC_MONT_IMPL(384, 768)
+
+static void rshift_mod_n(limb_t ret[], const limb_t a[], size_t count,
+                         const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t mask, carry, limb, next;
+    size_t i;
+
+    while (count--) {
+        mask = 0 - (a[0] & 1);
+        for (carry=0, i=0; i<n; i++) {
+            limbx = a[i] + ((p[i]&mask) + (llimb_t)carry);
+            ret[i] = (limb_t)limbx;
+            carry = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+
+        for (next=ret[0], i=0; i<n-1; i++) {
+            limb = next >> 1;
+            next = ret[i+1];
+            ret[i] = limb | next << (LIMB_T_BITS-1);
+        }
+        ret[i] = next >> 1 | carry << (LIMB_T_BITS-1);
+
+        a = ret;
+    }
+}
+
+#define RSHIFT_MOD_IMPL(bits) \
+inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \
+                              const vec##bits p) \
+{   rshift_mod_n(ret, a, count, p, NLIMBS(bits));   }
+
+RSHIFT_MOD_IMPL(256)
+RSHIFT_MOD_IMPL(384)
+
+#define DIV_BY_2_MOD_IMPL(bits) \
+inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \
+                                const vec##bits p) \
+{   rshift_mod_n(ret, a, 1, p, NLIMBS(bits));   }
+
+DIV_BY_2_MOD_IMPL(384)
+
+static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n)
+{
+    llimb_t limbx;
+    limb_t carry, borrow, ret, tmp[n];
+    size_t i;
+
+    ret = a[0] & 1; /* parity */
+
+    for (carry=0, i=0; i<n; i++) {
+        limb_t a_i = a[i];
+        tmp[i] = a_i<<1 | carry;
+        carry = a_i>>(LIMB_T_BITS-1);
+    }
+
+    for (borrow=0, i=0; i<n; i++) {
+        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    ret |= ((carry - borrow) & 2) ^ 2;
+
+    return ret;
+}
+
+inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p)
+{   return sgn0_pty_mod_n(a, p, NLIMBS(384));   }
+
+inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0)
+{
+    vec384 tmp;
+
+    from_mont_n(tmp, a, p, n0, NLIMBS(384));
+
+    return sgn0_pty_mod_n(tmp, p, NLIMBS(384));
+}
+
+inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p)
+{
+    limb_t re, im, sign, prty;
+
+    re = sgn0_pty_mod_n(a[0], p, NLIMBS(384));
+    im = sgn0_pty_mod_n(a[1], p, NLIMBS(384));
+
+    /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */
+    sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384));
+    sign = (re & sign) | (im & ~sign);
+
+    /* a->re==0 ? prty(a->im) : prty(a->re) */
+    prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384));
+    prty = (im & prty) | (re & ~prty);
+
+    return (sign & 2) | (prty & 1);
+}
+
+inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0)
+{
+    vec384x tmp;
+
+    from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384));
+    from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384));
+
+    return sgn0_pty_mod_384x(tmp, p);
+}
+
+void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
+                          const vec384 p, limb_t n0)
+{
+    vec384 aa, bb, cc;
+
+    add_mod_n(aa, a[0], a[1], p, NLIMBS(384));
+    add_mod_n(bb, b[0], b[1], p, NLIMBS(384));
+    mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384));
+    mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384));
+    mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384));
+    sub_mod_n(ret[0], aa, cc, p, NLIMBS(384));
+    sub_mod_n(ret[1], bb, aa, p, NLIMBS(384));
+    sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384));
+}
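+
+/* The identity behind the routine above (Karatsuba over Fp2, with
+ * i^2 = -1):
+ *   (a0 + a1*i)*(b0 + b1*i) = (a0*b0 - a1*b1)
+ *                           + ((a0 + a1)*(b0 + b1) - a0*b0 - a1*b1)*i
+ * which is exactly the aa, cc and bb terms computed above.
+ */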
+
+/*
+ * mul_mont_n without the final conditional subtraction. This requires the
+ * modulus to be at least one bit short of full limb capacity, which in
+ * turn means that there are no carries to handle between iterations...
+ */
+static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[],
+                              const limb_t p[], limb_t n0, size_t n)
+{
+    llimb_t limbx;
+    limb_t mx, hi, tmp[n+1];
+    size_t i, j;
+
+    for (mx=b[0], hi=0, i=0; i<n; i++) {
+        limbx = (mx * (llimb_t)a[i]) + hi;
+        tmp[i] = (limb_t)limbx;
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+    mx = n0*tmp[0];
+    tmp[i] = hi;
+
+    for (j=0; ; ) {
+        limbx = (mx * (llimb_t)p[0]) + tmp[0];
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+        for (i=1; i<n; i++) {
+            limbx = (mx * (llimb_t)p[i] + hi) + tmp[i];
+            tmp[i-1] = (limb_t)limbx;
+            hi = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+        tmp[i-1] = tmp[i] + hi;
+
+        if (++j==n)
+            break;
+
+        for (mx=b[j], hi=0, i=0; i<n; i++) {
+            limbx = (mx * (llimb_t)a[i] + hi) + tmp[i];
+            tmp[i] = (limb_t)limbx;
+            hi = (limb_t)(limbx >> LIMB_T_BITS);
+        }
+        mx = n0*tmp[0];
+        tmp[i] = hi;
+    }
+
+    vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t));
+}
+
+void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
+                        const vec384 p, limb_t n0, const vec384 b)
+{
+    while(count--) {
+        mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384));
+        a = ret;
+    }
+    mul_mont_n(ret, ret, b, p, n0, NLIMBS(384));
+}
+
+void sqr_mont_382x(vec384x ret, const vec384x a,
+                          const vec384 p, limb_t n0)
+{
+    llimb_t limbx;
+    limb_t mask, carry, borrow;
+    size_t i;
+    vec384 t0, t1;
+
+    /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */
+    for (carry=0, i=0; i<NLIMBS(384); i++) {
+        limbx = a[0][i] + (a[1][i] + (llimb_t)carry);
+        t0[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */
+    for (borrow=0, i=0; i<NLIMBS(384); i++) {
+        limbx = a[0][i] - (a[1][i] + (llimb_t)borrow);
+        t1[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+    mask = 0 - borrow;
+
+    /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */
+    mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));
+
+    /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */
+    for (carry=0, i=0; i<NLIMBS(384); i++) {
+        limb_t a_i = ret[1][i];
+        ret[1][i] = a_i<<1 | carry;
+        carry = a_i>>(LIMB_T_BITS-1);
+    }
+
+    /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */
+    mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384));
+
+    /* account for t1's sign... */
+    for (borrow=0, i=0; i<NLIMBS(384); i++) {
+        limbx = ret[0][i] - ((t0[i] & mask) + (llimb_t)borrow);
+        ret[0][i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+    mask = 0 - borrow;
+    for (carry=0, i=0; i<NLIMBS(384); i++) {
+        limbx = ret[0][i] + ((p[i] & mask) + (llimb_t)carry);
+        ret[0][i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+}
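+
+/* The identity used above (complex squaring with i^2 = -1):
+ *   (a0 + a1*i)^2 = (a0 + a1)*(a0 - a1) + (2*a0*a1)*i
+ * the extra carry/borrow juggling merely compensates for t0 = a0 + a1
+ * and t1 = a0 - a1 having been formed without the usual modular
+ * corrections.
+ */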
+
+#define MSB(x) ((x) >> (LIMB_T_BITS-1))
+
+static size_t num_bits(limb_t l)
+{
+    limb_t x, mask;
+    size_t bits = is_zero(l) ^ 1;
+
+    if (sizeof(limb_t) == 8) {
+        x = l >> (32 & (8*sizeof(limb_t)-1));
+        mask = 0 - MSB(0 - x);
+        bits += 32 & mask;
+        l ^= (x ^ l) & mask;
+    }
+
+    x = l >> 16;
+    mask = 0 - MSB(0 - x);
+    bits += 16 & mask;
+    l ^= (x ^ l) & mask;
+
+    x = l >> 8;
+    mask = 0 - MSB(0 - x);
+    bits += 8 & mask;
+    l ^= (x ^ l) & mask;
+
+    x = l >> 4;
+    mask = 0 - MSB(0 - x);
+    bits += 4 & mask;
+    l ^= (x ^ l) & mask;
+
+    x = l >> 2;
+    mask = 0 - MSB(0 - x);
+    bits += 2 & mask;
+    l ^= (x ^ l) & mask;
+
+    bits += l >> 1;
+
+    return bits;
+}
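+
+/* num_bits() above is a branchless bit-length, e.g. num_bits(0) = 0,
+ * num_bits(1) = 1, num_bits(0x80) = 8, and a set top bit yields
+ * LIMB_T_BITS.
+ */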
+
+#if defined(__clang_major__) && __clang_major__>7
+__attribute__((optnone))
+#endif
+static limb_t lshift_2(limb_t hi, limb_t lo, size_t l)
+{
+    size_t r = LIMB_T_BITS - l;
+    limb_t mask = 0 - (is_zero(l)^1);
+    return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1)));
+}
+
+/*
+ * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1.
+ */
+static void ab_approximation_n(limb_t a_[2], const limb_t a[],
+                               limb_t b_[2], const limb_t b[], size_t n)
+{
+    limb_t a_hi, a_lo, b_hi, b_lo, mask;
+    size_t i;
+
+    i = n-1;
+    a_hi = a[i],    a_lo = a[i-1];
+    b_hi = b[i],    b_lo = b[i-1];
+    for (i--; --i;) {
+        mask = 0 - is_zero(a_hi | b_hi);
+        a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi;
+        b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi;
+        a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo;
+        b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo;
+    }
+    i = LIMB_T_BITS - num_bits(a_hi | b_hi);
+    /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */
+
+    a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i);
+    b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i);
+}
+
+typedef struct { limb_t f0, g0, f1, g1; } factors;
+
+static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2],
+                         size_t n)
+{
+    llimb_t limbx;
+    limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1;
+    limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;
+
+    a_lo = a_[0], a_hi = a_[1];
+    b_lo = b_[0], b_hi = b_[1];
+
+    while(n--) {
+        odd = 0 - (a_lo&1);
+
+        /* a_ -= b_ if a_ is odd */
+        t_lo = a_lo, t_hi = a_hi;
+        limbx = a_lo - (llimb_t)(b_lo & odd);
+        a_lo = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+        limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow);
+        a_hi = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS);
+
+        /* negate a_-b_ if it borrowed */
+        a_lo ^= borrow;
+        a_hi ^= borrow;
+        limbx = a_lo + (llimb_t)(borrow & 1);
+        a_lo = (limb_t)limbx;
+        a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1;
+
+        /* b_=a_ if a_-b_ borrowed */
+        b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
+        b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;
+
+        /* exchange f0 and f1 if a_-b_ borrowed */
+        xorm = (f0 ^ f1) & borrow;
+        f0 ^= xorm;
+        f1 ^= xorm;
+
+        /* exchange g0 and g1 if a_-b_ borrowed */
+        xorm = (g0 ^ g1) & borrow;
+        g0 ^= xorm;
+        g1 ^= xorm;
+
+        /* subtract if a_ was odd */
+        f0 -= f1 & odd;
+        g0 -= g1 & odd;
+
+        f1 <<= 1;
+        g1 <<= 1;
+        a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1);
+        a_hi >>= 1;
+    }
+
+    fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1;
+}
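+
+/* Invariant maintained by the loop above (with the factors treated as
+ * signed): after |n| iterations
+ *   f0*a_in + g0*b_in == a_out * 2^n
+ *   f1*a_in + g1*b_in == b_out * 2^n
+ * which is why smul_n_shift_n() below applies the factors to the
+ * full-width values and shifts the results right by LIMB_T_BITS-2.
+ */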
+
+static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n)
+{
+    llimb_t limbx = 0;
+    limb_t carry;
+    size_t i;
+
+    for (carry=neg&1, i=0; i<n; i++) {
+        limbx = (llimb_t)(a[i] ^ neg) + carry;
+        ret[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    return 0 - MSB((limb_t)limbx);
+}
+
+static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n)
+{
+    llimb_t limbx;
+    limb_t carry;
+    size_t i;
+
+    for (carry=0, i=0; i<n; i++) {
+        limbx = a[i] + (b[i] + (llimb_t)carry);
+        ret[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    return carry;
+}
+
+static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n)
+{
+    llimb_t limbx;
+    limb_t hi;
+    size_t i;
+
+    for (hi=0, i=0; i<n; i++) {
+        limbx = (b * (llimb_t)a[i]) + hi;
+        ret[i] = (limb_t)limbx;
+        hi = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+
+    return hi;
+}
+
+static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_,
+                                           const limb_t b[], limb_t *g_,
+                                           size_t n)
+{
+    limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi;
+    size_t i;
+
+    /* |a|*|f_| */
+    f = *f_;
+    neg = 0 - MSB(f);
+    f = (f ^ neg) - neg;            /* ensure |f| is positive */
+    (void)cneg_n(a_, a, neg, n);
+    hi = umul_n(a_, a_, f, n);
+    a_[n] = hi - (f & neg);
+
+    /* |b|*|g_| */
+    g = *g_;
+    neg = 0 - MSB(g);
+    g = (g ^ neg) - neg;            /* ensure |g| is positive */
+    (void)cneg_n(b_, b, neg, n);
+    hi = umul_n(b_, b_, g, n);
+    b_[n] = hi - (g & neg);
+
+    /* |a|*|f_| + |b|*|g_| */
+    (void)add_n(a_, a_, b_, n+1);
+
+    /* (|a|*|f_| + |b|*|g_|) >> k */
+    for (carry=a_[0], i=0; i<n; i++) {
+        hi = carry >> (LIMB_T_BITS-2);
+        carry = a_[i+1];
+        ret[i] = hi | (carry << 2);
+    }
+
+    /* ensure result is non-negative, fix up |f_| and |g_| accordingly */
+    neg = 0 - MSB(carry);
+    *f_ = (*f_ ^ neg) - neg;
+    *g_ = (*g_ ^ neg) - neg;
+    (void)cneg_n(ret, ret, neg, n);
+
+    return neg;
+}
+
+static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f,
+                                    const limb_t v[], limb_t g, size_t n)
+{
+    limb_t u_[n], v_[n], neg, hi;
+
+    /* |u|*|f_| */
+    neg = 0 - MSB(f);
+    f = (f ^ neg) - neg;            /* ensure |f| is positive */
+    neg = cneg_n(u_, u, neg, n);
+    hi = umul_n(u_, u_, f, n) - (f&neg);
+
+    /* |v|*|g_| */
+    neg = 0 - MSB(g);
+    g = (g ^ neg) - neg;            /* ensure |g| is positive */
+    neg = cneg_n(v_, v, neg, n);
+    hi += umul_n(v_, v_, g, n) - (g&neg);
+
+    /* |u|*|f_| + |v|*|g_| */
+    hi += add_n(ret, u_, v_, n);
+
+    return hi;
+}
+
+static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[],
+                             const limb_t mod[], const limb_t modx[], size_t n)
+{
+    llimb_t limbx;
+    limb_t a[n], b[n], u[2*n], v[2*n], t[2*n];
+    limb_t a_[2], b_[2], sign, carry, top;
+    factors fg;
+    size_t i;
+
+    vec_copy(a, inp, sizeof(a));
+    vec_copy(b, mod, sizeof(b));
+    vec_zero(u, sizeof(u)); u[0] = 1;
+    vec_zero(v, sizeof(v));
+
+    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
+        ab_approximation_n(a_, a, b_, b, n);
+        inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2);
+        (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
+        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
+        vec_copy(a, t, sizeof(a));
+        smul_2n(t, u, fg.f0, v, fg.g0, 2*n);
+        smul_2n(v, u, fg.f1, v, fg.g1, 2*n);
+        vec_copy(u, t, sizeof(u));
+    }
+
+    inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));
+    top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n);
+
+    sign = 0 - MSB(top);    /* top is 1, 0 or -1 */
+    for (carry=0, i=0; i<n; i++) {
+        limbx = ret[n+i] + ((modx[i] & sign) + (llimb_t)carry);
+        ret[n+i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+    top += carry;
+    sign = 0 - top;         /* top is 1, 0 or -1 */
+    top |= sign;
+    for (i=0; i<n; i++)
+        a[i] = modx[i] & top;
+    (void)cneg_n(a, a, 0 - MSB(sign), n);
+    add_n(ret+n, ret+n, a, n);
+}
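+
+/* A rough sketch of what the routine above produces (as read from the
+ * code, see eprint 2020/972): the outer loop performs 2*n*LIMB_T_BITS
+ * divsteps in chunks of LIMB_T_BITS-2, after which the double-width
+ * |ret| holds the modular inverse of |inp| scaled by a fixed power of
+ * two; the tail conditionally adds |modx| to fold the signed top half
+ * back into range.
+ */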
+
+#define CT_INVERSE_MOD_IMPL(bits) \
+inline void ct_inverse_mod_##bits(vec##bits ret, const vec##bits inp, \
+                                  const vec##bits mod, const vec##bits modx) \
+{   ct_inverse_mod_n(ret, inp, mod, modx, NLIMBS(bits));   }
+
+CT_INVERSE_MOD_IMPL(256)
+CT_INVERSE_MOD_IMPL(384)
+
+/*
+ * Copy of inner_loop_n above, but with |L| updates.
+ */
+static limb_t legendre_loop_n(limb_t L, factors *fg, const limb_t a_[2],
+                              const limb_t b_[2], size_t n)
+{
+    llimb_t limbx;
+    limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1;
+    limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;
+
+    a_lo = a_[0], a_hi = a_[1];
+    b_lo = b_[0], b_hi = b_[1];
+
+    while(n--) {
+        odd = 0 - (a_lo&1);
+
+        /* a_ -= b_ if a_ is odd */
+        t_lo = a_lo, t_hi = a_hi;
+        limbx = a_lo - (llimb_t)(b_lo & odd);
+        a_lo = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+        limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow);
+        a_hi = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS);
+
+        L += ((t_lo & b_lo) >> 1) & borrow;
+
+        /* negate a_-b_ if it borrowed */
+        a_lo ^= borrow;
+        a_hi ^= borrow;
+        limbx = a_lo + (llimb_t)(borrow & 1);
+        a_lo = (limb_t)limbx;
+        a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1;
+
+        /* b_=a_ if a_-b_ borrowed */
+        b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
+        b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;
+
+        /* exchange f0 and f1 if a_-b_ borrowed */
+        xorm = (f0 ^ f1) & borrow;
+        f0 ^= xorm;
+        f1 ^= xorm;
+
+        /* exchange g0 and g1 if a_-b_ borrowed */
+        xorm = (g0 ^ g1) & borrow;
+        g0 ^= xorm;
+        g1 ^= xorm;
+
+        /* subtract if a_ was odd */
+        f0 -= f1 & odd;
+        g0 -= g1 & odd;
+
+        f1 <<= 1;
+        g1 <<= 1;
+        a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1);
+        a_hi >>= 1;
+
+        L += (b_lo + 2) >> 2;
+    }
+
+    fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1;
+
+    return L;
+}
+
+static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n)
+{
+    limb_t a[n], b[n], t[n];
+    limb_t a_[2], b_[2], neg, L = 0;
+    factors fg;
+    size_t i;
+
+    vec_copy(a, inp, sizeof(a));
+    vec_copy(b, mod, sizeof(b));
+
+    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
+        ab_approximation_n(a_, a, b_, b, n);
+        L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2);
+        neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
+        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
+        vec_copy(a, t, sizeof(a));
+        L += (b[0] >> 1) & neg;
+    }
+
+    L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));
+
+    return (L & 1) ^ 1;
+}
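+
+/* In other words (a hedged summary): the divstep machinery is reused to
+ * track the Legendre symbol, with |L| accumulating the
+ * quadratic-reciprocity sign flips, so the function returns 1 when
+ * |inp| is a quadratic residue modulo |mod| and 0 otherwise.
+ */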
+
+#define CT_IS_SQR_MOD_IMPL(bits) \
+inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \
+                                      const vec##bits mod) \
+{   return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits));   }
+
+CT_IS_SQR_MOD_IMPL(384)
+
+/*
+ * |div_top| points at the two most significant limbs of the dividend,
+ * |d_hi| and |d_lo| are the two most significant limbs of the divisor.
+ * If the divisor is only one limb, it is to be passed in |d_hi| with
+ * zero in |d_lo|. The divisor is required to be "bitwise left-aligned,"
+ * and the dividend's top limbs must be no larger than the divisor's.
+ * The latter limitation can be problematic in the first iteration of
+ * multi-precision division, where in the most general case the
+ * condition would have to be "smaller."
+ * The subroutine considers four limbs, two of which are "overlapping,"
+ * hence the name... Another way to look at it is to think of the pair
+ * of the dividend's limbs being suffixed with a zero:
+ *   +-------+-------+-------+
+ * R |       |       |   0   |
+ *   +-------+-------+-------+
+ *           +-------+-------+
+ * D         |       |       |
+ *           +-------+-------+
+ */
+limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi)
+{
+    llimb_t Rx;
+    limb_t r_lo = div_top[0], r_hi = div_top[1];
+    limb_t Q = 0, mask, borrow, rx;
+    size_t i;
+
+    for (i = 0; i < LIMB_T_BITS; i++) {
+        /* "borrow, Rx = R - D" */
+        Rx = (llimb_t)r_lo - d_lo;
+        rx = (limb_t)Rx;
+        borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
+        Rx = r_hi - (d_hi + (llimb_t)borrow);
+        borrow = (limb_t)(Rx >> LIMB_T_BITS);
+
+        /* "if (R >= D) R -= D" */
+        r_lo = ((r_lo ^ rx) & borrow) ^ rx;
+        rx = (limb_t)Rx;
+        r_hi = ((r_hi ^ rx) & borrow) ^ rx;
+
+        Q <<= 1;
+        Q |= ~borrow & 1;
+
+        /* "D >>= 1" */
+        d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1);
+        d_hi >>= 1;
+    }
+
+    mask = 0 - MSB(Q);  /* does it overflow? */
+
+    /* "borrow, Rx = R - D" */
+    Rx = (llimb_t)r_lo - d_lo;
+    rx = (limb_t)Rx;
+    borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
+    Rx = r_hi - (d_hi + (llimb_t)borrow);
+    borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
+
+    Q <<= 1;
+    Q |= borrow ^ 1;
+
+    return (Q | mask);
+}
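+
+/* A tiny sanity example for the routine above, assuming 64-bit limbs:
+ * with div_top = {0, 1} the dividend-with-appended-zero is 2^128, and
+ * with d_hi = 2^63, d_lo = 0 the divisor is 2^127, so the function
+ * returns 2; a quotient that does not fit in LIMB_T_BITS+1 bits
+ * saturates to all ones instead.
+ */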
+
+static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor,
+                                          limb_t quotient, size_t n)
+{
+    llimb_t limbx;
+    limb_t tmp[n+1], carry, mask, borrow;
+    size_t i;
+
+    /* divisor*quotient */
+    for (carry=0, i=0; i<n; i++) {
+        limbx = (quotient * (llimb_t)divisor[i]) + carry;
+        tmp[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS);
+    }
+    tmp[i] = carry;
+
+    /* remainder = dividend - divisor*quotient */
+    for (borrow=0, i=0; i<=n; i++) {
+        limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow);
+        tmp[i] = (limb_t)limbx;
+        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    mask = 0 - borrow;
+
+    /* if quotient was off by one, add divisor to the remainder */
+    for (carry=0, i=0; i<n; i++) {
+        limbx = tmp[i] + ((divisor[i] & mask) + (llimb_t)carry);
+        div_rem[i] = (limb_t)limbx;
+        carry = (limb_t)(limbx >> LIMB_T_BITS) & 1;
+    }
+
+    return (div_rem[i] = quotient + mask);
+}
+
+inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor,
+                                            limb_t quotient)
+{   return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128));   }
+
+inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor,
+                                           limb_t quotient)
+{   return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64));   }
+
+/*
+ * Unlock reference implementations in vect.c
+ */
+#define mul_by_8_mod_384 mul_by_8_mod_384
+#define mul_by_8_mod_384x mul_by_8_mod_384x
+#define mul_by_3_mod_384x mul_by_3_mod_384x
+#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x
+#define add_mod_384x add_mod_384x
+#define sub_mod_384x sub_mod_384x
+#define lshift_mod_384x lshift_mod_384x
+#define sqr_mont_384x sqr_mont_384x
+
+inline void vec_prefetch(const void *ptr, size_t len)
+{   (void)ptr; (void)len;   }
+
+/*
+ * SHA-256
+ */
+#define ROTR(x,n)	((x)>>n | (x)<<(32-n))
+#define Sigma0(x)	(ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22))
+#define Sigma1(x)	(ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25))
+#define sigma0(x)	(ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3))
+#define sigma1(x)	(ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10))
+#define Ch(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))
+#define Maj(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
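+
+/* These are the standard FIPS 180-4 SHA-256 building blocks; the
+ * compression function below follows the specification directly,
+ * keeping a 16-word rolling message schedule in X[].
+ */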
+
+void blst_sha256_block_data_order(unsigned int *v, const void *inp,
+                                                   size_t blocks)
+{
+    static const unsigned int K256[64] = {
+        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+    };
+    unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2;
+    const unsigned char *data = inp;
+    size_t round;
+
+    a = v[0];
+    b = v[1];
+    c = v[2];
+    d = v[3];
+    e = v[4];
+    f = v[5];
+    g = v[6];
+    h = v[7];
+
+    while (blocks--) {
+        for (round = 0; round < 16; round++) {
+            l  = (unsigned int)data[0] << 24;
+            l |= (unsigned int)data[1] << 16;
+            l |= (unsigned int)data[2] << 8;
+            l |= (unsigned int)data[3];
+            data += 4;
+            T1 = X[round] = l;
+            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round];
+            T2 = Sigma0(a) + Maj(a, b, c);
+            h = g;
+            g = f;
+            f = e;
+            e = d + T1;
+            d = c;
+            c = b;
+            b = a;
+            a = T1 + T2;
+        }
+
+        for (; round < 64; round++) {
+            s0 = X[(round + 1) & 0x0f];
+            s0 = sigma0(s0);
+            s1 = X[(round + 14) & 0x0f];
+            s1 = sigma1(s1);
+
+            T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf];
+            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round];
+            T2 = Sigma0(a) + Maj(a, b, c);
+            h = g;
+            g = f;
+            f = e;
+            e = d + T1;
+            d = c;
+            c = b;
+            b = a;
+            a = T1 + T2;
+        }
+
+        a += v[0]; v[0] = a;
+        b += v[1]; v[1] = b;
+        c += v[2]; v[2] = c;
+        d += v[3]; v[3] = d;
+        e += v[4]; v[4] = e;
+        f += v[5]; v[5] = f;
+        g += v[6]; v[6] = g;
+        h += v[7]; v[7] = h;
+    }
+}
+#undef ROTR
+#undef Sigma0
+#undef Sigma1
+#undef sigma0
+#undef sigma1
+#undef Ch
+#undef Maj
+
+void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8])
+{
+    size_t i;
+
+    for (i=0; i<8; i++)
+        dst[i] = src[i];
+}
+
+void blst_sha256_emit(unsigned char md[32], const unsigned int h[8])
+{
+    size_t i;
+
+    for (i=0; i<8; i++, md+=4) {
+        unsigned int h_i = h[i];
+        md[0] = (unsigned char)(h_i >> 24);
+        md[1] = (unsigned char)(h_i >> 16);
+        md[2] = (unsigned char)(h_i >> 8);
+        md[3] = (unsigned char)h_i;
+    }
+}
+
+void blst_sha256_bcopy(void *dst_, const void *src_, size_t len)
+{
+    unsigned char *dst = dst_;
+    const unsigned char *src = src_;
+    size_t i;
+
+    for (i=0; i<len; i++)
+        dst[i] = src[i];
+}
diff --git a/blst/pairing.c b/blst/pairing.c
new file mode 100644
index 0000000..8d19b98
--- /dev/null
+++ b/blst/pairing.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "point.h"
+#include "fields.h"
+
+/*
+ * Line evaluations from  https://eprint.iacr.org/2010/354.pdf
+ * with a twist moving common expression to line_by_Px2.
+ */
+static void line_add(vec384fp6 line, POINTonE2 *T, const POINTonE2 *R,
+                                                   const POINTonE2_affine *Q)
+{
+    vec384x Z1Z1, U2, S2, H, HH, I, J, V;
+#if 1
+# define r line[1]
+#else
+    vec384x r;
+#endif
+
+    /*
+     * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
+     * with XYZ3 being |T|, XYZ1 - |R|, XY2 - |Q|, i.e. Q is affine
+     */
+    sqr_fp2(Z1Z1, R->Z);                /* Z1Z1 = Z1^2 */
+    mul_fp2(U2, Q->X, Z1Z1);            /* U2 = X2*Z1Z1 */
+
+    mul_fp2(S2, Q->Y, R->Z);
+    mul_fp2(S2, S2, Z1Z1);              /* S2 = Y2*Z1*Z1Z1 */
+
+    sub_fp2(H, U2, R->X);               /* H = U2-X1 */
+
+    sqr_fp2(HH, H);                     /* HH = H^2 */
+    add_fp2(I, HH, HH);
+    add_fp2(I, I, I);                   /* I = 4*HH */
+
+    mul_fp2(J, H, I);                   /* J = H*I */
+
+    sub_fp2(r, S2, R->Y);
+    add_fp2(r, r, r);                   /* r = 2*(S2-Y1) */
+
+    mul_fp2(V, R->X, I);                /* V = X1*I */
+
+    sqr_fp2(T->X, r);
+    sub_fp2(T->X, T->X, J);
+    sub_fp2(T->X, T->X, V);
+    sub_fp2(T->X, T->X, V);             /* X3 = r^2-J-2*V */
+
+    mul_fp2(J, J, R->Y);
+    sub_fp2(T->Y, V, T->X);
+    mul_fp2(T->Y, T->Y, r);
+    sub_fp2(T->Y, T->Y, J);
+    sub_fp2(T->Y, T->Y, J);             /* Y3 = r*(V-X3)-2*Y1*J */
+
+    add_fp2(T->Z, R->Z, H);
+    sqr_fp2(T->Z, T->Z);
+    sub_fp2(T->Z, T->Z, Z1Z1);
+    sub_fp2(T->Z, T->Z, HH);            /* Z3 = (Z1+H)^2-Z1Z1-HH */
+
+    /*
+     * line evaluation
+     */
+    mul_fp2(I, r, Q->X);
+    mul_fp2(J, Q->Y, T->Z);
+    sub_fp2(I, I, J);
+    add_fp2(line[0], I, I);          /* 2*(r*X2 - Y2*Z3) */
+#ifdef r
+# undef r
+#else
+    vec_copy(line[1], r, sizeof(r));
+#endif
+    vec_copy(line[2], T->Z, sizeof(T->Z));
+}
+
+static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q)
+{
+    vec384x ZZ, A, B, C, D, E, F;
+
+    /*
+     * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr
+     */
+    sqr_fp2(A, Q->X);                   /* A = X1^2 */
+    sqr_fp2(B, Q->Y);                   /* B = Y1^2 */
+    sqr_fp2(ZZ, Q->Z);                  /* ZZ = Z1^2 */
+    sqr_fp2(C, B);                      /* C = B^2 */
+
+    add_fp2(D, Q->X, B);                /* X1+B */
+    sqr_fp2(D, D);                      /* (X1+B)^2 */
+    sub_fp2(D, D, A);                   /* (X1+B)^2-A */
+    sub_fp2(D, D, C);                   /* (X1+B)^2-A-C */
+    add_fp2(D, D, D);                   /* D = 2*((X1+B)^2-A-C) */
+
+    mul_by_3_fp2(E, A);                 /* E = 3*A */
+    sqr_fp2(F, E);                      /* F = E^2 */
+
+    add_fp2(line[0], E, Q->X);          /* 3*A+X1 for line evaluation */
+
+    sub_fp2(T->X, F, D);
+    sub_fp2(T->X, T->X, D);             /* X3 = F-2*D */
+
+    add_fp2(T->Z, Q->Y, Q->Z);
+    sqr_fp2(T->Z, T->Z);
+    sub_fp2(T->Z, T->Z, B);
+    sub_fp2(T->Z, T->Z, ZZ);            /* Z3 = (Y1+Z1)^2-B-ZZ */
+
+    mul_by_8_fp2(C, C);                 /* 8*C */
+    sub_fp2(T->Y, D, T->X);             /* D-X3 */
+    mul_fp2(T->Y, T->Y, E);             /* E*(D-X3) */
+    sub_fp2(T->Y, T->Y, C);             /* Y3 = E*(D-X3)-8*C */
+
+    /*
+     * line evaluation
+     */
+    sqr_fp2(line[0], line[0]);
+    sub_fp2(line[0], line[0], A);
+    sub_fp2(line[0], line[0], F);       /* (3*A+X1)^2 - X1^2 - 9*A^2 */
+    lshift_fp2(B, B, 2);
+    sub_fp2(line[0], line[0], B);       /* 6*X1^3 - 4*Y1^2 */
+
+    mul_fp2(line[1], E, ZZ);            /* 3*X1^2 * Z1^2 */
+
+    mul_fp2(line[2], T->Z, ZZ);         /* Z3 * Z1^2 */
+}
+
+static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2)
+{
+    mul_fp(line[1][0], line[1][0], Px2->X);   /* "b01" *= -2*P->X */
+    mul_fp(line[1][1], line[1][1], Px2->X);
+
+    mul_fp(line[2][0], line[2][0], Px2->Y);   /* "b11" *= 2*P->Y */
+    mul_fp(line[2][1], line[2][1], Px2->Y);
+}
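+/*
+ * Px2 holds (-2*P->X, 2*P->Y), precomputed once per P by the callers below,
+ * so folding it into the line costs only the four Fp multiplications above
+ * instead of re-deriving the P-dependent factors for every line evaluation.
+ */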
+
+#if 0
+static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q,
+                      const POINTonE1_affine *Px2, vec384fp6 line, size_t n)
+{
+    line_add(line, T, T, Q);    line_by_Px2(line, Px2);
+    mul_by_xy00z0_fp12(ret, ret, line);
+    while (n--) {
+        sqr_fp12(ret, ret);
+        line_dbl(line, T, T);   line_by_Px2(line, Px2);
+        mul_by_xy00z0_fp12(ret, ret, line);
+    }
+}
+
+static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P)
+{
+#define Q ((const POINTonE2_affine *)Q)
+    POINTonE2 T[1];
+    POINTonE1_affine Px2[1];
+    vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0"  */
+
+    /* Move common expression from line evaluation to line_by_Px2. */
+    add_fp(Px2->X, P->X, P->X);
+    neg_fp(Px2->X, Px2->X);
+    add_fp(Px2->Y, P->Y, P->Y);
+
+    vec_copy(T->X, Q->X, 2*sizeof(T->X));
+    vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z));
+
+    /* first step is ret = 1^2*line, which is replaced with ret = line  */
+    line_dbl(line, T, T);                       /* 0x2                  */
+    line_by_Px2(line, Px2);
+    vec_zero(ret, sizeof(vec384fp12));
+    vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2));
+    vec_copy(ret[1][1], line[2], sizeof(vec384fp2));
+    add_n_dbl(ret, T, Q, Px2, line, 2);         /* ..0xc                */
+    add_n_dbl(ret, T, Q, Px2, line, 3);         /* ..0x68               */
+    add_n_dbl(ret, T, Q, Px2, line, 9);         /* ..0xd200             */
+    add_n_dbl(ret, T, Q, Px2, line, 32);        /* ..0xd20100000000     */
+    add_n_dbl(ret, T, Q, Px2, line, 16);        /* ..0xd201000000010000 */
+    conjugate_fp12(ret);                /* account for z being negative */
+#undef Q
+}
+#endif
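+/*
+ * The single-pairing Miller loop above is compiled out; blst_miller_loop at
+ * the bottom of this file routes through the n-point variant below with n=1.
+ */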
+
+static void start_dbl_n(vec384fp12 ret, POINTonE2 T[],
+                                        const POINTonE1_affine Px2[], size_t n)
+{
+    size_t i;
+    vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0"  */
+
+    /* first step is ret = 1^2*line, which is replaced with ret = line  */
+    line_dbl(line, T+0, T+0);           line_by_Px2(line, Px2+0);
+    vec_zero(ret, sizeof(vec384fp12));
+    vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2));
+    vec_copy(ret[1][1], line[2], sizeof(vec384fp2));
+
+    for (i = 1; i < n; i++) {
+        line_dbl(line, T+i, T+i);       line_by_Px2(line, Px2+i);
+        mul_by_xy00z0_fp12(ret, ret, line);
+    }
+}
+
+static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[],
+                                        const POINTonE2_affine Q[],
+                                        const POINTonE1_affine Px2[],
+                                        size_t n, size_t k)
+{
+    size_t i;
+    vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0"  */
+
+    for (i = 0; i < n; i++) {
+        line_add(line, T+i, T+i, Q+i);  line_by_Px2(line, Px2+i);
+        mul_by_xy00z0_fp12(ret, ret, line);
+    }
+    while (k--) {
+        sqr_fp12(ret, ret);
+        for (i = 0; i < n; i++) {
+            line_dbl(line, T+i, T+i);   line_by_Px2(line, Px2+i);
+            mul_by_xy00z0_fp12(ret, ret, line);
+        }
+    }
+}
+
+static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[],
+                                          const POINTonE1_affine P[], size_t n)
+{
+#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
+    POINTonE2 *T = alloca(n*sizeof(POINTonE2));
+    POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine));
+#else
+    POINTonE2 T[n];
+    POINTonE1_affine Px2[n];
+#endif
+    size_t i;
+
+    if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) |
+                     vec_is_zero(&P[0], sizeof(P[0]))) ) {
+        /*
+         * Special case of infinite aggregated signature, pair the additive
+         * group's identity with the multiplicative group's identity.
+         */
+        vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12));
+        return;
+    }
+
+    for (i = 0; i < n; i++) {
+        /* Move common expression from line evaluation to line_by_Px2.  */
+        add_fp(Px2[i].X, P[i].X, P[i].X);
+        neg_fp(Px2[i].X, Px2[i].X);
+        add_fp(Px2[i].Y, P[i].Y, P[i].Y);
+
+        vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X));
+        vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z));
+    }
+
+    /* first step is ret = 1^2*line, which is replaced with ret = line  */
+    start_dbl_n(ret, T, Px2, n);                /* 0x2                  */
+    add_n_dbl_n(ret, T, Q, Px2, n, 2);          /* ..0xc                */
+    add_n_dbl_n(ret, T, Q, Px2, n, 3);          /* ..0x68               */
+    add_n_dbl_n(ret, T, Q, Px2, n, 9);          /* ..0xd200             */
+    add_n_dbl_n(ret, T, Q, Px2, n, 32);         /* ..0xd20100000000     */
+    add_n_dbl_n(ret, T, Q, Px2, n, 16);         /* ..0xd201000000010000 */
+    conjugate_fp12(ret);                /* account for z being negative */
+}
+
+static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T,
+                                             const POINTonE2_affine *Q,
+                                             size_t n)
+{
+    line_add(lines++[0], T, T, Q);
+    while (n--)
+        line_dbl(lines++[0], T, T);
+}
+
+static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q)
+{
+    POINTonE2 T[1];
+
+    vec_copy(T->X, Q->X, 2*sizeof(T->X));
+    vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z));
+
+    line_dbl(Qlines[0], T, T);                  /* 0x2                  */
+    pre_add_n_dbl(&Qlines[1],  T, Q, 2);        /* ..0xc                */
+    pre_add_n_dbl(&Qlines[4],  T, Q, 3);        /* ..0x68               */
+    pre_add_n_dbl(&Qlines[8],  T, Q, 9);        /* ..0xd200             */
+    pre_add_n_dbl(&Qlines[18], T, Q, 32);       /* ..0xd20100000000     */
+    pre_add_n_dbl(&Qlines[51], T, Q, 16);       /* ..0xd201000000010000 */
+}
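+/*
+ * 68 = 1 + (1+2) + (1+3) + (1+9) + (1+32) + (1+16): one line evaluation per
+ * doubling/addition in the schedule above, i.e. 63 doublings plus 5
+ * additions driven by |z| = 0xd201000000010000.
+ */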
+
+static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in,
+                                            const POINTonE1_affine *Px2)
+{
+    vec_copy(out[0], in[0], sizeof(out[0]));
+
+    mul_fp(out[1][0], in[1][0], Px2->X);        /* "b01" *= -2*P->X */
+    mul_fp(out[1][1], in[1][1], Px2->X);
+
+    mul_fp(out[2][0], in[2][0], Px2->Y);        /* "b11" *= 2*P->Y */
+    mul_fp(out[2][1], in[2][1], Px2->Y);
+}
+
+static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[],
+                           const POINTonE1_affine *Px2, size_t n)
+{
+    vec384fp6 line;
+
+    post_line_by_Px2(line, lines++[0], Px2);
+    mul_by_xy00z0_fp12(ret, ret, line);
+    while (n--) {
+        sqr_fp12(ret, ret);
+        post_line_by_Px2(line, lines++[0], Px2);
+        mul_by_xy00z0_fp12(ret, ret, line);
+    }
+}
+
+static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68],
+                                              const POINTonE1_affine *P)
+{
+    POINTonE1_affine Px2[1];
+    vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0"  */
+
+    /* Move common expression from line evaluation to line_by_Px2. */
+    add_fp(Px2->X, P->X, P->X);
+    neg_fp(Px2->X, Px2->X);
+    add_fp(Px2->Y, P->Y, P->Y);
+
+    /* first step is ret = 1^2*line, which is replaced with ret = line  */
+    post_line_by_Px2(line, Qlines[0], Px2);     /* 0x2                  */
+    vec_zero(ret, sizeof(vec384fp12));
+    vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2));
+    vec_copy(ret[1][1], line[2], sizeof(vec384fp2));
+    post_add_n_dbl(ret, &Qlines[1],  Px2, 2);   /* ..0xc                */
+    post_add_n_dbl(ret, &Qlines[4],  Px2, 3);   /* ..0x68               */
+    post_add_n_dbl(ret, &Qlines[8],  Px2, 9);   /* ..0xd200             */
+    post_add_n_dbl(ret, &Qlines[18], Px2, 32);  /* ..0xd20100000000     */
+    post_add_n_dbl(ret, &Qlines[51], Px2, 16);  /* ..0xd201000000010000 */
+    conjugate_fp12(ret);                /* account for z being negative */
+}
+
+#ifdef INTERNAL_TESTMODE
+static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q,
+                                            const POINTonE1_affine *P)
+{
+    vec384fp6 lines[68];
+
+    precompute_lines(lines, Q);
+    miller_loop_lines(ret, lines, P);
+}
+#endif
+
+static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n)
+{
+    mul_fp12(ret, ret, a);
+    while (n--)
+        cyclotomic_sqr_fp12(ret, ret);
+}
+
+static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a)
+{
+    cyclotomic_sqr_fp12(ret, a);                /* 0x2                  */
+    mul_n_sqr(ret, a, 2);                       /* ..0xc                */
+    mul_n_sqr(ret, a, 3);                       /* ..0x68               */
+    mul_n_sqr(ret, a, 9);                       /* ..0xd200             */
+    mul_n_sqr(ret, a, 32);                      /* ..0xd20100000000     */
+    mul_n_sqr(ret, a, 16-1);                    /* ..0x6900800000008000 */
+    conjugate_fp12(ret);                /* account for z being negative */
+}
+
+#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a))
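+/*
+ * raise_to_z_div_by_2 walks the bits of |z|/2 = 0x6900800000008000 (the
+ * running exponent is tracked in the comments above); raise_to_z squares
+ * that result once more to reach the full power of |z|. The conjugation at
+ * the end of raise_to_z_div_by_2 accounts for z = -0xd201000000010000 being
+ * negative.
+ */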
+
+/*
+ * Adaptation from <zkcrypto>/pairing/src/bls12_381/mod.rs
+ */
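+/*
+ * The first few operations compute the "easy part" f^((p^6-1)*(p^2+1)):
+ * conjugation is the p^6-power Frobenius, so ret = conj(f)/f = f^(p^6-1),
+ * then ret *= ret^(p^2). The remaining chain is the "hard part", following
+ * the <zkcrypto> adaptation referenced above.
+ */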
+static void final_exp(vec384fp12 ret, const vec384fp12 f)
+{
+    vec384fp12 y0, y1, y2, y3;
+
+    vec_copy(y1, f, sizeof(y1));
+    conjugate_fp12(y1);
+    inverse_fp12(y2, f);
+    mul_fp12(ret, y1, y2);
+    frobenius_map_fp12(y2, ret, 2);
+    mul_fp12(ret, ret, y2);
+
+    cyclotomic_sqr_fp12(y0, ret);
+    raise_to_z(y1, y0);
+    raise_to_z_div_by_2(y2, y1);
+    vec_copy(y3, ret, sizeof(y3));
+    conjugate_fp12(y3);
+    mul_fp12(y1, y1, y3);
+    conjugate_fp12(y1);
+    mul_fp12(y1, y1, y2);
+    raise_to_z(y2, y1);
+    raise_to_z(y3, y2);
+    conjugate_fp12(y1);
+    mul_fp12(y3, y3, y1);
+    conjugate_fp12(y1);
+    frobenius_map_fp12(y1, y1, 3);
+    frobenius_map_fp12(y2, y2, 2);
+    mul_fp12(y1, y1, y2);
+    raise_to_z(y2, y3);
+    mul_fp12(y2, y2, y0);
+    mul_fp12(y2, y2, ret);
+    mul_fp12(y1, y1, y2);
+    frobenius_map_fp12(y2, y3, 1);
+    mul_fp12(ret, y1, y2);
+}
+
+void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q,
+                                      const POINTonE1_affine *P)
+{   miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2,
+                       P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1);
+}
+
+void blst_final_exp(vec384fp12 ret, const vec384fp12 f)
+{   final_exp(ret, f);   }
+
+void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q)
+{   precompute_lines(Qlines, Q);   }
+
+void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68],
+                                            const POINTonE1_affine *P)
+{   miller_loop_lines(ret, Qlines, P);   }
+
+static bool_t is_cyclotomic(const vec384fp12 f)
+{
+    vec384fp12 a, b;
+
+    frobenius_map_fp12(a, f, 2);
+    frobenius_map_fp12(b, a, 2);
+    mul_fp12(b, b, f);
+
+    return vec_is_equal(a, b, sizeof(a));
+}
+
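+/*
+ * is_cyclotomic above checks f^(p^4-p^2+1) == 1, i.e. membership in the
+ * cyclotomic subgroup; blst_fp12_in_group below additionally compares f^p
+ * against f^z. Since p == z (mod r) for BLS12-381, these two checks
+ * together serve as the membership test for the order-r target group of
+ * the pairing.
+ */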
+int blst_fp12_in_group(const vec384fp12 f)
+{
+    vec384fp12 a, b;
+
+    if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f))
+        return 0;
+
+    frobenius_map_fp12(a, f, 1);
+    raise_to_z(b, f);
+
+    return (int)vec_is_equal(a, b, sizeof(a));
+}
diff --git a/blst/point.h b/blst/point.h
new file mode 100644
index 0000000..4d041b0
--- /dev/null
+++ b/blst/point.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_POINT_H__
+#define __BLS12_381_ASM_POINT_H__
+
+#include "vect.h"
+
+#define DECLARE_POINT(ptype, bits) \
+typedef struct { vec##bits X,Y,Z; } ptype; \
+typedef struct { vec##bits X,Y; } ptype##_affine; \
+\
+static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2,	\
+                         const vec##bits a4);				\
+static void ptype##_dadd_affine(ptype *out, const ptype *p1,		\
+                                            const ptype##_affine *p2);	\
+static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2);	\
+static void ptype##_add_affine(ptype *out, const ptype *p1,		\
+                                           const ptype##_affine *p2);	\
+static void ptype##_double(ptype *out, const ptype *p1);		\
+static void ptype##_mult_w5(ptype *out, const ptype *point,		\
+                            const byte *scalar, size_t nbits);		\
+static void ptype##_cneg(ptype *p, limb_t cbit);			\
+static void ptype##_to_affine(ptype##_affine *out, const ptype *in);	\
+static void ptype##_from_Jacobian(ptype *out, const ptype *in);		\
+\
+static inline void ptype##_cswap(ptype *restrict a,			\
+                                 ptype *restrict b, bool_t cbit) {	\
+    vec_cswap(a, b, sizeof(ptype), cbit);				\
+} \
+static inline void ptype##_ccopy(ptype *restrict a,			\
+                                 const ptype *restrict b, bool_t cbit) {\
+    vec_select(a, b, a, sizeof(ptype), cbit);				\
+}
+
+#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \
+typedef struct { vec##bits X,Z; } ptype##xz; \
+\
+static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in);	\
+static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s,		\
+                                  const ptype##xz *p);			\
+static void ptype##xz_ladder_post(ptype *ret,				\
+                                  const ptype##xz *r, const ptype##xz *s, \
+                                  const ptype##xz *p, const vec##bits Y1);\
+\
+static inline void ptype##xz_cswap(ptype##xz *restrict a,		\
+                                   ptype##xz *restrict b, bool_t cbit) {\
+    vec_cswap(a, b, sizeof(ptype##xz), cbit);				\
+}
+
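+/* POINTonE1 uses the vec384 base-field elements (G1 points), POINTonE2 the
+ * quadratic-extension vec384x elements (G2 points); both are Jacobian with
+ * X, Y, Z coordinates. */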
+DECLARE_POINT(POINTonE1, 384)
+
+DECLARE_POINT(POINTonE2, 384x)
+
+#ifdef __GNUC__
+# pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+#endif
diff --git a/blst/rb_tree.c b/blst/rb_tree.c
new file mode 100644
index 0000000..207becd
--- /dev/null
+++ b/blst/rb_tree.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stddef.h>
+
+/*
+ * Red-black tree tailored for the uniqueness test. The number of messages
+ * to be checked is known prior to context initialization; the implementation
+ * is insert-only, and failure is returned if a message is already in the
+ * tree.
+ */
+
+struct node {
+    struct node *leafs[2];
+    const void *data;
+    size_t len_n_colour;    /* len<<1 | colour */
+};
+
+struct rb_tree {
+    struct node *root;
+    size_t n_nodes;
+    struct node nodes[1];
+};
+
+static long bytes_compare(const unsigned char *ptr0, size_t len0,
+                          const unsigned char *ptr1, size_t len1)
+{
+    size_t i, len = len0<len1 ? len0 : len1;
+    long a, b;
+
+    for (i=0; i<len; i++) {
+        if ((a = ptr0[i]) != (b = ptr1[i]))
+            return a - b;
+    }
+
+    return (long)len0 - (long)len1;
+}
+
+#define PAINT_BLACK(p)  ((p)->len_n_colour &= ~(size_t)1)
+#define PAINT_RED(p)    ((p)->len_n_colour |= 1)
+#define IS_RED(p)       ((p)->len_n_colour & 1)
+
+static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len)
+{
+    struct node *nodes[8*sizeof(void *)];   /* visited nodes    */
+    unsigned char dirs[8*sizeof(void *)];   /* taken directions */
+    size_t k = 0;                           /* walked distance  */
+    struct node *p, *y, *z;
+
+    for (p = tree->root; p != NULL; k++) {
+        long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1);
+
+        if (cmp == 0)
+            return 0;   /* already in tree, no insertion */
+
+        /* record the step */
+        nodes[k] = p;
+        p = p->leafs[(dirs[k] = cmp>0)];
+    }
+
+    /* allocate new node */
+    z = &tree->nodes[tree->n_nodes++];
+    z->leafs[0] = z->leafs[1] = NULL;
+    z->data = data;
+    z->len_n_colour = len<<1;
+    PAINT_RED(z);
+
+    /* graft |z| */
+    if (k > 0)
+        nodes[k-1]->leafs[dirs[k-1]] = z;
+    else
+        tree->root = z;
+
+    /* re-balance |tree| */
+    while (k >= 2 && IS_RED(y = nodes[k-1])) {
+        size_t ydir = dirs[k-2];
+        struct node *x = nodes[k-2],        /* |z|'s grandparent    */
+                    *s = x->leafs[ydir^1];  /* |z|'s uncle          */
+
+        if (s != NULL && IS_RED(s)) {
+            PAINT_RED(x);
+            PAINT_BLACK(y);
+            PAINT_BLACK(s);
+            k -= 2;
+        } else {
+            if (dirs[k-1] != ydir) {
+                /*    |        |
+                 *    x        x
+                 *   / \        \
+                 *  y   s -> z   s
+                 *   \      /
+                 *    z    y
+                 *   /      \
+                 *  ?        ?
+                 */
+                struct node *t = y;
+                y = y->leafs[ydir^1];
+                t->leafs[ydir^1] = y->leafs[ydir];
+                y->leafs[ydir] = t;
+            }
+
+            /*      |        |
+             *      x        y
+             *       \      / \
+             *    y   s -> z   x
+             *   / \          / \
+             *  z   ?        ?   s
+             */
+            x->leafs[ydir] = y->leafs[ydir^1];
+            y->leafs[ydir^1] = x;
+
+            PAINT_RED(x);
+            PAINT_BLACK(y);
+
+            if (k > 2)
+                nodes[k-3]->leafs[dirs[k-3]] = y;
+            else
+                tree->root = y;
+
+            break;
+        }
+    }
+
+    PAINT_BLACK(tree->root);
+
+    return 1;
+}
+
+#undef IS_RED
+#undef PAINT_RED
+#undef PAINT_BLACK
+
+size_t blst_uniq_sizeof(size_t n_nodes)
+{   return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1);   }
+
+void blst_uniq_init(struct rb_tree *tree)
+{
+    tree->root = NULL;
+    tree->n_nodes = 0;
+}
+
+int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len)
+{   return (int)rb_tree_insert(tree, data, len);   }
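+
+/*
+ * Illustrative sketch (not part of the upstream sources) of how the
+ * blst_uniq_* entry points are meant to be driven, assuming the caller
+ * allocates the backing storage sized by blst_uniq_sizeof(); n_msgs,
+ * msgs[] and lens[] are placeholders:
+ *
+ *   struct rb_tree *tree = malloc(blst_uniq_sizeof(n_msgs));
+ *   blst_uniq_init(tree);
+ *   for (i = 0; i < n_msgs; i++)
+ *       if (!blst_uniq_test(tree, msgs[i], lens[i]))
+ *           break;                     (duplicate message detected)
+ *   free(tree);
+ */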
diff --git a/blst/recip-addchain.h b/blst/recip-addchain.h
new file mode 100644
index 0000000..e4e436a
--- /dev/null
+++ b/blst/recip-addchain.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * The "magic" number is BLS12_381_P-2. Exponentiation to which yields
+ * reciprocal to input base.
+ *
+ * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785'
+ * https://github.com/kwantam/addchain
+ *
+ * # Bos-Coster (win=4)           :  461 (16) <<<
+ * # Bos-Coster (win=3)           :  464 ( 9)
+ * # Bos-Coster (win=8)           :  469 (35)
+ * # Bos-Coster (win=5)           :  463 (28)
+ * # Bos-Coster (win=9)           :  467 (32)
+ * # Bos-Coster (win=7)           :  462 (27)
+ * # Yacobi                       :  481 (31)
+ * # Bos-Coster (win=10)          :  475 (30)
+ * # Bos-Coster (win=6)           :  463 (32)
+ * # Bos-Coster (win=2)           :  489 ( 5)
+ * # Bergeron-Berstel-Brlek-Duboc :  498 ( 5)
+ */
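+/*
+ * Reading the chain: sqr_n_mul(t[0], t[0], n, t[k]) stands for n squarings
+ * of the accumulator followed by one multiplication by the precomputed
+ * power t[k]; the commented-out sqr() lines and the hex values on the right
+ * track the running exponent, which ends at BLS12_381_P-2. Raising a
+ * nonzero base to p-2 gives its inverse by Fermat's little theorem.
+ */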
+
+#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \
+ptype t[16]; \
+vec_copy(t[1], inp, sizeof(ptype)); /*    0: 1 */\
+sqr(t[0], t[1]);                    /*    1: 2 */\
+mul(t[9], t[0], t[1]);              /*    2: 3 */\
+sqr(t[5], t[0]);                    /*    3: 4 */\
+mul(t[2], t[9], t[0]);              /*    4: 5 */\
+mul(t[7], t[5], t[9]);              /*    5: 7 */\
+mul(t[10], t[2], t[5]);             /*    6: 9 */\
+mul(t[13], t[7], t[5]);             /*    7: b */\
+mul(t[4], t[10], t[5]);             /*    8: d */\
+mul(t[8], t[13], t[5]);             /*    9: f */\
+mul(t[15], t[4], t[5]);             /*   10: 11 */\
+mul(t[11], t[8], t[5]);             /*   11: 13 */\
+mul(t[3], t[15], t[5]);             /*   12: 15 */\
+mul(t[12], t[11], t[5]);            /*   13: 17 */\
+sqr(t[0], t[4]);                    /*   14: 1a */\
+mul(t[14], t[12], t[5]);            /*   15: 1b */\
+mul(t[6], t[0], t[9]);              /*   16: 1d */\
+mul(t[5], t[0], t[2]);              /*   17: 1f */\
+/* sqr(t[0], t[0]); */              /*   18: 34 */\
+/* sqr(t[0], t[0]); */              /*   19: 68 */\
+/* sqr(t[0], t[0]); */              /*   20: d0 */\
+/* sqr(t[0], t[0]); */              /*   21: 1a0 */\
+/* sqr(t[0], t[0]); */              /*   22: 340 */\
+/* sqr(t[0], t[0]); */              /*   23: 680 */\
+/* sqr(t[0], t[0]); */              /*   24: d00 */\
+/* sqr(t[0], t[0]); */              /*   25: 1a00 */\
+/* sqr(t[0], t[0]); */              /*   26: 3400 */\
+/* sqr(t[0], t[0]); */              /*   27: 6800 */\
+/* sqr(t[0], t[0]); */              /*   28: d000 */\
+/* sqr(t[0], t[0]); */              /*   29: 1a000 */\
+sqr_n_mul(t[0], t[0], 12, t[15]);   /*   30: 1a011 */\
+/* sqr(t[0], t[0]); */              /*   31: 34022 */\
+/* sqr(t[0], t[0]); */              /*   32: 68044 */\
+/* sqr(t[0], t[0]); */              /*   33: d0088 */\
+/* sqr(t[0], t[0]); */              /*   34: 1a0110 */\
+/* sqr(t[0], t[0]); */              /*   35: 340220 */\
+/* sqr(t[0], t[0]); */              /*   36: 680440 */\
+/* sqr(t[0], t[0]); */              /*   37: d00880 */\
+sqr_n_mul(t[0], t[0], 7, t[8]);     /*   38: d0088f */\
+/* sqr(t[0], t[0]); */              /*   39: 1a0111e */\
+/* sqr(t[0], t[0]); */              /*   40: 340223c */\
+/* sqr(t[0], t[0]); */              /*   41: 6804478 */\
+/* sqr(t[0], t[0]); */              /*   42: d0088f0 */\
+sqr_n_mul(t[0], t[0], 4, t[2]);     /*   43: d0088f5 */\
+/* sqr(t[0], t[0]); */              /*   44: 1a0111ea */\
+/* sqr(t[0], t[0]); */              /*   45: 340223d4 */\
+/* sqr(t[0], t[0]); */              /*   46: 680447a8 */\
+/* sqr(t[0], t[0]); */              /*   47: d0088f50 */\
+/* sqr(t[0], t[0]); */              /*   48: 1a0111ea0 */\
+/* sqr(t[0], t[0]); */              /*   49: 340223d40 */\
+sqr_n_mul(t[0], t[0], 6, t[7]);     /*   50: 340223d47 */\
+/* sqr(t[0], t[0]); */              /*   51: 680447a8e */\
+/* sqr(t[0], t[0]); */              /*   52: d0088f51c */\
+/* sqr(t[0], t[0]); */              /*   53: 1a0111ea38 */\
+/* sqr(t[0], t[0]); */              /*   54: 340223d470 */\
+/* sqr(t[0], t[0]); */              /*   55: 680447a8e0 */\
+/* sqr(t[0], t[0]); */              /*   56: d0088f51c0 */\
+/* sqr(t[0], t[0]); */              /*   57: 1a0111ea380 */\
+sqr_n_mul(t[0], t[0], 7, t[12]);    /*   58: 1a0111ea397 */\
+/* sqr(t[0], t[0]); */              /*   59: 340223d472e */\
+/* sqr(t[0], t[0]); */              /*   60: 680447a8e5c */\
+/* sqr(t[0], t[0]); */              /*   61: d0088f51cb8 */\
+/* sqr(t[0], t[0]); */              /*   62: 1a0111ea3970 */\
+/* sqr(t[0], t[0]); */              /*   63: 340223d472e0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*   64: 340223d472ff */\
+/* sqr(t[0], t[0]); */              /*   65: 680447a8e5fe */\
+/* sqr(t[0], t[0]); */              /*   66: d0088f51cbfc */\
+sqr_n_mul(t[0], t[0], 2, t[9]);     /*   67: d0088f51cbff */\
+/* sqr(t[0], t[0]); */              /*   68: 1a0111ea397fe */\
+/* sqr(t[0], t[0]); */              /*   69: 340223d472ffc */\
+/* sqr(t[0], t[0]); */              /*   70: 680447a8e5ff8 */\
+/* sqr(t[0], t[0]); */              /*   71: d0088f51cbff0 */\
+/* sqr(t[0], t[0]); */              /*   72: 1a0111ea397fe0 */\
+/* sqr(t[0], t[0]); */              /*   73: 340223d472ffc0 */\
+sqr_n_mul(t[0], t[0], 6, t[4]);     /*   74: 340223d472ffcd */\
+/* sqr(t[0], t[0]); */              /*   75: 680447a8e5ff9a */\
+/* sqr(t[0], t[0]); */              /*   76: d0088f51cbff34 */\
+/* sqr(t[0], t[0]); */              /*   77: 1a0111ea397fe68 */\
+/* sqr(t[0], t[0]); */              /*   78: 340223d472ffcd0 */\
+/* sqr(t[0], t[0]); */              /*   79: 680447a8e5ff9a0 */\
+/* sqr(t[0], t[0]); */              /*   80: d0088f51cbff340 */\
+sqr_n_mul(t[0], t[0], 6, t[4]);     /*   81: d0088f51cbff34d */\
+/* sqr(t[0], t[0]); */              /*   82: 1a0111ea397fe69a */\
+/* sqr(t[0], t[0]); */              /*   83: 340223d472ffcd34 */\
+/* sqr(t[0], t[0]); */              /*   84: 680447a8e5ff9a68 */\
+/* sqr(t[0], t[0]); */              /*   85: d0088f51cbff34d0 */\
+/* sqr(t[0], t[0]); */              /*   86: 1a0111ea397fe69a0 */\
+/* sqr(t[0], t[0]); */              /*   87: 340223d472ffcd340 */\
+sqr_n_mul(t[0], t[0], 6, t[10]);    /*   88: 340223d472ffcd349 */\
+/* sqr(t[0], t[0]); */              /*   89: 680447a8e5ff9a692 */\
+/* sqr(t[0], t[0]); */              /*   90: d0088f51cbff34d24 */\
+/* sqr(t[0], t[0]); */              /*   91: 1a0111ea397fe69a48 */\
+sqr_n_mul(t[0], t[0], 3, t[9]);     /*   92: 1a0111ea397fe69a4b */\
+/* sqr(t[0], t[0]); */              /*   93: 340223d472ffcd3496 */\
+/* sqr(t[0], t[0]); */              /*   94: 680447a8e5ff9a692c */\
+/* sqr(t[0], t[0]); */              /*   95: d0088f51cbff34d258 */\
+/* sqr(t[0], t[0]); */              /*   96: 1a0111ea397fe69a4b0 */\
+/* sqr(t[0], t[0]); */              /*   97: 340223d472ffcd34960 */\
+/* sqr(t[0], t[0]); */              /*   98: 680447a8e5ff9a692c0 */\
+/* sqr(t[0], t[0]); */              /*   99: d0088f51cbff34d2580 */\
+sqr_n_mul(t[0], t[0], 7, t[4]);     /*  100: d0088f51cbff34d258d */\
+/* sqr(t[0], t[0]); */              /*  101: 1a0111ea397fe69a4b1a */\
+/* sqr(t[0], t[0]); */              /*  102: 340223d472ffcd349634 */\
+/* sqr(t[0], t[0]); */              /*  103: 680447a8e5ff9a692c68 */\
+/* sqr(t[0], t[0]); */              /*  104: d0088f51cbff34d258d0 */\
+sqr_n_mul(t[0], t[0], 4, t[4]);     /*  105: d0088f51cbff34d258dd */\
+/* sqr(t[0], t[0]); */              /*  106: 1a0111ea397fe69a4b1ba */\
+/* sqr(t[0], t[0]); */              /*  107: 340223d472ffcd3496374 */\
+/* sqr(t[0], t[0]); */              /*  108: 680447a8e5ff9a692c6e8 */\
+/* sqr(t[0], t[0]); */              /*  109: d0088f51cbff34d258dd0 */\
+/* sqr(t[0], t[0]); */              /*  110: 1a0111ea397fe69a4b1ba0 */\
+/* sqr(t[0], t[0]); */              /*  111: 340223d472ffcd34963740 */\
+sqr_n_mul(t[0], t[0], 6, t[8]);     /*  112: 340223d472ffcd3496374f */\
+/* sqr(t[0], t[0]); */              /*  113: 680447a8e5ff9a692c6e9e */\
+/* sqr(t[0], t[0]); */              /*  114: d0088f51cbff34d258dd3c */\
+/* sqr(t[0], t[0]); */              /*  115: 1a0111ea397fe69a4b1ba78 */\
+/* sqr(t[0], t[0]); */              /*  116: 340223d472ffcd3496374f0 */\
+/* sqr(t[0], t[0]); */              /*  117: 680447a8e5ff9a692c6e9e0 */\
+/* sqr(t[0], t[0]); */              /*  118: d0088f51cbff34d258dd3c0 */\
+sqr_n_mul(t[0], t[0], 6, t[14]);    /*  119: d0088f51cbff34d258dd3db */\
+/* sqr(t[0], t[0]); */              /*  120: 1a0111ea397fe69a4b1ba7b6 */\
+/* sqr(t[0], t[0]); */              /*  121: 340223d472ffcd3496374f6c */\
+/* sqr(t[0], t[0]); */              /*  122: 680447a8e5ff9a692c6e9ed8 */\
+sqr_n_mul(t[0], t[0], 3, t[1]);     /*  123: 680447a8e5ff9a692c6e9ed9 */\
+/* sqr(t[0], t[0]); */              /*  124: d0088f51cbff34d258dd3db2 */\
+/* sqr(t[0], t[0]); */              /*  125: 1a0111ea397fe69a4b1ba7b64 */\
+/* sqr(t[0], t[0]); */              /*  126: 340223d472ffcd3496374f6c8 */\
+/* sqr(t[0], t[0]); */              /*  127: 680447a8e5ff9a692c6e9ed90 */\
+/* sqr(t[0], t[0]); */              /*  128: d0088f51cbff34d258dd3db20 */\
+/* sqr(t[0], t[0]); */              /*  129: 1a0111ea397fe69a4b1ba7b640 */\
+/* sqr(t[0], t[0]); */              /*  130: 340223d472ffcd3496374f6c80 */\
+/* sqr(t[0], t[0]); */              /*  131: 680447a8e5ff9a692c6e9ed900 */\
+sqr_n_mul(t[0], t[0], 8, t[4]);     /*  132: 680447a8e5ff9a692c6e9ed90d */\
+/* sqr(t[0], t[0]); */              /*  133: d0088f51cbff34d258dd3db21a */\
+/* sqr(t[0], t[0]); */              /*  134: 1a0111ea397fe69a4b1ba7b6434 */\
+/* sqr(t[0], t[0]); */              /*  135: 340223d472ffcd3496374f6c868 */\
+/* sqr(t[0], t[0]); */              /*  136: 680447a8e5ff9a692c6e9ed90d0 */\
+/* sqr(t[0], t[0]); */              /*  137: d0088f51cbff34d258dd3db21a0 */\
+/* sqr(t[0], t[0]); */              /*  138: 1a0111ea397fe69a4b1ba7b64340 */\
+/* sqr(t[0], t[0]); */              /*  139: 340223d472ffcd3496374f6c8680 */\
+sqr_n_mul(t[0], t[0], 7, t[12]);    /*  140: 340223d472ffcd3496374f6c8697 */\
+/* sqr(t[0], t[0]); */              /*  141: 680447a8e5ff9a692c6e9ed90d2e */\
+/* sqr(t[0], t[0]); */              /*  142: d0088f51cbff34d258dd3db21a5c */\
+/* sqr(t[0], t[0]); */              /*  143: 1a0111ea397fe69a4b1ba7b6434b8 */\
+/* sqr(t[0], t[0]); */              /*  144: 340223d472ffcd3496374f6c86970 */\
+/* sqr(t[0], t[0]); */              /*  145: 680447a8e5ff9a692c6e9ed90d2e0 */\
+sqr_n_mul(t[0], t[0], 5, t[13]);    /*  146: 680447a8e5ff9a692c6e9ed90d2eb */\
+/* sqr(t[0], t[0]); */              /*  147: d0088f51cbff34d258dd3db21a5d6 */\
+/* sqr(t[0], t[0]); */              /*  148: 1a0111ea397fe69a4b1ba7b6434bac */\
+/* sqr(t[0], t[0]); */              /*  149: 340223d472ffcd3496374f6c869758 */\
+/* sqr(t[0], t[0]); */              /*  150: 680447a8e5ff9a692c6e9ed90d2eb0 */\
+/* sqr(t[0], t[0]); */              /*  151: d0088f51cbff34d258dd3db21a5d60 */\
+/* sqr(t[0], t[0]); */              /*  152: 1a0111ea397fe69a4b1ba7b6434bac0 */\
+sqr_n_mul(t[0], t[0], 6, t[4]);     /*  153: 1a0111ea397fe69a4b1ba7b6434bacd */\
+/* sqr(t[0], t[0]); */              /*  154: 340223d472ffcd3496374f6c869759a */\
+/* sqr(t[0], t[0]); */              /*  155: 680447a8e5ff9a692c6e9ed90d2eb34 */\
+/* sqr(t[0], t[0]); */              /*  156: d0088f51cbff34d258dd3db21a5d668 */\
+/* sqr(t[0], t[0]); */              /*  157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\
+/* sqr(t[0], t[0]); */              /*  158: 340223d472ffcd3496374f6c869759a0 */\
+/* sqr(t[0], t[0]); */              /*  159: 680447a8e5ff9a692c6e9ed90d2eb340 */\
+sqr_n_mul(t[0], t[0], 6, t[6]);     /*  160: 680447a8e5ff9a692c6e9ed90d2eb35d */\
+/* sqr(t[0], t[0]); */              /*  161: d0088f51cbff34d258dd3db21a5d66ba */\
+/* sqr(t[0], t[0]); */              /*  162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\
+/* sqr(t[0], t[0]); */              /*  163: 340223d472ffcd3496374f6c869759ae8 */\
+/* sqr(t[0], t[0]); */              /*  164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\
+sqr_n_mul(t[0], t[0], 4, t[10]);    /*  165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\
+/* sqr(t[0], t[0]); */              /*  166: d0088f51cbff34d258dd3db21a5d66bb2 */\
+/* sqr(t[0], t[0]); */              /*  167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\
+/* sqr(t[0], t[0]); */              /*  168: 340223d472ffcd3496374f6c869759aec8 */\
+/* sqr(t[0], t[0]); */              /*  169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\
+/* sqr(t[0], t[0]); */              /*  170: d0088f51cbff34d258dd3db21a5d66bb20 */\
+/* sqr(t[0], t[0]); */              /*  171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\
+/* sqr(t[0], t[0]); */              /*  172: 340223d472ffcd3496374f6c869759aec80 */\
+/* sqr(t[0], t[0]); */              /*  173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\
+sqr_n_mul(t[0], t[0], 8, t[6]);     /*  174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\
+/* sqr(t[0], t[0]); */              /*  175: d0088f51cbff34d258dd3db21a5d66bb23a */\
+/* sqr(t[0], t[0]); */              /*  176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\
+/* sqr(t[0], t[0]); */              /*  177: 340223d472ffcd3496374f6c869759aec8e8 */\
+/* sqr(t[0], t[0]); */              /*  178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\
+sqr_n_mul(t[0], t[0], 4, t[4]);     /*  179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\
+/* sqr(t[0], t[0]); */              /*  180: d0088f51cbff34d258dd3db21a5d66bb23ba */\
+/* sqr(t[0], t[0]); */              /*  181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\
+/* sqr(t[0], t[0]); */              /*  182: 340223d472ffcd3496374f6c869759aec8ee8 */\
+/* sqr(t[0], t[0]); */              /*  183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\
+/* sqr(t[0], t[0]); */              /*  184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\
+/* sqr(t[0], t[0]); */              /*  185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\
+/* sqr(t[0], t[0]); */              /*  186: 340223d472ffcd3496374f6c869759aec8ee80 */\
+sqr_n_mul(t[0], t[0], 7, t[12]);    /*  187: 340223d472ffcd3496374f6c869759aec8ee97 */\
+/* sqr(t[0], t[0]); */              /*  188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\
+/* sqr(t[0], t[0]); */              /*  189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\
+/* sqr(t[0], t[0]); */              /*  190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\
+/* sqr(t[0], t[0]); */              /*  191: 340223d472ffcd3496374f6c869759aec8ee970 */\
+/* sqr(t[0], t[0]); */              /*  192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\
+/* sqr(t[0], t[0]); */              /*  193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\
+/* sqr(t[0], t[0]); */              /*  194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\
+/* sqr(t[0], t[0]); */              /*  195: 340223d472ffcd3496374f6c869759aec8ee9700 */\
+/* sqr(t[0], t[0]); */              /*  196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\
+sqr_n_mul(t[0], t[0], 9, t[11]);    /*  197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\
+/* sqr(t[0], t[0]); */              /*  198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\
+/* sqr(t[0], t[0]); */              /*  199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\
+sqr_n_mul(t[0], t[0], 2, t[9]);     /*  200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\
+/* sqr(t[0], t[0]); */              /*  201: 340223d472ffcd3496374f6c869759aec8ee9709e */\
+/* sqr(t[0], t[0]); */              /*  202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\
+/* sqr(t[0], t[0]); */              /*  203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\
+/* sqr(t[0], t[0]); */              /*  204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\
+/* sqr(t[0], t[0]); */              /*  205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\
+sqr_n_mul(t[0], t[0], 5, t[7]);     /*  206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\
+/* sqr(t[0], t[0]); */              /*  207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\
+/* sqr(t[0], t[0]); */              /*  208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\
+/* sqr(t[0], t[0]); */              /*  209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\
+/* sqr(t[0], t[0]); */              /*  210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\
+/* sqr(t[0], t[0]); */              /*  211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\
+/* sqr(t[0], t[0]); */              /*  212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\
+/* sqr(t[0], t[0]); */              /*  213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\
+sqr_n_mul(t[0], t[0], 7, t[2]);     /*  214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\
+/* sqr(t[0], t[0]); */              /*  215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\
+/* sqr(t[0], t[0]); */              /*  216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\
+/* sqr(t[0], t[0]); */              /*  217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\
+/* sqr(t[0], t[0]); */              /*  218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\
+/* sqr(t[0], t[0]); */              /*  219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\
+/* sqr(t[0], t[0]); */              /*  220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\
+/* sqr(t[0], t[0]); */              /*  221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\
+sqr_n_mul(t[0], t[0], 7, t[10]);    /*  222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\
+/* sqr(t[0], t[0]); */              /*  223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\
+/* sqr(t[0], t[0]); */              /*  224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\
+/* sqr(t[0], t[0]); */              /*  225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\
+/* sqr(t[0], t[0]); */              /*  226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\
+/* sqr(t[0], t[0]); */              /*  227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\
+/* sqr(t[0], t[0]); */              /*  228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\
+sqr_n_mul(t[0], t[0], 6, t[12]);    /*  229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\
+/* sqr(t[0], t[0]); */              /*  230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\
+/* sqr(t[0], t[0]); */              /*  231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\
+/* sqr(t[0], t[0]); */              /*  232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\
+/* sqr(t[0], t[0]); */              /*  233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\
+/* sqr(t[0], t[0]); */              /*  234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\
+sqr_n_mul(t[0], t[0], 5, t[6]);     /*  235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\
+/* sqr(t[0], t[0]); */              /*  236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\
+/* sqr(t[0], t[0]); */              /*  237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\
+/* sqr(t[0], t[0]); */              /*  238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\
+/* sqr(t[0], t[0]); */              /*  239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\
+/* sqr(t[0], t[0]); */              /*  240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\
+sqr_n_mul(t[0], t[0], 5, t[11]);    /*  241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\
+/* sqr(t[0], t[0]); */              /*  242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\
+/* sqr(t[0], t[0]); */              /*  243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\
+/* sqr(t[0], t[0]); */              /*  244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\
+/* sqr(t[0], t[0]); */              /*  245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\
+/* sqr(t[0], t[0]); */              /*  246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\
+sqr_n_mul(t[0], t[0], 5, t[11]);    /*  247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\
+/* sqr(t[0], t[0]); */              /*  248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\
+/* sqr(t[0], t[0]); */              /*  249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\
+/* sqr(t[0], t[0]); */              /*  250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\
+/* sqr(t[0], t[0]); */              /*  251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\
+/* sqr(t[0], t[0]); */              /*  252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\
+/* sqr(t[0], t[0]); */              /*  253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\
+/* sqr(t[0], t[0]); */              /*  254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\
+/* sqr(t[0], t[0]); */              /*  255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\
+sqr_n_mul(t[0], t[0], 8, t[4]);     /*  256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\
+/* sqr(t[0], t[0]); */              /*  257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\
+/* sqr(t[0], t[0]); */              /*  258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\
+/* sqr(t[0], t[0]); */              /*  259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\
+/* sqr(t[0], t[0]); */              /*  260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\
+/* sqr(t[0], t[0]); */              /*  261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\
+/* sqr(t[0], t[0]); */              /*  262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\
+/* sqr(t[0], t[0]); */              /*  263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\
+sqr_n_mul(t[0], t[0], 7, t[3]);     /*  264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\
+/* sqr(t[0], t[0]); */              /*  265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\
+/* sqr(t[0], t[0]); */              /*  266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\
+/* sqr(t[0], t[0]); */              /*  267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\
+/* sqr(t[0], t[0]); */              /*  268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\
+/* sqr(t[0], t[0]); */              /*  269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\
+/* sqr(t[0], t[0]); */              /*  270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\
+/* sqr(t[0], t[0]); */              /*  271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\
+/* sqr(t[0], t[0]); */              /*  272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\
+/* sqr(t[0], t[0]); */              /*  273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\
+sqr_n_mul(t[0], t[0], 9, t[8]);     /*  274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\
+/* sqr(t[0], t[0]); */              /*  275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\
+/* sqr(t[0], t[0]); */              /*  276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\
+/* sqr(t[0], t[0]); */              /*  277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\
+/* sqr(t[0], t[0]); */              /*  278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\
+/* sqr(t[0], t[0]); */              /*  279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\
+/* sqr(t[0], t[0]); */              /*  281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\
+/* sqr(t[0], t[0]); */              /*  282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\
+/* sqr(t[0], t[0]); */              /*  283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\
+sqr_n_mul(t[0], t[0], 3, t[9]);     /*  284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\
+/* sqr(t[0], t[0]); */              /*  285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\
+/* sqr(t[0], t[0]); */              /*  286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\
+/* sqr(t[0], t[0]); */              /*  287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\
+/* sqr(t[0], t[0]); */              /*  288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\
+/* sqr(t[0], t[0]); */              /*  289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\
+/* sqr(t[0], t[0]); */              /*  290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\
+/* sqr(t[0], t[0]); */              /*  291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\
+/* sqr(t[0], t[0]); */              /*  292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\
+sqr_n_mul(t[0], t[0], 8, t[8]);     /*  293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\
+/* sqr(t[0], t[0]); */              /*  294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\
+/* sqr(t[0], t[0]); */              /*  295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\
+/* sqr(t[0], t[0]); */              /*  296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\
+sqr_n_mul(t[0], t[0], 3, t[9]);     /*  297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\
+/* sqr(t[0], t[0]); */              /*  298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\
+/* sqr(t[0], t[0]); */              /*  299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\
+/* sqr(t[0], t[0]); */              /*  300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\
+/* sqr(t[0], t[0]); */              /*  301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\
+/* sqr(t[0], t[0]); */              /*  302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\
+/* sqr(t[0], t[0]); */              /*  303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\
+/* sqr(t[0], t[0]); */              /*  304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\
+sqr_n_mul(t[0], t[0], 7, t[10]);    /*  305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\
+/* sqr(t[0], t[0]); */              /*  306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\
+/* sqr(t[0], t[0]); */              /*  307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\
+/* sqr(t[0], t[0]); */              /*  308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\
+/* sqr(t[0], t[0]); */              /*  309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\
+/* sqr(t[0], t[0]); */              /*  310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\
+/* sqr(t[0], t[0]); */              /*  311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\
+/* sqr(t[0], t[0]); */              /*  312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\
+/* sqr(t[0], t[0]); */              /*  313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\
+/* sqr(t[0], t[0]); */              /*  314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\
+sqr_n_mul(t[0], t[0], 9, t[8]);     /*  315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\
+/* sqr(t[0], t[0]); */              /*  316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\
+/* sqr(t[0], t[0]); */              /*  317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\
+/* sqr(t[0], t[0]); */              /*  318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\
+/* sqr(t[0], t[0]); */              /*  319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\
+/* sqr(t[0], t[0]); */              /*  320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\
+/* sqr(t[0], t[0]); */              /*  321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\
+sqr_n_mul(t[0], t[0], 6, t[3]);     /*  322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\
+/* sqr(t[0], t[0]); */              /*  323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\
+/* sqr(t[0], t[0]); */              /*  324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\
+/* sqr(t[0], t[0]); */              /*  325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\
+/* sqr(t[0], t[0]); */              /*  326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\
+/* sqr(t[0], t[0]); */              /*  327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\
+/* sqr(t[0], t[0]); */              /*  328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\
+sqr_n_mul(t[0], t[0], 6, t[5]);     /*  329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\
+/* sqr(t[0], t[0]); */              /*  330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\
+/* sqr(t[0], t[0]); */              /*  331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\
+/* sqr(t[0], t[0]); */              /*  332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\
+/* sqr(t[0], t[0]); */              /*  333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\
+/* sqr(t[0], t[0]); */              /*  334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\
+/* sqr(t[0], t[0]); */              /*  336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\
+/* sqr(t[0], t[0]); */              /*  337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\
+/* sqr(t[0], t[0]); */              /*  338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\
+/* sqr(t[0], t[0]); */              /*  339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\
+/* sqr(t[0], t[0]); */              /*  340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\
+/* sqr(t[0], t[0]); */              /*  342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\
+/* sqr(t[0], t[0]); */              /*  343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\
+/* sqr(t[0], t[0]); */              /*  344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\
+/* sqr(t[0], t[0]); */              /*  345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\
+sqr_n_mul(t[0], t[0], 4, t[4]);     /*  346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\
+/* sqr(t[0], t[0]); */              /*  347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\
+/* sqr(t[0], t[0]); */              /*  348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\
+/* sqr(t[0], t[0]); */              /*  349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\
+sqr_n_mul(t[0], t[0], 3, t[9]);     /*  350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\
+/* sqr(t[0], t[0]); */              /*  351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\
+/* sqr(t[0], t[0]); */              /*  352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\
+/* sqr(t[0], t[0]); */              /*  353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\
+/* sqr(t[0], t[0]); */              /*  354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\
+/* sqr(t[0], t[0]); */              /*  355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\
+/* sqr(t[0], t[0]); */              /*  356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\
+/* sqr(t[0], t[0]); */              /*  357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\
+/* sqr(t[0], t[0]); */              /*  358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\
+sqr_n_mul(t[0], t[0], 8, t[3]);     /*  359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\
+/* sqr(t[0], t[0]); */              /*  360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\
+/* sqr(t[0], t[0]); */              /*  361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\
+/* sqr(t[0], t[0]); */              /*  362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\
+/* sqr(t[0], t[0]); */              /*  363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\
+/* sqr(t[0], t[0]); */              /*  364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\
+/* sqr(t[0], t[0]); */              /*  365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\
+/* sqr(t[0], t[0]); */              /*  366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\
+sqr_n_mul(t[0], t[0], 7, t[5]);     /*  367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\
+/* sqr(t[0], t[0]); */              /*  368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\
+/* sqr(t[0], t[0]); */              /*  369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\
+/* sqr(t[0], t[0]); */              /*  370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\
+/* sqr(t[0], t[0]); */              /*  371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\
+/* sqr(t[0], t[0]); */              /*  372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\
+/* sqr(t[0], t[0]); */              /*  374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\
+/* sqr(t[0], t[0]); */              /*  375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\
+/* sqr(t[0], t[0]); */              /*  376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\
+/* sqr(t[0], t[0]); */              /*  377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\
+/* sqr(t[0], t[0]); */              /*  378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\
+/* sqr(t[0], t[0]); */              /*  380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\
+/* sqr(t[0], t[0]); */              /*  381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\
+/* sqr(t[0], t[0]); */              /*  382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\
+/* sqr(t[0], t[0]); */              /*  383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\
+sqr_n_mul(t[0], t[0], 4, t[8]);     /*  384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\
+/* sqr(t[0], t[0]); */              /*  385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\
+/* sqr(t[0], t[0]); */              /*  386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\
+/* sqr(t[0], t[0]); */              /*  387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\
+/* sqr(t[0], t[0]); */              /*  388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\
+sqr_n_mul(t[0], t[0], 4, t[7]);     /*  389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\
+/* sqr(t[0], t[0]); */              /*  390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\
+/* sqr(t[0], t[0]); */              /*  391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\
+/* sqr(t[0], t[0]); */              /*  392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\
+/* sqr(t[0], t[0]); */              /*  393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\
+/* sqr(t[0], t[0]); */              /*  394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\
+/* sqr(t[0], t[0]); */              /*  395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\
+/* sqr(t[0], t[0]); */              /*  396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\
+sqr_n_mul(t[0], t[0], 7, t[5]);     /*  397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\
+/* sqr(t[0], t[0]); */              /*  398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\
+/* sqr(t[0], t[0]); */              /*  399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\
+/* sqr(t[0], t[0]); */              /*  400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\
+/* sqr(t[0], t[0]); */              /*  401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\
+/* sqr(t[0], t[0]); */              /*  402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\
+sqr_n_mul(t[0], t[0], 5, t[6]);     /*  403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\
+/* sqr(t[0], t[0]); */              /*  404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\
+/* sqr(t[0], t[0]); */              /*  405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\
+/* sqr(t[0], t[0]); */              /*  406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\
+/* sqr(t[0], t[0]); */              /*  407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\
+/* sqr(t[0], t[0]); */              /*  408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\
+/* sqr(t[0], t[0]); */              /*  410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\
+/* sqr(t[0], t[0]); */              /*  411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\
+/* sqr(t[0], t[0]); */              /*  412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\
+/* sqr(t[0], t[0]); */              /*  413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\
+/* sqr(t[0], t[0]); */              /*  414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\
+/* sqr(t[0], t[0]); */              /*  416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\
+/* sqr(t[0], t[0]); */              /*  417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\
+/* sqr(t[0], t[0]); */              /*  418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\
+/* sqr(t[0], t[0]); */              /*  419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\
+/* sqr(t[0], t[0]); */              /*  420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\
+/* sqr(t[0], t[0]); */              /*  422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\
+/* sqr(t[0], t[0]); */              /*  423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\
+/* sqr(t[0], t[0]); */              /*  424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\
+/* sqr(t[0], t[0]); */              /*  425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\
+/* sqr(t[0], t[0]); */              /*  426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\
+/* sqr(t[0], t[0]); */              /*  428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\
+/* sqr(t[0], t[0]); */              /*  429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\
+/* sqr(t[0], t[0]); */              /*  430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\
+/* sqr(t[0], t[0]); */              /*  431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\
+/* sqr(t[0], t[0]); */              /*  432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\
+/* sqr(t[0], t[0]); */              /*  434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\
+/* sqr(t[0], t[0]); */              /*  435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\
+/* sqr(t[0], t[0]); */              /*  436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\
+/* sqr(t[0], t[0]); */              /*  437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\
+/* sqr(t[0], t[0]); */              /*  438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\
+/* sqr(t[0], t[0]); */              /*  440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\
+/* sqr(t[0], t[0]); */              /*  441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\
+/* sqr(t[0], t[0]); */              /*  442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\
+/* sqr(t[0], t[0]); */              /*  443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\
+sqr_n_mul(t[0], t[0], 4, t[4]);     /*  444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\
+/* sqr(t[0], t[0]); */              /*  445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\
+/* sqr(t[0], t[0]); */              /*  446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\
+/* sqr(t[0], t[0]); */              /*  447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\
+/* sqr(t[0], t[0]); */              /*  448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\
+/* sqr(t[0], t[0]); */              /*  449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\
+/* sqr(t[0], t[0]); */              /*  450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\
+sqr_n_mul(t[0], t[0], 6, t[3]);     /*  451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\
+/* sqr(t[0], t[0]); */              /*  452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\
+/* sqr(t[0], t[0]); */              /*  453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\
+/* sqr(t[0], t[0]); */              /*  454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\
+/* sqr(t[0], t[0]); */              /*  455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\
+sqr_n_mul(t[0], t[0], 4, t[2]);     /*  456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\
+/* sqr(t[0], t[0]); */              /*  457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\
+/* sqr(t[0], t[0]); */              /*  458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\
+/* sqr(t[0], t[0]); */              /*  459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\
+sqr_n_mul(out, t[0], 3, t[1]);      /*  460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\
+} while(0)
diff --git a/blst/recip.c b/blst/recip.c
new file mode 100644
index 0000000..e0c7006
--- /dev/null
+++ b/blst/recip.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "fields.h"
+
+#ifdef __OPTIMIZE_SIZE__
+/*
+ * 608 multiplications for inversion modulo the BLS12-381 prime, 32%
+ * more than the corresponding optimal addition chain, plus mispredicted
+ * branch penalties on top of that... The addition chain below was
+ * measured to be >50% faster.
+ */
+static void flt_reciprocal_fp(vec384 out, const vec384 inp)
+{
+    static const byte BLS12_381_P_minus_2[] = {
+        TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff),
+        TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf),
+        TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a)
+    };
+
+    exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0);
+}
+#else
+# define sqr(ret,a)		sqr_fp(ret,a)
+# define mul(ret,a,b)		mul_fp(ret,a,b)
+# define sqr_n_mul(ret,a,n,b)	sqr_n_mul_fp(ret,a,n,b)
+
+# include "recip-addchain.h"
+static void flt_reciprocal_fp(vec384 out, const vec384 inp)
+{
+    RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384);
+}
+# undef RECIPROCAL_MOD_BLS12_381_P
+# undef sqr_n_mul
+# undef mul
+# undef sqr
+#endif
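
The size-optimized branch above is plain Fermat's little theorem: for the prime P and non-zero x, x^(P-2) == x^-1 (mod P), which exp_mont_384 evaluates over the 384-bit field. A minimal sketch of the same idea over a toy 64-bit prime; powmod and the prime 1000003 are illustrative stand-ins, not blst API:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy square-and-multiply exponentiation mod a small prime; the real code
     * does the same job with 384-bit Montgomery arithmetic. */
    static uint64_t powmod(uint64_t base, uint64_t exp, uint64_t mod)
    {
        uint64_t acc = 1 % mod;
        base %= mod;
        while (exp) {
            if (exp & 1)
                acc = (acc * base) % mod;   /* "mul" step */
            base = (base * base) % mod;     /* "sqr" step */
            exp >>= 1;
        }
        return acc;
    }

    int main(void)
    {
        const uint64_t p = 1000003;   /* small prime standing in for BLS12_381_P */
        const uint64_t x = 123456;

        /* Fermat's little theorem: x^(p-2) == x^-1 (mod p) for x != 0 */
        uint64_t inv = powmod(x, p - 2, p);

        printf("x*inv mod p = %llu (expect 1)\n",
               (unsigned long long)((x * inv) % p));
        return 0;
    }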
+
+static void flt_reciprocal_fp2(vec384x out, const vec384x inp)
+{
+    vec384 t0, t1;
+
+    /*
+     * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i
+     */
+    sqr_fp(t0, inp[0]);
+    sqr_fp(t1, inp[1]);
+    add_fp(t0, t0, t1);
+    flt_reciprocal_fp(t1, t0);
+    mul_fp(out[0], inp[0], t1);
+    mul_fp(out[1], inp[1], t1);
+    neg_fp(out[1], out[1]);
+}
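
The formula in flt_reciprocal_fp2 is the usual complex-conjugate trick: in Fp2 = Fp[i]/(i^2+1), 1/(a + b*i) = (a - b*i)/(a^2 + b^2), so one base-field inversion of the norm plus two multiplications suffice. A toy check of that identity with p = 7 (chosen with p % 4 == 3 so that -1 is a non-residue); all names here are illustrative, not blst API:

    #include <stdio.h>

    /* Toy F_p and F_p^2 = F_p[i]/(i^2+1) with p = 7, mirroring
     * 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i. */
    #define P 7u

    static unsigned inv_fp(unsigned x)          /* x^(P-2) mod P, Fermat */
    {
        unsigned r = 1;
        for (unsigned e = P - 2; e; e--)
            r = (r * x) % P;
        return r;
    }

    int main(void)
    {
        unsigned a = 3, b = 5;                   /* inp = 3 + 5*i */
        unsigned t = inv_fp((a*a + b*b) % P);    /* 1/(a^2 + b^2) */
        unsigned out0 = (a * t) % P;             /* real part */
        unsigned out1 = (P - (b * t) % P) % P;   /* imaginary part, i.e. -b/(a^2+b^2) */

        /* check: (a + b*i) * (out0 + out1*i) == 1, using i^2 == -1 */
        unsigned re = (a*out0 + (P - (b*out1) % P)) % P;
        unsigned im = (a*out1 + b*out0) % P;
        printf("product = %u + %u*i (expect 1 + 0*i)\n", re, im);
        return 0;
    }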
+
+static void reciprocal_fp(vec384 out, const vec384 inp)
+{
+    static const vec384 Px8 = {    /* left-aligned value of the modulus */
+        TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd),
+        TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb),
+        TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2)
+    };
+#ifdef __BLST_NO_ASM__
+# define RRx4 BLS12_381_RR
+#else
+    static const vec384 RRx4 = {   /* (4<<768)%P */
+        TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8),
+        TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983),
+        TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175)
+    };
+#endif
+    union { vec768 x; vec384 r[2]; } temp;
+
+    ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8);
+    redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0);
+    mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0);
+
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    /* sign goes straight to flt_reciprocal */
+    mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0);
+    if (vec_is_equal(temp.r[1],  BLS12_381_Rx.p, sizeof(vec384)) |
+        vec_is_zero(temp.r[1], sizeof(vec384)))
+        vec_copy(out, temp.r[0], sizeof(vec384));
+    else
+        flt_reciprocal_fp(out, inp);
+#else
+    vec_copy(out, temp.r[0], sizeof(vec384));
+#endif
+#undef RRx4
+}
+
+void blst_fp_inverse(vec384 out, const vec384 inp)
+{   reciprocal_fp(out, inp);   }
+
+void blst_fp_eucl_inverse(vec384 ret, const vec384 a)
+{   reciprocal_fp(ret, a);   }
+
+static void reciprocal_fp2(vec384x out, const vec384x inp)
+{
+    vec384 t0, t1;
+
+    /*
+     * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i
+     */
+    sqr_fp(t0, inp[0]);
+    sqr_fp(t1, inp[1]);
+    add_fp(t0, t0, t1);
+    reciprocal_fp(t1, t0);
+    mul_fp(out[0], inp[0], t1);
+    mul_fp(out[1], inp[1], t1);
+    neg_fp(out[1], out[1]);
+}
+
+void blst_fp2_inverse(vec384x out, const vec384x inp)
+{   reciprocal_fp2(out, inp);   }
+
+void blst_fp2_eucl_inverse(vec384x out, const vec384x inp)
+{   reciprocal_fp2(out, inp);   }
+
+static void reciprocal_fr(vec256 out, const vec256 inp)
+{
+    static const vec256 rx2 = { /* left-aligned value of the modulus */
+        TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd),
+        TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90),
+    };
+    vec512 temp;
+
+    ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2);
+    redc_mont_256(out, temp, BLS12_381_r, r0);
+    mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0);
+}
+
+void blst_fr_inverse(vec256 out, const vec256 inp)
+{   reciprocal_fr(out, inp);   }
+
+void blst_fr_eucl_inverse(vec256 out, const vec256 inp)
+{   reciprocal_fr(out, inp);   }
diff --git a/blst/server.c b/blst/server.c
new file mode 100644
index 0000000..52c1812
--- /dev/null
+++ b/blst/server.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "keygen.c"
+#include "hash_to_field.c"
+#include "e1.c"
+#include "map_to_g1.c"
+#include "e2.c"
+#include "map_to_g2.c"
+#include "fp12_tower.c"
+#include "pairing.c"
+#include "aggregate.c"
+#include "exp.c"
+#include "sqrt.c"
+#include "recip.c"
+#include "bulk_addition.c"
+#include "multi_scalar.c"
+#include "consts.c"
+#include "vect.c"
+#include "exports.c"
+#include "rb_tree.c"
diff --git a/blst/sha256.h b/blst/sha256.h
new file mode 100644
index 0000000..77ddb6d
--- /dev/null
+++ b/blst/sha256.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_SHA256_H__
+#define __BLS12_381_ASM_SHA256_H__
+
+#include "vect.h"
+
+#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \
+     defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__)
+# define sha256_block_data_order blst_sha256_block_data_order_shaext
+#elif defined(__aarch64__) && \
+      defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__)
+# define sha256_block_data_order blst_sha256_block_armv8
+#else
+# define sha256_block_data_order blst_sha256_block_data_order
+#endif
+#define sha256_hcopy blst_sha256_hcopy
+#define sha256_bcopy blst_sha256_bcopy
+#define sha256_emit  blst_sha256_emit
+
+void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks);
+void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]);
+void sha256_bcopy(void *dst, const void *src, size_t len);
+
+/*
+ * If SHA256_CTX conflicts with something, just redefine it to an alternative
+ * custom name prior to including this header.
+ */
+typedef struct {
+    unsigned int h[8];
+    unsigned long long N;
+    unsigned char buf[64];
+    size_t off;
+} SHA256_CTX;
+
+
+static void sha256_init_h(unsigned int h[8])
+{
+    h[0] = 0x6a09e667U;
+    h[1] = 0xbb67ae85U;
+    h[2] = 0x3c6ef372U;
+    h[3] = 0xa54ff53aU;
+    h[4] = 0x510e527fU;
+    h[5] = 0x9b05688cU;
+    h[6] = 0x1f83d9abU;
+    h[7] = 0x5be0cd19U;
+}
+
+static void sha256_init(SHA256_CTX *ctx)
+{
+    sha256_init_h(ctx->h);
+    ctx->N = 0;
+    vec_zero(ctx->buf, sizeof(ctx->buf));
+    ctx->off = 0;
+}
+
+static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len)
+{
+    size_t n;
+    const unsigned char *inp = _inp;
+
+    ctx->N += len;
+
+    if ((len != 0) & ((n = ctx->off) != 0)) {
+        size_t rem = sizeof(ctx->buf) - n;
+
+        if (rem > len) {
+            sha256_bcopy(ctx->buf + n, inp, len);
+            ctx->off += len;
+            return;
+        } else {
+            sha256_bcopy(ctx->buf + n, inp, rem);
+            inp += rem;
+            len -= rem;
+            sha256_block_data_order(ctx->h, ctx->buf, 1);
+            vec_zero(ctx->buf, sizeof(ctx->buf));
+            ctx->off = 0;
+        }
+    }
+
+    n = len / sizeof(ctx->buf);
+    if (n > 0) {
+        sha256_block_data_order(ctx->h, inp, n);
+        n *= sizeof(ctx->buf);
+        inp += n;
+        len -= n;
+    }
+
+    if (len)
+        sha256_bcopy(ctx->buf, inp, ctx->off = len);
+}
+
+#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \
+                            (ptr)[1] = (unsigned char)((val)>>16), \
+                            (ptr)[2] = (unsigned char)((val)>>8),  \
+                            (ptr)[3] = (unsigned char)(val))
+
+#if 1
+void sha256_emit(unsigned char md[32], const unsigned int h[8]);
+#else
+static void sha256_emit(unsigned char md[32], const unsigned int h[8])
+{
+    unsigned int h_i;
+
+    h_i = h[0]; __TOBE32(md + 0, h_i);
+    h_i = h[1]; __TOBE32(md + 4, h_i);
+    h_i = h[2]; __TOBE32(md + 8, h_i);
+    h_i = h[3]; __TOBE32(md + 12, h_i);
+    h_i = h[4]; __TOBE32(md + 16, h_i);
+    h_i = h[5]; __TOBE32(md + 20, h_i);
+    h_i = h[6]; __TOBE32(md + 24, h_i);
+    h_i = h[7]; __TOBE32(md + 28, h_i);
+}
+#endif
+
+static void sha256_final(unsigned char md[32], SHA256_CTX *ctx)
+{
+    unsigned long long bits = ctx->N * 8;
+    size_t n = ctx->off;
+    unsigned char *tail;
+
+    ctx->buf[n++] = 0x80;
+
+    if (n > (sizeof(ctx->buf) - 8)) {
+        sha256_block_data_order(ctx->h, ctx->buf, 1);
+        vec_zero(ctx->buf, sizeof(ctx->buf));
+    }
+
+    tail = ctx->buf + sizeof(ctx->buf) - 8;
+    __TOBE32(tail, (unsigned int)(bits >> 32));
+    __TOBE32(tail + 4, (unsigned int)bits);
+    sha256_block_data_order(ctx->h, ctx->buf, 1);
+    sha256_emit(md, ctx->h);
+}
+
+#undef __TOBE32
+#endif
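
sha256.h implements a conventional streaming interface: sha256_init seeds the eight IV words, sha256_update buffers partial blocks and hands full 64-byte blocks to the assembly-backed sha256_block_data_order, and sha256_final appends the 0x80 byte plus the 64-bit bit count before emitting the big-endian digest. A rough usage sketch, assuming the file is compiled inside the blst source tree so the blst_sha256_* assembly symbols resolve at link time:

    /* Hypothetical test file dropped next to the blst sources; it assumes the
     * blst_sha256_* assembly routines are linked in (e.g. via libblst.a). */
    #include <stdio.h>
    #include "sha256.h"

    int main(void)
    {
        SHA256_CTX ctx;
        unsigned char md[32];

        sha256_init(&ctx);
        sha256_update(&ctx, "hello ", 6);   /* data may arrive in chunks */
        sha256_update(&ctx, "world", 5);
        sha256_final(md, &ctx);             /* pads and emits the digest */

        for (int i = 0; i < 32; i++)
            printf("%02x", md[i]);
        printf("\n");
        return 0;
    }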
diff --git a/blst/sqrt-addchain.h b/blst/sqrt-addchain.h
new file mode 100644
index 0000000..4e7f0be
--- /dev/null
+++ b/blst/sqrt-addchain.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which
+ * yields reciprocal of sqrt(x), which is used in simplified Shallue-
+ * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt
+ * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x)
+ * as 'x*ret^2==1').
+ *
+ * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946'
+ * https://github.com/kwantam/addchain
+ *
+ * # Bos-Coster (win=4)           :  458 (16) <<<
+ * # Bos-Coster (win=5)           :  460 (28)
+ * # Bos-Coster (win=6)           :  461 (33)
+ * # Bos-Coster (win=7)           :  460 (28)
+ * # Bos-Coster (win=3)           :  462 ( 9)
+ * # Bos-Coster (win=8)           :  466 (34)
+ * # Bos-Coster (win=9)           :  464 (31)
+ * # Yacobi                       :  478 (31)
+ * # Bos-Coster (win=10)          :  473 (30)
+ * # Bos-Coster (win=2)           :  486 ( 5)
+ * # Bergeron-Berstel-Brlek-Duboc :  489 ( 5)
+ */
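
Concretely, because BLS12_381_P == 3 (mod 4), ret = x^((P-3)/4) gives x*ret^2 = x^((P-1)/2) (the Euler/Legendre test, equal to 1 exactly when x is a square) and x*ret = x^((P+1)/4), whose square is x whenever x is a quadratic residue. A toy demonstration with a small prime that is also 3 mod 4; powmod and 1000003 are illustrative only, not blst API:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t powmod(uint64_t b, uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;
        for (b %= m; e; e >>= 1, b = (b * b) % m)
            if (e & 1)
                r = (r * b) % m;
        return r;
    }

    int main(void)
    {
        const uint64_t p = 1000003;            /* 1000003 % 4 == 3, like BLS12_381_P */
        const uint64_t x = (42 * 42) % p;      /* a known square */

        uint64_t ret   = powmod(x, (p - 3) / 4, p);      /* the "magic" exponent */
        uint64_t is_sq = (x * ((ret * ret) % p)) % p;    /* x*ret^2 == 1 iff square */
        uint64_t root  = (x * ret) % p;                  /* x*ret == sqrt(x) */

        printf("x*ret^2 = %llu (expect 1)\n", (unsigned long long)is_sq);
        printf("root^2  = %llu, x = %llu\n",
               (unsigned long long)((root * root) % p), (unsigned long long)x);
        return 0;
    }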
+
+#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \
+ptype t[16]; \
+vec_copy(t[13], inp, sizeof(ptype));/*    0: 1 */\
+sqr(t[0], t[13]);                   /*    1: 2 */\
+mul(t[8], t[0], t[13]);             /*    2: 3 */\
+sqr(t[4], t[0]);                    /*    3: 4 */\
+mul(t[1], t[8], t[0]);              /*    4: 5 */\
+mul(t[6], t[4], t[8]);              /*    5: 7 */\
+mul(t[9], t[1], t[4]);              /*    6: 9 */\
+mul(t[12], t[6], t[4]);             /*    7: b */\
+mul(t[3], t[9], t[4]);              /*    8: d */\
+mul(t[7], t[12], t[4]);             /*    9: f */\
+mul(t[15], t[3], t[4]);             /*   10: 11 */\
+mul(t[10], t[7], t[4]);             /*   11: 13 */\
+mul(t[2], t[15], t[4]);             /*   12: 15 */\
+mul(t[11], t[10], t[4]);            /*   13: 17 */\
+sqr(t[0], t[3]);                    /*   14: 1a */\
+mul(t[14], t[11], t[4]);            /*   15: 1b */\
+mul(t[5], t[0], t[8]);              /*   16: 1d */\
+mul(t[4], t[0], t[1]);              /*   17: 1f */\
+/* sqr(t[0], t[0]); */              /*   18: 34 */\
+/* sqr(t[0], t[0]); */              /*   19: 68 */\
+/* sqr(t[0], t[0]); */              /*   20: d0 */\
+/* sqr(t[0], t[0]); */              /*   21: 1a0 */\
+/* sqr(t[0], t[0]); */              /*   22: 340 */\
+/* sqr(t[0], t[0]); */              /*   23: 680 */\
+/* sqr(t[0], t[0]); */              /*   24: d00 */\
+/* sqr(t[0], t[0]); */              /*   25: 1a00 */\
+/* sqr(t[0], t[0]); */              /*   26: 3400 */\
+/* sqr(t[0], t[0]); */              /*   27: 6800 */\
+/* sqr(t[0], t[0]); */              /*   28: d000 */\
+/* sqr(t[0], t[0]); */              /*   29: 1a000 */\
+sqr_n_mul(t[0], t[0], 12, t[15]);   /*   30: 1a011 */\
+/* sqr(t[0], t[0]); */              /*   31: 34022 */\
+/* sqr(t[0], t[0]); */              /*   32: 68044 */\
+/* sqr(t[0], t[0]); */              /*   33: d0088 */\
+/* sqr(t[0], t[0]); */              /*   34: 1a0110 */\
+/* sqr(t[0], t[0]); */              /*   35: 340220 */\
+/* sqr(t[0], t[0]); */              /*   36: 680440 */\
+/* sqr(t[0], t[0]); */              /*   37: d00880 */\
+sqr_n_mul(t[0], t[0], 7, t[7]);     /*   38: d0088f */\
+/* sqr(t[0], t[0]); */              /*   39: 1a0111e */\
+/* sqr(t[0], t[0]); */              /*   40: 340223c */\
+/* sqr(t[0], t[0]); */              /*   41: 6804478 */\
+/* sqr(t[0], t[0]); */              /*   42: d0088f0 */\
+sqr_n_mul(t[0], t[0], 4, t[1]);     /*   43: d0088f5 */\
+/* sqr(t[0], t[0]); */              /*   44: 1a0111ea */\
+/* sqr(t[0], t[0]); */              /*   45: 340223d4 */\
+/* sqr(t[0], t[0]); */              /*   46: 680447a8 */\
+/* sqr(t[0], t[0]); */              /*   47: d0088f50 */\
+/* sqr(t[0], t[0]); */              /*   48: 1a0111ea0 */\
+/* sqr(t[0], t[0]); */              /*   49: 340223d40 */\
+sqr_n_mul(t[0], t[0], 6, t[6]);     /*   50: 340223d47 */\
+/* sqr(t[0], t[0]); */              /*   51: 680447a8e */\
+/* sqr(t[0], t[0]); */              /*   52: d0088f51c */\
+/* sqr(t[0], t[0]); */              /*   53: 1a0111ea38 */\
+/* sqr(t[0], t[0]); */              /*   54: 340223d470 */\
+/* sqr(t[0], t[0]); */              /*   55: 680447a8e0 */\
+/* sqr(t[0], t[0]); */              /*   56: d0088f51c0 */\
+/* sqr(t[0], t[0]); */              /*   57: 1a0111ea380 */\
+sqr_n_mul(t[0], t[0], 7, t[11]);    /*   58: 1a0111ea397 */\
+/* sqr(t[0], t[0]); */              /*   59: 340223d472e */\
+/* sqr(t[0], t[0]); */              /*   60: 680447a8e5c */\
+/* sqr(t[0], t[0]); */              /*   61: d0088f51cb8 */\
+/* sqr(t[0], t[0]); */              /*   62: 1a0111ea3970 */\
+/* sqr(t[0], t[0]); */              /*   63: 340223d472e0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*   64: 340223d472ff */\
+/* sqr(t[0], t[0]); */              /*   65: 680447a8e5fe */\
+/* sqr(t[0], t[0]); */              /*   66: d0088f51cbfc */\
+sqr_n_mul(t[0], t[0], 2, t[8]);     /*   67: d0088f51cbff */\
+/* sqr(t[0], t[0]); */              /*   68: 1a0111ea397fe */\
+/* sqr(t[0], t[0]); */              /*   69: 340223d472ffc */\
+/* sqr(t[0], t[0]); */              /*   70: 680447a8e5ff8 */\
+/* sqr(t[0], t[0]); */              /*   71: d0088f51cbff0 */\
+/* sqr(t[0], t[0]); */              /*   72: 1a0111ea397fe0 */\
+/* sqr(t[0], t[0]); */              /*   73: 340223d472ffc0 */\
+sqr_n_mul(t[0], t[0], 6, t[3]);     /*   74: 340223d472ffcd */\
+/* sqr(t[0], t[0]); */              /*   75: 680447a8e5ff9a */\
+/* sqr(t[0], t[0]); */              /*   76: d0088f51cbff34 */\
+/* sqr(t[0], t[0]); */              /*   77: 1a0111ea397fe68 */\
+/* sqr(t[0], t[0]); */              /*   78: 340223d472ffcd0 */\
+/* sqr(t[0], t[0]); */              /*   79: 680447a8e5ff9a0 */\
+/* sqr(t[0], t[0]); */              /*   80: d0088f51cbff340 */\
+sqr_n_mul(t[0], t[0], 6, t[3]);     /*   81: d0088f51cbff34d */\
+/* sqr(t[0], t[0]); */              /*   82: 1a0111ea397fe69a */\
+/* sqr(t[0], t[0]); */              /*   83: 340223d472ffcd34 */\
+/* sqr(t[0], t[0]); */              /*   84: 680447a8e5ff9a68 */\
+/* sqr(t[0], t[0]); */              /*   85: d0088f51cbff34d0 */\
+/* sqr(t[0], t[0]); */              /*   86: 1a0111ea397fe69a0 */\
+/* sqr(t[0], t[0]); */              /*   87: 340223d472ffcd340 */\
+sqr_n_mul(t[0], t[0], 6, t[9]);     /*   88: 340223d472ffcd349 */\
+/* sqr(t[0], t[0]); */              /*   89: 680447a8e5ff9a692 */\
+/* sqr(t[0], t[0]); */              /*   90: d0088f51cbff34d24 */\
+/* sqr(t[0], t[0]); */              /*   91: 1a0111ea397fe69a48 */\
+sqr_n_mul(t[0], t[0], 3, t[8]);     /*   92: 1a0111ea397fe69a4b */\
+/* sqr(t[0], t[0]); */              /*   93: 340223d472ffcd3496 */\
+/* sqr(t[0], t[0]); */              /*   94: 680447a8e5ff9a692c */\
+/* sqr(t[0], t[0]); */              /*   95: d0088f51cbff34d258 */\
+/* sqr(t[0], t[0]); */              /*   96: 1a0111ea397fe69a4b0 */\
+/* sqr(t[0], t[0]); */              /*   97: 340223d472ffcd34960 */\
+/* sqr(t[0], t[0]); */              /*   98: 680447a8e5ff9a692c0 */\
+/* sqr(t[0], t[0]); */              /*   99: d0088f51cbff34d2580 */\
+sqr_n_mul(t[0], t[0], 7, t[3]);     /*  100: d0088f51cbff34d258d */\
+/* sqr(t[0], t[0]); */              /*  101: 1a0111ea397fe69a4b1a */\
+/* sqr(t[0], t[0]); */              /*  102: 340223d472ffcd349634 */\
+/* sqr(t[0], t[0]); */              /*  103: 680447a8e5ff9a692c68 */\
+/* sqr(t[0], t[0]); */              /*  104: d0088f51cbff34d258d0 */\
+sqr_n_mul(t[0], t[0], 4, t[3]);     /*  105: d0088f51cbff34d258dd */\
+/* sqr(t[0], t[0]); */              /*  106: 1a0111ea397fe69a4b1ba */\
+/* sqr(t[0], t[0]); */              /*  107: 340223d472ffcd3496374 */\
+/* sqr(t[0], t[0]); */              /*  108: 680447a8e5ff9a692c6e8 */\
+/* sqr(t[0], t[0]); */              /*  109: d0088f51cbff34d258dd0 */\
+/* sqr(t[0], t[0]); */              /*  110: 1a0111ea397fe69a4b1ba0 */\
+/* sqr(t[0], t[0]); */              /*  111: 340223d472ffcd34963740 */\
+sqr_n_mul(t[0], t[0], 6, t[7]);     /*  112: 340223d472ffcd3496374f */\
+/* sqr(t[0], t[0]); */              /*  113: 680447a8e5ff9a692c6e9e */\
+/* sqr(t[0], t[0]); */              /*  114: d0088f51cbff34d258dd3c */\
+/* sqr(t[0], t[0]); */              /*  115: 1a0111ea397fe69a4b1ba78 */\
+/* sqr(t[0], t[0]); */              /*  116: 340223d472ffcd3496374f0 */\
+/* sqr(t[0], t[0]); */              /*  117: 680447a8e5ff9a692c6e9e0 */\
+/* sqr(t[0], t[0]); */              /*  118: d0088f51cbff34d258dd3c0 */\
+sqr_n_mul(t[0], t[0], 6, t[14]);    /*  119: d0088f51cbff34d258dd3db */\
+/* sqr(t[0], t[0]); */              /*  120: 1a0111ea397fe69a4b1ba7b6 */\
+/* sqr(t[0], t[0]); */              /*  121: 340223d472ffcd3496374f6c */\
+/* sqr(t[0], t[0]); */              /*  122: 680447a8e5ff9a692c6e9ed8 */\
+sqr_n_mul(t[0], t[0], 3, t[13]);    /*  123: 680447a8e5ff9a692c6e9ed9 */\
+/* sqr(t[0], t[0]); */              /*  124: d0088f51cbff34d258dd3db2 */\
+/* sqr(t[0], t[0]); */              /*  125: 1a0111ea397fe69a4b1ba7b64 */\
+/* sqr(t[0], t[0]); */              /*  126: 340223d472ffcd3496374f6c8 */\
+/* sqr(t[0], t[0]); */              /*  127: 680447a8e5ff9a692c6e9ed90 */\
+/* sqr(t[0], t[0]); */              /*  128: d0088f51cbff34d258dd3db20 */\
+/* sqr(t[0], t[0]); */              /*  129: 1a0111ea397fe69a4b1ba7b640 */\
+/* sqr(t[0], t[0]); */              /*  130: 340223d472ffcd3496374f6c80 */\
+/* sqr(t[0], t[0]); */              /*  131: 680447a8e5ff9a692c6e9ed900 */\
+sqr_n_mul(t[0], t[0], 8, t[3]);     /*  132: 680447a8e5ff9a692c6e9ed90d */\
+/* sqr(t[0], t[0]); */              /*  133: d0088f51cbff34d258dd3db21a */\
+/* sqr(t[0], t[0]); */              /*  134: 1a0111ea397fe69a4b1ba7b6434 */\
+/* sqr(t[0], t[0]); */              /*  135: 340223d472ffcd3496374f6c868 */\
+/* sqr(t[0], t[0]); */              /*  136: 680447a8e5ff9a692c6e9ed90d0 */\
+/* sqr(t[0], t[0]); */              /*  137: d0088f51cbff34d258dd3db21a0 */\
+/* sqr(t[0], t[0]); */              /*  138: 1a0111ea397fe69a4b1ba7b64340 */\
+/* sqr(t[0], t[0]); */              /*  139: 340223d472ffcd3496374f6c8680 */\
+sqr_n_mul(t[0], t[0], 7, t[11]);    /*  140: 340223d472ffcd3496374f6c8697 */\
+/* sqr(t[0], t[0]); */              /*  141: 680447a8e5ff9a692c6e9ed90d2e */\
+/* sqr(t[0], t[0]); */              /*  142: d0088f51cbff34d258dd3db21a5c */\
+/* sqr(t[0], t[0]); */              /*  143: 1a0111ea397fe69a4b1ba7b6434b8 */\
+/* sqr(t[0], t[0]); */              /*  144: 340223d472ffcd3496374f6c86970 */\
+/* sqr(t[0], t[0]); */              /*  145: 680447a8e5ff9a692c6e9ed90d2e0 */\
+sqr_n_mul(t[0], t[0], 5, t[12]);    /*  146: 680447a8e5ff9a692c6e9ed90d2eb */\
+/* sqr(t[0], t[0]); */              /*  147: d0088f51cbff34d258dd3db21a5d6 */\
+/* sqr(t[0], t[0]); */              /*  148: 1a0111ea397fe69a4b1ba7b6434bac */\
+/* sqr(t[0], t[0]); */              /*  149: 340223d472ffcd3496374f6c869758 */\
+/* sqr(t[0], t[0]); */              /*  150: 680447a8e5ff9a692c6e9ed90d2eb0 */\
+/* sqr(t[0], t[0]); */              /*  151: d0088f51cbff34d258dd3db21a5d60 */\
+/* sqr(t[0], t[0]); */              /*  152: 1a0111ea397fe69a4b1ba7b6434bac0 */\
+sqr_n_mul(t[0], t[0], 6, t[3]);     /*  153: 1a0111ea397fe69a4b1ba7b6434bacd */\
+/* sqr(t[0], t[0]); */              /*  154: 340223d472ffcd3496374f6c869759a */\
+/* sqr(t[0], t[0]); */              /*  155: 680447a8e5ff9a692c6e9ed90d2eb34 */\
+/* sqr(t[0], t[0]); */              /*  156: d0088f51cbff34d258dd3db21a5d668 */\
+/* sqr(t[0], t[0]); */              /*  157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\
+/* sqr(t[0], t[0]); */              /*  158: 340223d472ffcd3496374f6c869759a0 */\
+/* sqr(t[0], t[0]); */              /*  159: 680447a8e5ff9a692c6e9ed90d2eb340 */\
+sqr_n_mul(t[0], t[0], 6, t[5]);     /*  160: 680447a8e5ff9a692c6e9ed90d2eb35d */\
+/* sqr(t[0], t[0]); */              /*  161: d0088f51cbff34d258dd3db21a5d66ba */\
+/* sqr(t[0], t[0]); */              /*  162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\
+/* sqr(t[0], t[0]); */              /*  163: 340223d472ffcd3496374f6c869759ae8 */\
+/* sqr(t[0], t[0]); */              /*  164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\
+sqr_n_mul(t[0], t[0], 4, t[9]);     /*  165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\
+/* sqr(t[0], t[0]); */              /*  166: d0088f51cbff34d258dd3db21a5d66bb2 */\
+/* sqr(t[0], t[0]); */              /*  167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\
+/* sqr(t[0], t[0]); */              /*  168: 340223d472ffcd3496374f6c869759aec8 */\
+/* sqr(t[0], t[0]); */              /*  169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\
+/* sqr(t[0], t[0]); */              /*  170: d0088f51cbff34d258dd3db21a5d66bb20 */\
+/* sqr(t[0], t[0]); */              /*  171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\
+/* sqr(t[0], t[0]); */              /*  172: 340223d472ffcd3496374f6c869759aec80 */\
+/* sqr(t[0], t[0]); */              /*  173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\
+sqr_n_mul(t[0], t[0], 8, t[5]);     /*  174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\
+/* sqr(t[0], t[0]); */              /*  175: d0088f51cbff34d258dd3db21a5d66bb23a */\
+/* sqr(t[0], t[0]); */              /*  176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\
+/* sqr(t[0], t[0]); */              /*  177: 340223d472ffcd3496374f6c869759aec8e8 */\
+/* sqr(t[0], t[0]); */              /*  178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\
+sqr_n_mul(t[0], t[0], 4, t[3]);     /*  179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\
+/* sqr(t[0], t[0]); */              /*  180: d0088f51cbff34d258dd3db21a5d66bb23ba */\
+/* sqr(t[0], t[0]); */              /*  181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\
+/* sqr(t[0], t[0]); */              /*  182: 340223d472ffcd3496374f6c869759aec8ee8 */\
+/* sqr(t[0], t[0]); */              /*  183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\
+/* sqr(t[0], t[0]); */              /*  184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\
+/* sqr(t[0], t[0]); */              /*  185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\
+/* sqr(t[0], t[0]); */              /*  186: 340223d472ffcd3496374f6c869759aec8ee80 */\
+sqr_n_mul(t[0], t[0], 7, t[11]);    /*  187: 340223d472ffcd3496374f6c869759aec8ee97 */\
+/* sqr(t[0], t[0]); */              /*  188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\
+/* sqr(t[0], t[0]); */              /*  189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\
+/* sqr(t[0], t[0]); */              /*  190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\
+/* sqr(t[0], t[0]); */              /*  191: 340223d472ffcd3496374f6c869759aec8ee970 */\
+/* sqr(t[0], t[0]); */              /*  192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\
+/* sqr(t[0], t[0]); */              /*  193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\
+/* sqr(t[0], t[0]); */              /*  194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\
+/* sqr(t[0], t[0]); */              /*  195: 340223d472ffcd3496374f6c869759aec8ee9700 */\
+/* sqr(t[0], t[0]); */              /*  196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\
+sqr_n_mul(t[0], t[0], 9, t[10]);    /*  197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\
+/* sqr(t[0], t[0]); */              /*  198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\
+/* sqr(t[0], t[0]); */              /*  199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\
+sqr_n_mul(t[0], t[0], 2, t[8]);     /*  200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\
+/* sqr(t[0], t[0]); */              /*  201: 340223d472ffcd3496374f6c869759aec8ee9709e */\
+/* sqr(t[0], t[0]); */              /*  202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\
+/* sqr(t[0], t[0]); */              /*  203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\
+/* sqr(t[0], t[0]); */              /*  204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\
+/* sqr(t[0], t[0]); */              /*  205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\
+sqr_n_mul(t[0], t[0], 5, t[6]);     /*  206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\
+/* sqr(t[0], t[0]); */              /*  207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\
+/* sqr(t[0], t[0]); */              /*  208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\
+/* sqr(t[0], t[0]); */              /*  209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\
+/* sqr(t[0], t[0]); */              /*  210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\
+/* sqr(t[0], t[0]); */              /*  211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\
+/* sqr(t[0], t[0]); */              /*  212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\
+/* sqr(t[0], t[0]); */              /*  213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\
+sqr_n_mul(t[0], t[0], 7, t[1]);     /*  214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\
+/* sqr(t[0], t[0]); */              /*  215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\
+/* sqr(t[0], t[0]); */              /*  216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\
+/* sqr(t[0], t[0]); */              /*  217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\
+/* sqr(t[0], t[0]); */              /*  218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\
+/* sqr(t[0], t[0]); */              /*  219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\
+/* sqr(t[0], t[0]); */              /*  220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\
+/* sqr(t[0], t[0]); */              /*  221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\
+sqr_n_mul(t[0], t[0], 7, t[9]);     /*  222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\
+/* sqr(t[0], t[0]); */              /*  223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\
+/* sqr(t[0], t[0]); */              /*  224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\
+/* sqr(t[0], t[0]); */              /*  225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\
+/* sqr(t[0], t[0]); */              /*  226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\
+/* sqr(t[0], t[0]); */              /*  227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\
+/* sqr(t[0], t[0]); */              /*  228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\
+sqr_n_mul(t[0], t[0], 6, t[11]);    /*  229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\
+/* sqr(t[0], t[0]); */              /*  230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\
+/* sqr(t[0], t[0]); */              /*  231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\
+/* sqr(t[0], t[0]); */              /*  232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\
+/* sqr(t[0], t[0]); */              /*  233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\
+/* sqr(t[0], t[0]); */              /*  234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\
+/* sqr(t[0], t[0]); */              /*  236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\
+/* sqr(t[0], t[0]); */              /*  237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\
+/* sqr(t[0], t[0]); */              /*  238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\
+/* sqr(t[0], t[0]); */              /*  239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\
+/* sqr(t[0], t[0]); */              /*  240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\
+sqr_n_mul(t[0], t[0], 5, t[10]);    /*  241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\
+/* sqr(t[0], t[0]); */              /*  242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\
+/* sqr(t[0], t[0]); */              /*  243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\
+/* sqr(t[0], t[0]); */              /*  244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\
+/* sqr(t[0], t[0]); */              /*  245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\
+/* sqr(t[0], t[0]); */              /*  246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\
+sqr_n_mul(t[0], t[0], 5, t[10]);    /*  247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\
+/* sqr(t[0], t[0]); */              /*  248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\
+/* sqr(t[0], t[0]); */              /*  249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\
+/* sqr(t[0], t[0]); */              /*  250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\
+/* sqr(t[0], t[0]); */              /*  251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\
+/* sqr(t[0], t[0]); */              /*  252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\
+/* sqr(t[0], t[0]); */              /*  253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\
+/* sqr(t[0], t[0]); */              /*  254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\
+/* sqr(t[0], t[0]); */              /*  255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\
+sqr_n_mul(t[0], t[0], 8, t[3]);     /*  256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\
+/* sqr(t[0], t[0]); */              /*  257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\
+/* sqr(t[0], t[0]); */              /*  258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\
+/* sqr(t[0], t[0]); */              /*  259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\
+/* sqr(t[0], t[0]); */              /*  260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\
+/* sqr(t[0], t[0]); */              /*  261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\
+/* sqr(t[0], t[0]); */              /*  262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\
+/* sqr(t[0], t[0]); */              /*  263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\
+sqr_n_mul(t[0], t[0], 7, t[2]);     /*  264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\
+/* sqr(t[0], t[0]); */              /*  265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\
+/* sqr(t[0], t[0]); */              /*  266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\
+/* sqr(t[0], t[0]); */              /*  267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\
+/* sqr(t[0], t[0]); */              /*  268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\
+/* sqr(t[0], t[0]); */              /*  269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\
+/* sqr(t[0], t[0]); */              /*  270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\
+/* sqr(t[0], t[0]); */              /*  271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\
+/* sqr(t[0], t[0]); */              /*  272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\
+/* sqr(t[0], t[0]); */              /*  273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\
+sqr_n_mul(t[0], t[0], 9, t[7]);     /*  274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\
+/* sqr(t[0], t[0]); */              /*  275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\
+/* sqr(t[0], t[0]); */              /*  276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\
+/* sqr(t[0], t[0]); */              /*  277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\
+/* sqr(t[0], t[0]); */              /*  278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\
+/* sqr(t[0], t[0]); */              /*  279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\
+sqr_n_mul(t[0], t[0], 5, t[3]);     /*  280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\
+/* sqr(t[0], t[0]); */              /*  281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\
+/* sqr(t[0], t[0]); */              /*  282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\
+/* sqr(t[0], t[0]); */              /*  283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\
+sqr_n_mul(t[0], t[0], 3, t[8]);     /*  284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\
+/* sqr(t[0], t[0]); */              /*  285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\
+/* sqr(t[0], t[0]); */              /*  286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\
+/* sqr(t[0], t[0]); */              /*  287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\
+/* sqr(t[0], t[0]); */              /*  288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\
+/* sqr(t[0], t[0]); */              /*  289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\
+/* sqr(t[0], t[0]); */              /*  290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\
+/* sqr(t[0], t[0]); */              /*  291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\
+/* sqr(t[0], t[0]); */              /*  292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\
+sqr_n_mul(t[0], t[0], 8, t[7]);     /*  293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\
+/* sqr(t[0], t[0]); */              /*  294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\
+/* sqr(t[0], t[0]); */              /*  295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\
+/* sqr(t[0], t[0]); */              /*  296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\
+sqr_n_mul(t[0], t[0], 3, t[8]);     /*  297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\
+/* sqr(t[0], t[0]); */              /*  298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\
+/* sqr(t[0], t[0]); */              /*  299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\
+/* sqr(t[0], t[0]); */              /*  300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\
+/* sqr(t[0], t[0]); */              /*  301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\
+/* sqr(t[0], t[0]); */              /*  302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\
+/* sqr(t[0], t[0]); */              /*  303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\
+/* sqr(t[0], t[0]); */              /*  304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\
+sqr_n_mul(t[0], t[0], 7, t[9]);     /*  305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\
+/* sqr(t[0], t[0]); */              /*  306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\
+/* sqr(t[0], t[0]); */              /*  307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\
+/* sqr(t[0], t[0]); */              /*  308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\
+/* sqr(t[0], t[0]); */              /*  309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\
+/* sqr(t[0], t[0]); */              /*  310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\
+/* sqr(t[0], t[0]); */              /*  311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\
+/* sqr(t[0], t[0]); */              /*  312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\
+/* sqr(t[0], t[0]); */              /*  313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\
+/* sqr(t[0], t[0]); */              /*  314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\
+sqr_n_mul(t[0], t[0], 9, t[7]);     /*  315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\
+/* sqr(t[0], t[0]); */              /*  316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\
+/* sqr(t[0], t[0]); */              /*  317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\
+/* sqr(t[0], t[0]); */              /*  318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\
+/* sqr(t[0], t[0]); */              /*  319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\
+/* sqr(t[0], t[0]); */              /*  320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\
+/* sqr(t[0], t[0]); */              /*  321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\
+sqr_n_mul(t[0], t[0], 6, t[2]);     /*  322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\
+/* sqr(t[0], t[0]); */              /*  323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\
+/* sqr(t[0], t[0]); */              /*  324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\
+/* sqr(t[0], t[0]); */              /*  325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\
+/* sqr(t[0], t[0]); */              /*  326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\
+/* sqr(t[0], t[0]); */              /*  327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\
+/* sqr(t[0], t[0]); */              /*  328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\
+sqr_n_mul(t[0], t[0], 6, t[4]);     /*  329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\
+/* sqr(t[0], t[0]); */              /*  330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\
+/* sqr(t[0], t[0]); */              /*  331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\
+/* sqr(t[0], t[0]); */              /*  332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\
+/* sqr(t[0], t[0]); */              /*  333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\
+/* sqr(t[0], t[0]); */              /*  334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\
+/* sqr(t[0], t[0]); */              /*  336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\
+/* sqr(t[0], t[0]); */              /*  337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\
+/* sqr(t[0], t[0]); */              /*  338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\
+/* sqr(t[0], t[0]); */              /*  339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\
+/* sqr(t[0], t[0]); */              /*  340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\
+/* sqr(t[0], t[0]); */              /*  342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\
+/* sqr(t[0], t[0]); */              /*  343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\
+/* sqr(t[0], t[0]); */              /*  344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\
+/* sqr(t[0], t[0]); */              /*  345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\
+sqr_n_mul(t[0], t[0], 4, t[3]);     /*  346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\
+/* sqr(t[0], t[0]); */              /*  347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\
+/* sqr(t[0], t[0]); */              /*  348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\
+/* sqr(t[0], t[0]); */              /*  349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\
+sqr_n_mul(t[0], t[0], 3, t[8]);     /*  350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\
+/* sqr(t[0], t[0]); */              /*  351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\
+/* sqr(t[0], t[0]); */              /*  352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\
+/* sqr(t[0], t[0]); */              /*  353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\
+/* sqr(t[0], t[0]); */              /*  354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\
+/* sqr(t[0], t[0]); */              /*  355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\
+/* sqr(t[0], t[0]); */              /*  356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\
+/* sqr(t[0], t[0]); */              /*  357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\
+/* sqr(t[0], t[0]); */              /*  358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\
+sqr_n_mul(t[0], t[0], 8, t[2]);     /*  359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\
+/* sqr(t[0], t[0]); */              /*  360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\
+/* sqr(t[0], t[0]); */              /*  361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\
+/* sqr(t[0], t[0]); */              /*  362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\
+/* sqr(t[0], t[0]); */              /*  363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\
+/* sqr(t[0], t[0]); */              /*  364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\
+/* sqr(t[0], t[0]); */              /*  365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\
+/* sqr(t[0], t[0]); */              /*  366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\
+sqr_n_mul(t[0], t[0], 7, t[4]);     /*  367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\
+/* sqr(t[0], t[0]); */              /*  368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\
+/* sqr(t[0], t[0]); */              /*  369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\
+/* sqr(t[0], t[0]); */              /*  370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\
+/* sqr(t[0], t[0]); */              /*  371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\
+/* sqr(t[0], t[0]); */              /*  372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\
+/* sqr(t[0], t[0]); */              /*  374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\
+/* sqr(t[0], t[0]); */              /*  375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\
+/* sqr(t[0], t[0]); */              /*  376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\
+/* sqr(t[0], t[0]); */              /*  377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\
+/* sqr(t[0], t[0]); */              /*  378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\
+/* sqr(t[0], t[0]); */              /*  380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\
+/* sqr(t[0], t[0]); */              /*  381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\
+/* sqr(t[0], t[0]); */              /*  382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\
+/* sqr(t[0], t[0]); */              /*  383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\
+sqr_n_mul(t[0], t[0], 4, t[7]);     /*  384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\
+/* sqr(t[0], t[0]); */              /*  385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\
+/* sqr(t[0], t[0]); */              /*  386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\
+/* sqr(t[0], t[0]); */              /*  387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\
+/* sqr(t[0], t[0]); */              /*  388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\
+sqr_n_mul(t[0], t[0], 4, t[6]);     /*  389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\
+/* sqr(t[0], t[0]); */              /*  390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\
+/* sqr(t[0], t[0]); */              /*  391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\
+/* sqr(t[0], t[0]); */              /*  392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\
+/* sqr(t[0], t[0]); */              /*  393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\
+/* sqr(t[0], t[0]); */              /*  394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\
+/* sqr(t[0], t[0]); */              /*  395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\
+/* sqr(t[0], t[0]); */              /*  396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\
+sqr_n_mul(t[0], t[0], 7, t[4]);     /*  397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\
+/* sqr(t[0], t[0]); */              /*  398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\
+/* sqr(t[0], t[0]); */              /*  399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\
+/* sqr(t[0], t[0]); */              /*  400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\
+/* sqr(t[0], t[0]); */              /*  401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\
+/* sqr(t[0], t[0]); */              /*  402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\
+sqr_n_mul(t[0], t[0], 5, t[5]);     /*  403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\
+/* sqr(t[0], t[0]); */              /*  404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\
+/* sqr(t[0], t[0]); */              /*  405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\
+/* sqr(t[0], t[0]); */              /*  406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\
+/* sqr(t[0], t[0]); */              /*  407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\
+/* sqr(t[0], t[0]); */              /*  408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\
+/* sqr(t[0], t[0]); */              /*  410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\
+/* sqr(t[0], t[0]); */              /*  411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\
+/* sqr(t[0], t[0]); */              /*  412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\
+/* sqr(t[0], t[0]); */              /*  413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\
+/* sqr(t[0], t[0]); */              /*  414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\
+/* sqr(t[0], t[0]); */              /*  416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\
+/* sqr(t[0], t[0]); */              /*  417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\
+/* sqr(t[0], t[0]); */              /*  418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\
+/* sqr(t[0], t[0]); */              /*  419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\
+/* sqr(t[0], t[0]); */              /*  420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\
+/* sqr(t[0], t[0]); */              /*  422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\
+/* sqr(t[0], t[0]); */              /*  423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\
+/* sqr(t[0], t[0]); */              /*  424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\
+/* sqr(t[0], t[0]); */              /*  425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\
+/* sqr(t[0], t[0]); */              /*  426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\
+/* sqr(t[0], t[0]); */              /*  428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\
+/* sqr(t[0], t[0]); */              /*  429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\
+/* sqr(t[0], t[0]); */              /*  430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\
+/* sqr(t[0], t[0]); */              /*  431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\
+/* sqr(t[0], t[0]); */              /*  432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\
+/* sqr(t[0], t[0]); */              /*  434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\
+/* sqr(t[0], t[0]); */              /*  435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\
+/* sqr(t[0], t[0]); */              /*  436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\
+/* sqr(t[0], t[0]); */              /*  437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\
+/* sqr(t[0], t[0]); */              /*  438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\
+sqr_n_mul(t[0], t[0], 5, t[4]);     /*  439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\
+/* sqr(t[0], t[0]); */              /*  440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\
+/* sqr(t[0], t[0]); */              /*  441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\
+/* sqr(t[0], t[0]); */              /*  442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\
+/* sqr(t[0], t[0]); */              /*  443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\
+sqr_n_mul(t[0], t[0], 4, t[3]);     /*  444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\
+/* sqr(t[0], t[0]); */              /*  445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\
+/* sqr(t[0], t[0]); */              /*  446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\
+/* sqr(t[0], t[0]); */              /*  447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\
+/* sqr(t[0], t[0]); */              /*  448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\
+/* sqr(t[0], t[0]); */              /*  449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\
+/* sqr(t[0], t[0]); */              /*  450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\
+sqr_n_mul(t[0], t[0], 6, t[2]);     /*  451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\
+/* sqr(t[0], t[0]); */              /*  452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\
+/* sqr(t[0], t[0]); */              /*  453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\
+/* sqr(t[0], t[0]); */              /*  454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\
+/* sqr(t[0], t[0]); */              /*  455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\
+sqr_n_mul(t[0], t[0], 4, t[1]);     /*  456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\
+sqr(out, t[0]);                     /*  457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\
+} while(0)
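The macro that ends here is a fixed addition chain: each sqr_n_mul() step squares the accumulator a set number of times and then multiplies in a precomputed power, and the hex comments track the running exponent, which finishes at the (p-3)/4 value consumed by recip_sqrt_fp_3mod4() in sqrt.c below. A minimal sketch of the same square-and-multiply idea on plain 64-bit integers with a toy modulus, not BLST's field arithmetic (assumes a compiler with __uint128_t, such as gcc or clang):

/* Illustrative only: mirrors the sqr_n_mul() pattern above on uint64_t.
 * Not part of BLST and not constant-time for secret exponents. */
#include <stdint.h>
#include <stdio.h>

static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{   return (uint64_t)(((__uint128_t)a * b) % m);   }

/* square |count| times, then multiply by |b| - same shape as sqr_n_mul() */
static uint64_t sqr_n_mul_u64(uint64_t a, size_t count, uint64_t b, uint64_t m)
{
    while (count--)
        a = mulmod(a, a, m);
    return mulmod(a, b, m);
}

int main(void)
{
    const uint64_t p = 1000003;               /* toy prime standing in for the field modulus */
    uint64_t x = 123456 % p;
    /* compute x^13 = x^0b1101 with the chain ((x^2 * x)^2)^2 * x */
    uint64_t t = sqr_n_mul_u64(x, 1, x, p);   /* x^3  */
    t = sqr_n_mul_u64(t, 2, x, p);            /* x^13 */
    printf("x^13 mod p = %llu\n", (unsigned long long)t);
    return 0;
}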
diff --git a/blst/sqrt.c b/blst/sqrt.c
new file mode 100644
index 0000000..cf149fd
--- /dev/null
+++ b/blst/sqrt.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "fields.h"
+
+#ifdef __OPTIMIZE_SIZE__
+static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp)
+{
+    static const byte BLS_12_381_P_minus_3_div_4[] = {
+        TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff),
+        TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af),
+        TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6)
+    };
+
+    exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0);
+}
+#else
+# if 1
+/*
+ * "383"-bit variant omits full reductions at the ends of squarings,
+ * which results in up to ~15% improvement. [One can improve further
+ * by omitting full reductions even after multiplications and
+ * performing final reduction at the very end of the chain.]
+ */
+static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count,
+                                const vec384 b)
+{   sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b);   }
+# else
+static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count,
+                         const vec384 b)
+{
+    while(count--) {
+        sqr_fp(out, a);
+        a = out;
+    }
+    mul_fp(out, out, b);
+}
+# endif
+
+# define sqr(ret,a)		sqr_fp(ret,a)
+# define mul(ret,a,b)		mul_fp(ret,a,b)
+# define sqr_n_mul(ret,a,n,b)	sqr_n_mul_fp(ret,a,n,b)
+
+# include "sqrt-addchain.h"
+static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp)
+{
+    RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384);
+}
+# undef RECIP_SQRT_MOD_BLS12_381_P
+
+# undef sqr_n_mul
+# undef sqr
+# undef mul
+#endif
+
+static bool_t recip_sqrt_fp(vec384 out, const vec384 inp)
+{
+    vec384 t0, t1;
+    bool_t ret;
+
+    recip_sqrt_fp_3mod4(t0, inp);
+
+    mul_fp(t1, t0, inp);
+    sqr_fp(t1, t1);
+    ret = vec_is_equal(t1, inp, sizeof(t1));
+    vec_copy(out, t0, sizeof(t0));
+
+    return ret;
+}
+
+static bool_t sqrt_fp(vec384 out, const vec384 inp)
+{
+    vec384 t0, t1;
+    bool_t ret;
+
+    recip_sqrt_fp_3mod4(t0, inp);
+
+    mul_fp(t0, t0, inp);
+    sqr_fp(t1, t0);
+    ret = vec_is_equal(t1, inp, sizeof(t1));
+    vec_copy(out, t0, sizeof(t0));
+
+    return ret;
+}
+
+int blst_fp_sqrt(vec384 out, const vec384 inp)
+{   return (int)sqrt_fp(out, inp);   }
+
+int blst_fp_is_square(const vec384 inp)
+{
+    return (int)ct_is_square_mod_384(inp, BLS12_381_P);
+}
+
+static bool_t sqrt_align_fp2(vec384x out, const vec384x ret,
+                             const vec384x sqrt, const vec384x inp)
+{
+    static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } };
+    static const vec384x sqrt_sqrt_minus_1 = {
+      /*
+       * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)",
+       * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1,
+       * but it pivots into "complex" plane nevertheless...
+       */
+      { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183),
+        TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18),
+        TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) },
+      { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
+        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
+        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }
+    };
+    static const vec384x sqrt_minus_sqrt_minus_1 = {
+      { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
+        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
+        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) },
+      { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c),
+        TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7),
+        TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }
+    };
+    vec384x coeff, t0, t1;
+    bool_t is_sqrt, flag;
+
+    /*
+     * Instead of multiple trial squarings we can perform just one
+     * and see if the result is "rotated by multiple of 90°" in
+     * relation to |inp|, and "rotate" |ret| accordingly.
+     */
+    sqr_fp2(t0, sqrt);
+    /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */
+
+    /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */
+    sub_fp2(t1, t0, inp);
+    is_sqrt = vec_is_zero(t1, sizeof(t1));
+    vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff));
+
+    /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */
+    add_fp2(t1, t0, inp);
+    vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff),
+               flag = vec_is_zero(t1, sizeof(t1)));
+    is_sqrt |= flag;
+
+    /* 2ab - (a^2-b^2)*i == |inp| ? "rotate |ret| by 135°" */
+    sub_fp(t1[0], t0[0], inp[1]);
+    add_fp(t1[1], t0[1], inp[0]);
+    vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff),
+               flag = vec_is_zero(t1, sizeof(t1)));
+    is_sqrt |= flag;
+
+    /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */
+    add_fp(t1[0], t0[0], inp[1]);
+    sub_fp(t1[1], t0[1], inp[0]);
+    vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff),
+               flag = vec_is_zero(t1, sizeof(t1)));
+    is_sqrt |= flag;
+
+    /* actual "rotation" */
+    mul_fp2(out, ret, coeff);
+
+    return is_sqrt;
+}
+
+/*
+ * |inp| = a + b*i
+ */
+static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp,
+                                          const vec384x recip_ZZZ,
+                                          const vec384x magic_ZZZ)
+{
+    vec384 aa, bb, cc;
+    vec384x inp_;
+    bool_t is_sqrt;
+
+    sqr_fp(aa, inp[0]);
+    sqr_fp(bb, inp[1]);
+    add_fp(aa, aa, bb);
+
+    is_sqrt = recip_sqrt_fp(cc, aa);  /* 1/sqrt(a²+b²)                    */
+
+    /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ...    */
+    mul_fp2(inp_, inp, recip_ZZZ);
+    /* ... and adjust |aa| and |cc| accordingly                           */
+    {
+        vec384 za, zc;
+
+        mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²)                   */
+        mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4)         */
+        vec_select(aa, aa, za, sizeof(aa), is_sqrt);
+        vec_select(cc, cc, zc, sizeof(cc), is_sqrt);
+    }
+    vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt);
+
+    mul_fp(aa, aa, cc);               /* sqrt(a²+b²)                      */
+
+    sub_fp(bb, inp_[0], aa);
+    add_fp(aa, inp_[0], aa);
+    vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa)));
+    div_by_2_fp(aa, aa);              /* (a ± sqrt(a²+b²))/2              */
+
+    /* if it says "no sqrt," final "align" will find right one...         */
+    (void)recip_sqrt_fp(out[0], aa);  /* 1/sqrt((a ± sqrt(a²+b²))/2)      */
+
+    div_by_2_fp(out[1], inp_[1]);
+    mul_fp(out[1], out[1], out[0]);   /* b/(2*sqrt((a ± sqrt(a²+b²))/2))  */
+    mul_fp(out[0], out[0], aa);       /* sqrt((a ± sqrt(a²+b²))/2)        */
+
+    /* bound to succeed                                                   */
+    (void)sqrt_align_fp2(out, out, out, inp_);
+
+    mul_fp(out[0], out[0], cc);       /* inverse the result               */
+    mul_fp(out[1], out[1], cc);
+    neg_fp(out[1], out[1]);
+
+    return is_sqrt;
+}
+
+static bool_t sqrt_fp2(vec384x out, const vec384x inp)
+{
+    vec384x ret;
+    vec384 aa, bb;
+
+    sqr_fp(aa, inp[0]);
+    sqr_fp(bb, inp[1]);
+    add_fp(aa, aa, bb);
+
+    /* don't pay attention to return value, final "align" will tell...    */
+    (void)sqrt_fp(aa, aa);            /* sqrt(a²+b²)                      */
+
+    sub_fp(bb, inp[0], aa);
+    add_fp(aa, inp[0], aa);
+    vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa)));
+    div_by_2_fp(aa, aa);              /* (a ± sqrt(a²+b²))/2              */
+
+    /* if it says "no sqrt," final "align" will find right one...         */
+    (void)recip_sqrt_fp(ret[0], aa);  /* 1/sqrt((a ± sqrt(a²+b²))/2)      */
+
+    div_by_2_fp(ret[1], inp[1]);
+    mul_fp(ret[1], ret[1], ret[0]);   /* b/(2*sqrt((a ± sqrt(a²+b²))/2))  */
+    mul_fp(ret[0], ret[0], aa);       /* sqrt((a ± sqrt(a²+b²))/2)        */
+
+    /*
+     * Now see if |ret| is or can be made sqrt(|inp|)...
+     */
+
+    return sqrt_align_fp2(out, ret, ret, inp);
+}
+
+int blst_fp2_sqrt(vec384x out, const vec384x inp)
+{   return (int)sqrt_fp2(out, inp);   }
+
+int blst_fp2_is_square(const vec384x inp)
+{
+    vec384 aa, bb;
+
+    sqr_fp(aa, inp[0]);
+    sqr_fp(bb, inp[1]);
+    add_fp(aa, aa, bb);
+
+    return (int)ct_is_square_mod_384(aa, BLS12_381_P);
+}
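sqrt.c relies on the BLS12-381 base-field prime satisfying p ≡ 3 (mod 4), so a candidate square root of a is a^((p+1)/4) and a reciprocal square root is a^((p-3)/4); the candidate is then squared and compared with the input, exactly as sqrt_fp() and recip_sqrt_fp() do. A minimal sketch of that check on a small prime, without BLST's vec384 types (assumes __uint128_t support):

/* Illustrative only: the p = 3 (mod 4) square-root trick on a toy prime. */
#include <stdint.h>
#include <stdio.h>

static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{   return (uint64_t)(((__uint128_t)a * b) % m);   }

static uint64_t powmod(uint64_t a, uint64_t e, uint64_t m)
{
    uint64_t r = 1;
    for (; e; e >>= 1, a = mulmod(a, a, m))
        if (e & 1) r = mulmod(r, a, m);
    return r;
}

int main(void)
{
    const uint64_t p = 1000003;                 /* toy prime, p % 4 == 3 */
    uint64_t a = (uint64_t)1234 * 1234 % p;     /* known quadratic residue */

    uint64_t cand = powmod(a, (p + 1) / 4, p);  /* sqrt candidate, like sqrt_fp()   */
    int ok = mulmod(cand, cand, p) == a;        /* same check as the vec_is_equal() */

    printf("sqrt(%llu) = %llu, verified: %d\n",
           (unsigned long long)a, (unsigned long long)cand, ok);
    return 0;
}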
diff --git a/blst/vect.c b/blst/vect.c
new file mode 100644
index 0000000..1834a48
--- /dev/null
+++ b/blst/vect.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "vect.h"
+
+#ifdef __BLST_NO_ASM__
+# include "no_asm.h"
+#endif
+
+/*
+ * Following are some reference C implementations to assist new
+ * assembly module development, as starting-point stand-ins and for
+ * cross-checking. In order to "polyfill" a specific subroutine,
+ * redefine it on the compiler command line, e.g.
+ * -Dmul_mont_384x=_mul_mont_384x.
+ */
+
+#ifdef lshift_mod_384
+inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n,
+                           const vec384 mod)
+{
+    while(n--)
+        add_mod_384(ret, a, a, mod), a = ret;
+}
+#endif
+
+#ifdef mul_by_8_mod_384
+inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod)
+{   lshift_mod_384(ret, a, 3, mod);   }
+#endif
+
+#ifdef mul_by_3_mod_384
+inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod)
+{
+    vec384 t;
+
+    add_mod_384(t, a, a, mod);
+    add_mod_384(ret, t, a, mod);
+}
+#endif
+
+#ifdef mul_by_3_mod_384x
+inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod)
+{
+    mul_by_3_mod_384(ret[0], a[0], mod);
+    mul_by_3_mod_384(ret[1], a[1], mod);
+}
+#endif
+
+#ifdef mul_by_8_mod_384x
+inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod)
+{
+    mul_by_8_mod_384(ret[0], a[0], mod);
+    mul_by_8_mod_384(ret[1], a[1], mod);
+}
+#endif
+
+#ifdef mul_by_1_plus_i_mod_384x
+inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a,
+                                     const vec384 mod)
+{
+    vec384 t;
+
+    add_mod_384(t, a[0], a[1], mod);
+    sub_mod_384(ret[0], a[0], a[1], mod);
+    vec_copy(ret[1], t, sizeof(t));
+}
+#endif
+
+#ifdef add_mod_384x
+inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b,
+                         const vec384 mod)
+{
+    add_mod_384(ret[0], a[0], b[0], mod);
+    add_mod_384(ret[1], a[1], b[1], mod);
+}
+#endif
+
+#ifdef sub_mod_384x
+inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b,
+                         const vec384 mod)
+{
+    sub_mod_384(ret[0], a[0], b[0], mod);
+    sub_mod_384(ret[1], a[1], b[1], mod);
+}
+#endif
+
+#ifdef lshift_mod_384x
+inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n,
+                            const vec384 mod)
+{
+    lshift_mod_384(ret[0], a[0], n, mod);
+    lshift_mod_384(ret[1], a[1], n, mod);
+}
+#endif
+
+#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__))
+void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
+                   const vec384 mod, limb_t n0)
+{
+    vec768 t0, t1, t2;
+    vec384 aa, bb;
+
+    mul_384(t0, a[0], b[0]);
+    mul_384(t1, a[1], b[1]);
+
+    add_mod_384(aa, a[0], a[1], mod);
+    add_mod_384(bb, b[0], b[1], mod);
+    mul_384(t2, aa, bb);
+    sub_mod_384x384(t2, t2, t0, mod);
+    sub_mod_384x384(t2, t2, t1, mod);
+
+    sub_mod_384x384(t0, t0, t1, mod);
+
+    redc_mont_384(ret[0], t0, mod, n0);
+    redc_mont_384(ret[1], t2, mod, n0);
+}
+#endif
+
+#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__))
+void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0)
+{
+    vec384 t0, t1;
+
+    add_mod_384(t0, a[0], a[1], mod);
+    sub_mod_384(t1, a[0], a[1], mod);
+
+    mul_mont_384(ret[1], a[0], a[1], mod, n0);
+    add_mod_384(ret[1], ret[1], ret[1], mod);
+
+    mul_mont_384(ret[0], t0, t1, mod, n0);
+}
+#endif
+
+limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi);
+limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient);
+limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient);
+
+/*
+ * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place.
+ */
+static void div_by_zz(limb_t val[])
+{
+    static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000),
+                                 TO_LIMB_T(0xac45a4010001a402) };
+    size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]);
+    limb_t d_lo, d_hi;
+
+    d_lo = zz[zz_len - 2];
+    d_hi = zz[zz_len - 1];
+    for (loop = zz_len, zz_len--; loop--;) {
+        limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi);
+        (void)quot_rem_128(val + loop, zz, q);
+    }
+    /* remainder is in low half of val[], quotient is in high */
+}
+
+/*
+ * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place.
+ */
+static void div_by_z(limb_t val[])
+{
+    static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) };
+    size_t loop, z_len = sizeof(z)/sizeof(z[0]);
+    limb_t d_lo, d_hi;
+
+    d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2];
+    d_hi = z[z_len - 1];
+    for (loop = z_len, z_len--; loop--;) {
+        limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi);
+        (void)quot_rem_64(val + loop, z, q);
+    }
+    /* remainder is in low half of val[], quotient is in high */
+}
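The reference mul_mont_384x() above multiplies Fp2 elements with only three full-width multiplications: the Karatsuba-style identity (a0+a1)(b0+b1) - a0*b0 - a1*b1 yields the cross term, and a0*b0 - a1*b1 is the real part because i^2 = -1. A minimal sketch of the same identity on ordinary integers, with no Montgomery reduction and deliberately small inputs:

/* Illustrative only: three-multiplication complex product, as in mul_mont_384x(). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int64_t a0 = 7, a1 = 3;     /* a = a0 + a1*i */
    int64_t b0 = 5, b1 = 11;    /* b = b0 + b1*i */

    int64_t t0 = a0 * b0;                         /* mul_384(t0, a[0], b[0]) */
    int64_t t1 = a1 * b1;                         /* mul_384(t1, a[1], b[1]) */
    int64_t t2 = (a0 + a1) * (b0 + b1) - t0 - t1; /* cross terms a0*b1 + a1*b0 */

    int64_t re = t0 - t1;   /* i^2 == -1, so the real part is a0*b0 - a1*b1 */
    int64_t im = t2;

    printf("(%lld + %lldi)*(%lld + %lldi) = %lld + %lldi\n",
           (long long)a0, (long long)a1, (long long)b0, (long long)b1,
           (long long)re, (long long)im);
    return 0;
}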
diff --git a/blst/vect.h b/blst/vect.h
new file mode 100644
index 0000000..11b5836
--- /dev/null
+++ b/blst/vect.h
@@ -0,0 +1,483 @@
+/*
+ * Copyright Supranational LLC
+ * Licensed under the Apache License, Version 2.0, see LICENSE for details.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef __BLS12_381_ASM_VECT_H__
+#define __BLS12_381_ASM_VECT_H__
+
+#include <stddef.h>
+
+#if defined(__x86_64__) || defined(__aarch64__)
+/* These are available even in ILP32 flavours, but even then they are
+ * capable of performing 64-bit operations as efficiently as in *P64. */
+typedef unsigned long long limb_t;
+# define LIMB_T_BITS    64
+
+#elif defined(_WIN64)   /* Win64 is P64 */
+typedef unsigned __int64 limb_t;
+# define LIMB_T_BITS    64
+
+#elif defined(__BLST_NO_ASM__) || defined(__wasm64__)
+typedef unsigned int limb_t;
+# define LIMB_T_BITS    32
+# ifndef __BLST_NO_ASM__
+#  define __BLST_NO_ASM__
+# endif
+
+#else                   /* 32 bits on 32-bit platforms, 64 - on 64-bit */
+typedef unsigned long limb_t;
+#  ifdef _LP64
+#   define LIMB_T_BITS   64
+#  else
+#   define LIMB_T_BITS   32
+#   define __BLST_NO_ASM__
+#  endif
+#endif
+
+/*
+ * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because the pre-processor
+ * knows nothing about sizeof(anything)...
+ */
+#if LIMB_T_BITS == 64
+# define TO_LIMB_T(limb64)     limb64
+#else
+# define TO_LIMB_T(limb64)     (limb_t)limb64,(limb_t)(limb64>>32)
+#endif
+
+#define NLIMBS(bits)   (bits/LIMB_T_BITS)
+
+typedef limb_t vec256[NLIMBS(256)];
+typedef limb_t vec512[NLIMBS(512)];
+typedef limb_t vec384[NLIMBS(384)];
+typedef limb_t vec768[NLIMBS(768)];
+typedef vec384 vec384x[2];      /* 0 is "real" part, 1 is "imaginary" */
+
+typedef unsigned char byte;
+#define TO_BYTES(limb64)    (byte)limb64,(byte)(limb64>>8),\
+                            (byte)(limb64>>16),(byte)(limb64>>24),\
+                            (byte)(limb64>>32),(byte)(limb64>>40),\
+                            (byte)(limb64>>48),(byte)(limb64>>56)
+typedef byte pow256[256/8];
+
+/*
+ * Internal Boolean type, Boolean by value, hence safe to cast to or
+ * reinterpret as 'bool'.
+ */
+typedef limb_t bool_t;
+
+/*
+ * Assembly subroutines...
+ */
+#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__)
+# define mul_mont_sparse_256 mulx_mont_sparse_256
+# define sqr_mont_sparse_256 sqrx_mont_sparse_256
+# define from_mont_256 fromx_mont_256
+# define redc_mont_256 redcx_mont_256
+# define mul_mont_384 mulx_mont_384
+# define sqr_mont_384 sqrx_mont_384
+# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384
+# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383
+# define mul_384 mulx_384
+# define sqr_384 sqrx_384
+# define redc_mont_384 redcx_mont_384
+# define from_mont_384 fromx_mont_384
+# define sgn0_pty_mont_384 sgn0x_pty_mont_384
+# define sgn0_pty_mont_384x sgn0x_pty_mont_384x
+# define ct_inverse_mod_383 ctx_inverse_mod_383
+#elif defined(__BLST_NO_ASM__)
+# define ct_inverse_mod_383 ct_inverse_mod_384
+#endif
+
+void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b,
+                         const vec256 p, limb_t n0);
+void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0);
+void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0);
+void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0);
+
+void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p);
+void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p);
+void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p);
+void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p);
+void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p);
+void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p);
+bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p,
+                            const vec256 one);
+limb_t check_mod_256(const pow256 a, const vec256 p);
+limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b,
+                                       const vec256 p);
+limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b,
+                                       const vec256 p);
+
+void vec_prefetch(const void *ptr, size_t len);
+
+void mul_mont_384(vec384 ret, const vec384 a, const vec384 b,
+                  const vec384 p, limb_t n0);
+void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0);
+void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count,
+                        const vec384 p, limb_t n0, const vec384 b);
+void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
+                        const vec384 p, limb_t n0, const vec384 b);
+
+void mul_384(vec768 ret, const vec384 a, const vec384 b);
+void sqr_384(vec768 ret, const vec384 a);
+void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0);
+void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0);
+limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0);
+limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0);
+limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p);
+limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p);
+
+void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
+void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
+void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p);
+void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p);
+void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p);
+void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p);
+void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p);
+void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p);
+void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod,
+                                                      const vec384 modx);
+void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
+                                                      const vec256 modx);
+bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod);
+
+#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__)
+# define mul_mont_384x mulx_mont_384x
+# define sqr_mont_384x sqrx_mont_384x
+# define sqr_mont_382x sqrx_mont_382x
+# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x
+# define mul_382x mulx_382x
+# define sqr_382x sqrx_382x
+#endif
+
+void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
+                   const vec384 p, limb_t n0);
+void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0);
+void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0);
+void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count,
+                         const vec384 p, limb_t n0, const vec384x b);
+void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p);
+void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p);
+
+void add_mod_384x(vec384x ret, const vec384x a, const vec384x b,
+                  const vec384 p);
+void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b,
+                  const vec384 p);
+void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p);
+void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p);
+void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p);
+void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
+                     const vec384 p);
+void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
+                     const vec384 p);
+
+/*
+ * C subroutines
+ */
+static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow,
+                         size_t pow_bits, const vec384 p, limb_t n0);
+static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow,
+                          size_t pow_bits, const vec384 p, limb_t n0);
+static void div_by_zz(limb_t val[]);
+static void div_by_z(limb_t val[]);
+
+#ifdef __UINTPTR_TYPE__
+typedef __UINTPTR_TYPE__ uptr_t;
+#else
+typedef const void *uptr_t;
+#endif
+
+#if !defined(restrict)
+# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
+#  if defined(__GNUC__) && __GNUC__>=2
+#   define restrict __restrict__
+#  elif defined(_MSC_VER)
+#   define restrict __restrict
+#  else
+#   define restrict
+#  endif
+# endif
+#endif
+
+#if defined(__CUDA_ARCH__)
+# define inline inline __device__
+#endif
+
+#if !defined(inline) && !defined(__cplusplus)
+# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
+#  if defined(__GNUC__) && __GNUC__>=2
+#   define inline __inline__
+#  elif defined(_MSC_VER)
+#   define inline __inline
+#  else
+#   define inline
+#  endif
+# endif
+#endif
+
+static inline bool_t is_bit_set(const byte *v, size_t i)
+{   return (v[i/8] >> (i%8)) & 1;   }
+
+static inline bool_t byte_is_zero(unsigned char c)
+{   return ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1);   }
+
+static inline bool_t bytes_are_zero(const unsigned char *a, size_t num)
+{
+    unsigned char acc;
+    size_t i;
+
+    for (acc = 0, i = 0; i < num; i++)
+        acc |= a[i];
+
+    return byte_is_zero(acc);
+}
+
+static inline void bytes_zero(unsigned char *a, size_t num)
+{
+    size_t i;
+
+    for (i = 0; i < num; i++)
+        a[i] = 0;
+}
+
+static inline void vec_cswap(void *restrict a, void *restrict b, size_t num,
+                             bool_t cbit)
+{
+    limb_t ai, *ap = (limb_t *)a;
+    limb_t bi, *bp = (limb_t *)b;
+    limb_t xorm, mask = (limb_t)0 - cbit;
+    size_t i;
+
+    num /= sizeof(limb_t);
+
+    for (i = 0; i < num; i++) {
+        xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask;
+        ap[i] = ai ^ xorm;
+        bp[i] = bi ^ xorm;
+    }
+}
+
+/* ret = bit ? a : b */
+#ifdef __CUDA_ARCH__
+extern "C" {
+__device__ void vec_select_48(void *ret, const void *a, const void *b,
+                                         unsigned int sel_a);
+__device__ void vec_select_96(void *ret, const void *a, const void *b,
+                                         unsigned int sel_a);
+__device__ void vec_select_192(void *ret, const void *a, const void *b,
+                                          unsigned int sel_a);
+__device__ void vec_select_144(void *ret, const void *a, const void *b,
+                                          unsigned int sel_a);
+__device__ void vec_select_288(void *ret, const void *a, const void *b,
+                                          unsigned int sel_a);
+}
+#else
+void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a);
+void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a);
+void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a);
+void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a);
+void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a);
+#endif
+static inline void vec_select(void *ret, const void *a, const void *b,
+                              size_t num, bool_t sel_a)
+{
+#ifndef __BLST_NO_ASM__
+    if (num == 48)          vec_select_48(ret, a, b, sel_a);
+    else if (num == 96)     vec_select_96(ret, a, b, sel_a);
+    else if (num == 144)    vec_select_144(ret, a, b, sel_a);
+    else if (num == 192)    vec_select_192(ret, a, b, sel_a);
+    else if (num == 288)    vec_select_288(ret, a, b, sel_a);
+#else
+    if (0) ;
+#endif
+    else {
+        limb_t bi, *rp = (limb_t *)ret;
+        const limb_t *ap = (const limb_t *)a;
+        const limb_t *bp = (const limb_t *)b;
+        limb_t xorm, mask = (limb_t)0 - sel_a;
+        size_t i;
+
+        num /= sizeof(limb_t);
+
+        for (i = 0; i < num; i++) {
+            xorm = (ap[i] ^ (bi = bp[i])) & mask;
+            rp[i] = bi ^ xorm;
+        }
+    }
+}
+
+static inline bool_t is_zero(limb_t l)
+{   return (~l & (l - 1)) >> (LIMB_T_BITS - 1);   }
+
+static inline bool_t vec_is_zero(const void *a, size_t num)
+{
+    const limb_t *ap = (const limb_t *)a;
+    limb_t acc;
+    size_t i;
+
+    num /= sizeof(limb_t);
+
+    for (acc = 0, i = 0; i < num; i++)
+        acc |= ap[i];
+
+    return is_zero(acc);
+}
+
+static inline bool_t vec_is_equal(const void *a, const void *b, size_t num)
+{
+    const limb_t *ap = (const limb_t *)a;
+    const limb_t *bp = (const limb_t *)b;
+    limb_t acc;
+    size_t i;
+
+    num /= sizeof(limb_t);
+
+    for (acc = 0, i = 0; i < num; i++)
+        acc |= ap[i] ^ bp[i];
+
+    return is_zero(acc);
+}
+
+static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag,
+                                 const vec384 p)
+{
+    cneg_mod_384(ret[0], a[0], flag, p);
+    cneg_mod_384(ret[1], a[1], flag, p);
+}
+
+static inline void vec_copy(void *restrict ret, const void *a, size_t num)
+{
+    limb_t *rp = (limb_t *)ret;
+    const limb_t *ap = (const limb_t *)a;
+    size_t i;
+
+    num /= sizeof(limb_t);
+
+    for (i = 0; i < num; i++)
+        rp[i] = ap[i];
+}
+
+static inline void vec_zero(void *ret, size_t num)
+{
+    volatile limb_t *rp = (volatile limb_t *)ret;
+    size_t i;
+
+    num /= sizeof(limb_t);
+
+    for (i = 0; i < num; i++)
+        rp[i] = 0;
+
+#if defined(__GNUC__) && !defined(__NVCC__)
+    asm volatile("" : : "r"(ret) : "memory");
+#endif
+}
+
+static inline void limbs_from_be_bytes(limb_t *restrict ret,
+                                       const unsigned char *in, size_t n)
+{
+    limb_t limb = 0;
+
+    while(n--) {
+        limb <<= 8;
+        limb |= *in++;
+        /*
+         * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper
+         * to perform redundant stores than to pay penalty for
+         * mispredicted branch. Besides, some compilers unroll the
+         * loop and remove redundant stores to 'restrict'-ed storage...
+         */
+        ret[n / sizeof(limb_t)] = limb;
+    }
+}
+
+static inline void be_bytes_from_limbs(unsigned char *out, const limb_t *in,
+                                       size_t n)
+{
+    limb_t limb;
+
+    while(n--) {
+        limb = in[n / sizeof(limb_t)];
+        *out++ = (unsigned char)(limb >> (8 * (n % sizeof(limb_t))));
+    }
+}
+
+static inline void limbs_from_le_bytes(limb_t *restrict ret,
+                                       const unsigned char *in, size_t n)
+{
+    limb_t limb = 0;
+
+    while(n--) {
+        limb <<= 8;
+        limb |= in[n];
+        /*
+         * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper
+         * to perform redundant stores than to pay penalty for
+         * mispredicted branch. Besides, some compilers unroll the
+         * loop and remove redundant stores to 'restrict'-ed storage...
+         */
+        ret[n / sizeof(limb_t)] = limb;
+    }
+}
+
+static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in,
+                                       size_t n)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+    limb_t limb;
+    size_t i, j, r;
+
+    if ((uptr_t)out == (uptr_t)in && is_endian.little)
+        return;
+
+    r = n % sizeof(limb_t);
+    n /= sizeof(limb_t);
+
+    for(i = 0; i < n; i++) {
+        for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8)
+            *out++ = (unsigned char)limb;
+    }
+    if (r) {
+        for (limb = in[i], j = 0; j < r; j++, limb >>= 8)
+            *out++ = (unsigned char)limb;
+    }
+}
+
+/*
+ * Some compilers get arguably overzealous(*) when passing pointer to
+ * multi-dimensional array [such as vec384x] as 'const' argument.
+ * General direction seems to be to legitimize such constification,
+ * so it's argued that suppressing the warning is appropriate.
+ *
+ * (*)  http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm
+ */
+#if defined(__INTEL_COMPILER)
+# pragma warning(disable:167)
+# pragma warning(disable:556)
+#elif defined(__GNUC__) && !defined(__clang__)
+# pragma GCC diagnostic ignored "-Wpedantic"
+#elif defined(_MSC_VER)
+# pragma warning(disable: 4127 4189)
+#endif
+
+#if !defined(__wasm__)
+# include <stdlib.h>
+#endif
+
+#if defined(__GNUC__)
+# ifndef alloca
+#  define alloca(s) __builtin_alloca(s)
+# endif
+#elif defined(__sun)
+# include <alloca.h>
+#elif defined(_WIN32)
+# include <malloc.h>
+# ifndef alloca
+#  define alloca(s) _alloca(s)
+# endif
+#endif
+
+#endif /* __BLS12_381_ASM_VECT_H__ */
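The helpers in this header avoid secret-dependent branches by turning a boolean flag into an all-ones or all-zero mask; is_zero() and the generic tail of vec_select() are the core of that pattern. A minimal stand-alone sketch of the same mask trick on 64-bit words (illustrative, not the BLST code itself):

/* Illustrative only: branch-free zero test and select on uint64_t words. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t limb;

static limb is_zero_u64(limb l)
{   return (~l & (l - 1)) >> 63;   }             /* 1 if l == 0, else 0 */

static void select_u64(limb *ret, const limb *a, const limb *b,
                       size_t num, limb sel_a)
{
    limb mask = (limb)0 - sel_a;                 /* all-ones if sel_a, else 0 */
    for (size_t i = 0; i < num; i++)
        ret[i] = b[i] ^ ((a[i] ^ b[i]) & mask);  /* ret = sel_a ? a : b */
}

int main(void)
{
    limb a[2] = {1, 2}, b[2] = {3, 4}, r[2];
    select_u64(r, a, b, 2, is_zero_u64(0));      /* picks a */
    printf("%llu %llu\n", (unsigned long long)r[0], (unsigned long long)r[1]);
    select_u64(r, a, b, 2, is_zero_u64(7));      /* picks b */
    printf("%llu %llu\n", (unsigned long long)r[0], (unsigned long long)r[1]);
    return 0;
}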
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..84bcc77
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+CFLAGS=${CFLAGS:--O -fno-builtin -fPIC -Wall -Wextra}
+CC=gcc
+AR=ar
+
+${CC} ${CFLAGS} -c blst/server.c
+${CC} ${CFLAGS} -c blst/assembly.S
+${AR} rc libblst.a server.o assembly.o
+
+${CC} ${CFLAGS} -o ctm ctm.c fstoken.c debugprint.c libblst.a
\ No newline at end of file
diff --git a/ctm b/ctm
new file mode 100755
index 0000000..0932e57
Binary files /dev/null and b/ctm differ
diff --git a/ctm.c b/ctm.c
new file mode 100644
index 0000000..7e13f45
--- /dev/null
+++ b/ctm.c
@@ -0,0 +1,404 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/random.h>
+#include <stdbool.h>
+#include <assert.h>
+#include "blst/blst.h"
+#include "debugprint.h"
+#include "fstoken.h"
+
+char* token_path;
+
+void print_help(char* name){
+    printf("FemtoStar Credit Token Manager (ctm)\n");
+    printf("This tool can be used to generate and process Credit Tokens for use with the FemtoStar Protocol.\n\n");
+    
+    printf("Warning: This tool lets you do insecure or broken things! Be careful with it.\n");
+    printf("ctm is still under development! Do not assume it is secure or complete yet.\n");
+    printf("In particular, note that keys and tokens are currently stored unencrypted on-disk.\n\n");
+    
+    printf("%s help - display this help\n", name);
+    printf("%s path - display your token path\n", name);
+    printf("%s list - list targets you have keys for - use \"%s list verbose\" to also display paths to the keys\n", name, name);
+    printf("%s keygen [targ] - create a new target keypair [targ] using the default token format (128/256) and the system true randomness source\n", name);
+    printf("%s keygen [targ] [tfs] - create a new target keypair [targ] using token format [tfs] and the system true randomness source\n", name);
+    printf("%s keygen [targ] [tfs] [ikm] - create a new target keypair [targ] using token format specifier [tfs] and 32-byte hexadecimal seed [ikm]\n", name);
+    printf("%s keydump [targ] - dump Public and, if available, Secret Keys for target [targ]\n", name);
+    printf("%s keyrepair [targ] - regenerate a missing Public Key for a target [targ] for which a Secret Key is available\n", name);
+    printf("%s req [targ] - generate a token request for target [targ]\n", name);
+}
+
+int get_key_paths(char* target, char** sk_path, char** pk_path, char** tfs_path){
+    int key_path_len;
+
+    key_path_len = strlen(token_path) + strlen(target) + 13;
+
+    *sk_path = malloc(key_path_len);
+    if(*sk_path == NULL) return 1;
+
+    *pk_path = malloc(key_path_len);
+    if(*pk_path == NULL) return 1;
+
+    *tfs_path = malloc(key_path_len + 1);
+    if(*tfs_path == NULL) return 1;
+
+    strcpy(*sk_path, token_path);
+    strcat(*sk_path, "/targets/");
+    strcat(*sk_path, target);
+    strcpy(*pk_path, *sk_path);
+    strcpy(*tfs_path, *sk_path);
+    strcat(*sk_path, ".sk");
+    strcat(*pk_path, ".pk");
+    strcat(*tfs_path, ".tfs");
+
+    return 0;
+}
+
+int get_keys(char* target, byte* sk, byte* pk, int* idbits, int* hashbits){ // sk/pk/idbits/hashbits pointers can be NULL if you don't want to read those
+    FILE *targ_file;
+    char *sk_path;
+    char *pk_path;
+    char *tfs_path;
+    bool sk_available, pk_available, tfs_available;
+    int idbits_buf, hashbits_buf;
+
+    get_key_paths(target, &sk_path, &pk_path, &tfs_path);
+    
+    sk_available = (access(sk_path, R_OK) == 0);
+    pk_available = (access(pk_path, R_OK) == 0);
+    tfs_available = (access(tfs_path, R_OK) == 0);
+
+    if(sk_available && sk != NULL){
+        targ_file = fopen(sk_path, "r");
+        if(!targ_file){
+            printf("Could not open Secret Key file. Exiting.\n");
+            return 1;
+        }
+        fread(sk, 32, 1, targ_file);
+        fclose(targ_file);
+    }
+
+    if(pk_available && pk != NULL){
+        targ_file = fopen(pk_path, "r");
+        if(!targ_file){
+            printf("Could not open Public Key file. Exiting.\n");
+            return 1;
+        }
+        fread(pk, 96, 1, targ_file);
+        fclose(targ_file);
+    }
+
+    if(idbits != NULL || hashbits != NULL){
+        if(tfs_available){
+            targ_file = fopen(tfs_path, "r");
+            if(!targ_file){
+                printf("Could not open Token Format Specifier file. Exiting.\n");
+                return 1;
+            }
+            if(fscanf(targ_file, "%i/%i", &idbits_buf, &hashbits_buf) != 2){
+                printf("WARNING: Token Format Specifier file is malformed. Using default (128/256) - please fix the .tfs file for the target\n");
+                idbits_buf = IDBITS_DEFAULT;
+                hashbits_buf = HASHBITS_DEFAULT;
+            }
+            fclose(targ_file);
+
+            if(idbits != NULL) *idbits = idbits_buf;
+            if(hashbits != NULL) *hashbits = hashbits_buf;
+        }else{
+            printf("WARNING: Token Format Specifier not set, this is a broken state. Using default (128/256) - please add a .tfs file for the target\n");
+            if(idbits != NULL) *idbits = IDBITS_DEFAULT;
+            if(hashbits != NULL) *hashbits = HASHBITS_DEFAULT;
+        }
+    }
+
+    // 0 = no keys (bad target), 1 = PK only, 2 = SK only (broken state), 3 = PK+SK (can sign)
+    return (2 * sk_available) + pk_available;
+}
+
+int keydump(char* target){
+    byte sk[32];
+    byte pk[96];
+    int key_status;
+    int idbits, hashbits;
+    
+    key_status = get_keys(target, sk, pk, &idbits, &hashbits);
+
+    switch(key_status){
+        case 0:
+            printf("No keys found - target unknown.\n");
+            break;
+        case 1:
+            printf("Public Key available - can verify and request for this target\n");
+            print_bytes("Public Key: ", pk, 96);
+            break;
+        case 2:
+            printf("Secret Key ONLY available - this is a broken state, please keyrepair this keypair (see help)\n");
+            print_bytes("Secret Key: ", sk, 32);
+            break;
+        case 3:
+            printf("Secret Key and Public Key available - can verify, request, and sign for this target.\n");
+            print_bytes("Secret Key: ", sk, 32);
+            print_bytes("Public Key: ", pk, 96);
+            break;
+    }
+
+    printf("Token Format Specifier: %i/%i (%i ID bits, %i hash bits)\n", idbits, hashbits, idbits, hashbits);
+
+    return 0;
+}
+
+int keyrepair(char* target){
+    FILE *key_file;
+    byte sk[32];
+    byte pk[96];
+    char* sk_path;
+    char* pk_path;
+    char* tfs_path;
+    int key_status;
+    
+    key_status = get_keys(target, sk, NULL, NULL, NULL);
+
+    if(key_status != 2){
+        printf("This target does not refer to a keypair with only a Secret Key available. Exiting.\n");
+        return 1;
+    }
+
+    printf("Regenerating Public Key from Private Key for broken keypair %s\n", target);
+
+    fstoken_get_pk_from_sk(sk, pk);
+    debug_print_bytes("Regenerated Public Key: ", pk, 96);
+
+    get_key_paths(target, &sk_path, &pk_path, &tfs_path);
+
+    key_file = fopen(pk_path, "w");
+    if(!key_file){
+        printf("Could not open Public Key file. Exiting.\n");
+        return 1;
+    }
+    fwrite(pk, 96, 1, key_file);
+    fclose(key_file);
+
+    printf("Saved to %s\n", pk_path);
+
+    return 0;
+}
+
+int keygen(char* target, byte* ikm, int idbits, int hashbits){
+    char *sk_path;
+    char *pk_path;
+    char* tfs_path;
+    FILE *targ_file;
+    byte sk_byte[32];
+    byte pk_byte[96];
+
+    debug_print_bytes("IKM: ", ikm, 32);
+
+    fstoken_keygen(ikm, sk_byte, pk_byte);
+
+    debug_print_bytes("Secret Key: ", sk_byte, 32);
+    debug_print_bytes("Public Key: ", pk_byte, 96);
+
+    if(get_key_paths(target, &sk_path, &pk_path, &tfs_path)) return 1;
+
+    printf("Writing Secret Key to %s\n", sk_path);
+
+    targ_file = fopen(sk_path, "w");
+    if(!targ_file){
+        printf("Could not open Secret Key file. Exiting.\n");
+        return 1;
+    }
+    fwrite(sk_byte, 32, 1, targ_file);
+    fclose(targ_file);
+
+    printf("Writing Public Key to %s\n", pk_path);
+
+    targ_file = fopen(pk_path, "w");
+    if(!targ_file){
+        printf("Could not open Public Key file. Exiting.\n");
+        return 1;
+    }
+    fwrite(pk_byte, 96, 1, targ_file);
+    fclose(targ_file);
+
+    printf("Writing Token Format Specifier to %s\n", tfs_path);
+
+    targ_file = fopen(tfs_path, "w");
+    if(!targ_file){
+        printf("Could not open Token Format Specifier file. Exiting.\n");
+        return 1;
+    }
+    fprintf(targ_file, "%i/%i", idbits, hashbits);
+    fclose(targ_file);
+
+    return 0;
+}
+
+void print_path(){
+    printf("Token Path (from FEMTOSTAR_TOKEN_PATH environment variable): %s\n", token_path);
+}
+
+bool string_endswith(const char *str, const char *suffix){
+    if (!str || !suffix)
+        return 0;
+    size_t lenstr = strlen(str);
+    size_t lensuffix = strlen(suffix);
+    if (lensuffix >  lenstr)
+        return 0;
+    return strncmp(str + lenstr - lensuffix, suffix, lensuffix) == 0;
+}
+
+// This function is awful because of C string handling. It should probably be improved.
+int list_targets(bool verbose){
+    printf("Listing all targets - you have secret keys for, and can issue tokens for, targets marked with (*)\n\n");
+    int n, keyname_len;
+    struct dirent **files;
+    char *keydir_path, *key_path, *key_name;
+    bool sk_available;
+
+    keydir_path = malloc(strlen(token_path) + 9);
+    if(keydir_path == NULL) return 1;
+
+    strcpy(keydir_path, token_path);
+    strcat(keydir_path, "/targets");
+
+    #ifndef __INTELLISENSE__ // VSCodium doesn't know where alphasort is and highlights an error
+    n = scandir(keydir_path, &files, NULL, alphasort);
+    #endif
+
+    if(n == -1){
+        fprintf(stderr, "Could not list directory at token path.\n");
+        exit(1);
+    }
+
+    for(int i=0;i<n;i++){
+        if(string_endswith(files[i]->d_name, ".pk")){
+            keyname_len = strlen(files[i]->d_name);
+
+            key_name = malloc(keyname_len + 1);
+            if(key_name == NULL) return 1;
+
+            strcpy(key_name, files[i]->d_name);            
+            key_name[keyname_len - 3] = '\0';
+
+            printf("%s", key_name);
+            
+            key_path = malloc(strlen(token_path) + 10 + strlen(files[i]->d_name)); // "/targets/" + name + NUL terminator
+            if(key_path == NULL) return 1;
+
+            strcpy(key_path, token_path);
+            strcat(key_path, "/targets/");
+            strcat(key_path, files[i]->d_name);
+
+            if(verbose) printf(" (PK: %s", key_path);
+
+            key_path[strlen(key_path) - 2] = 's';
+
+            if(access(key_path, R_OK) == 0){
+                sk_available = true;
+
+                if(verbose) printf(", SK: %s", key_path);
+            }else{
+                sk_available = false;
+            }
+
+            if(verbose) printf(")");
+            if(sk_available) printf(" (*)");
+
+            printf("\n");
+            free(key_path);
+            free(key_name);
+        }
+    }
+    free(keydir_path);
+
+    return 0;
+}
+
+// length is the number of output bytes; string must contain 2*length hexadecimal digits
+void bendian_from_hex_string(byte* bendian, char* string, int length){
+    char hex_byte[3] = {0}; // keep a terminating NUL for strtol
+    for(int i=0; i<length; i++){
+        memcpy(hex_byte, &string[i*2], 2);
+        bendian[i] = strtol(hex_byte, 0, 16);
+    }
+}
+
+// mostly boring command line parsing
+int main(int argc, char *argv[]){
+    token_path = getenv("FEMTOSTAR_TOKEN_PATH");
+
+    if(!token_path){
+        fprintf(stderr, "The environment variable FEMTOSTAR_TOKEN_PATH does not exist! Please set it before using ctm.\n");
+        exit(1);
+    }
+
+    if(argc < 2){
+        fprintf(stderr, "Provide at least one argument. Try \"%s help\" for more information.\n", argv[0]);
+        return 1;
+    }else if(strcmp(argv[1], "help") == 0){
+        print_help(argv[0]);
+        return 0;
+    }else if(strcmp(argv[1], "path") == 0){
+        print_path();
+        return 0;
+    }else if(strcmp(argv[1], "list") == 0){
+        return list_targets(argc > 2 && strcmp(argv[2], "verbose") == 0); // i don't know if this is cursed or genius
+    }else if(strcmp(argv[1], "keygen") == 0){
+        byte ikm[32];
+        int idbits, hashbits;
+
+        if(argc > 5){
+            fprintf(stderr, "Too many arguments. Exiting.\n");
+            return 1;
+        }
+
+        // Make sure there's a target name
+        if(argc < 3){
+            fprintf(stderr, "A target name must be provided, e.g. %s keygen [targ]\n", argv[0]);
+            return(1);
+        }
+
+        // Default behaviour for if only target name is provided: default TFS, random IKM. Otherwise, validate and use provided.
+        if(argc < 4){
+            idbits = IDBITS_DEFAULT;
+            hashbits = HASHBITS_DEFAULT;           
+        }else{
+            if(sscanf(argv[3], "%i/%i", &idbits, &hashbits) != 2){
+                printf("Invalid Token Format Specifier: expected the form [idbits]/[hashbits], e.g. 128/256\n");
+                return 1;
+            }
+            if(idbits < 1 || idbits > IDBITS_MAX){
+                printf("Invalid Token Format Specifier: number of ID bits must be between 1 and 256 inclusive\n");
+                return 1;
+            }
+            if(hashbits < 1 || hashbits > HASHBITS_MAX){
+                printf("Invalid Token Format Specifier: number of hash bits must be between 1 and 256 inclusive\n");
+                return 1;
+            }
+        }
+        
+        // If no IKM is provided, use the system true randomness source
+        if(argc < 5){
+            if(getrandom(ikm, 32, GRND_RANDOM) != 32){
+                fprintf(stderr, "Could not read 32 bytes from the system randomness source. Exiting.\n");
+                return 1;
+            }
+        }else{
+            if(strlen(argv[4]) != 64){
+                fprintf(stderr, "If providing IKM, it must be 32 bytes (64 hexadecimal digits)\n");
+                return 1;
+            }
+
+            bendian_from_hex_string(ikm, argv[4], 32); // 32 output bytes = 64 hex digits
+        }
+
+        return keygen(argv[2], ikm, idbits, hashbits);
+    }else if(strcmp(argv[1], "keydump") == 0){
+        // Make sure there's a target name
+        if(argc < 3){
+            fprintf(stderr, "A target name must be provided, e.g. %s keydump [targ]\n", argv[0]);
+            return(1);
+        }
+
+        return keydump(argv[2]);
+    }else if(strcmp(argv[1], "keyrepair") == 0){
+        // Make sure there's a target name
+        if(argc < 3){
+            fprintf(stderr, "A target name must be provided, e.g. %s keyrepair [targ]\n", argv[0]);
+            return(1);
+        }
+
+        return keyrepair(argv[2]);
+    }
+}
\ No newline at end of file
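ctm keygen accepts an optional 64-hex-digit IKM argument, which bendian_from_hex_string() converts two hex digits at a time into 32 big-endian bytes. A minimal stand-alone round-trip sketch for eyeballing that parsing; hex_to_bytes() here is a hypothetical mirror of bendian_from_hex_string(), not part of the repository:

/* Illustrative only: parse a 64-digit hex string into 32 bytes and print it. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void hex_to_bytes(unsigned char *out, const char *hex, int nbytes)
{
    char buf[3] = {0};                       /* two digits plus NUL for strtol */
    for (int i = 0; i < nbytes; i++) {
        memcpy(buf, &hex[i * 2], 2);
        out[i] = (unsigned char)strtol(buf, NULL, 16);
    }
}

int main(void)
{
    const char *hex = "000102030405060708090a0b0c0d0e0f"
                      "101112131415161718191a1b1c1d1e1f"; /* 64 hex digits */
    unsigned char ikm[32];

    hex_to_bytes(ikm, hex, 32);              /* 32 output bytes, not 64 */
    for (int i = 0; i < 32; i++)
        printf("%.2x ", ikm[i]);
    printf("\n");
    return 0;
}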
diff --git a/debugprint.c b/debugprint.c
new file mode 100644
index 0000000..4a72ca0
--- /dev/null
+++ b/debugprint.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include "debugprint.h"
+#include "blst/blst.h"
+
+void print_bytes(const char* label, byte *toprint, int length){
+    printf("%s", label);
+    for(int i=0;i<length;i++){
+        printf("%.2x ", toprint[i]);
+    }
+    printf("\n");
+}
+
+void debug_print_bytes(__attribute__((unused)) const char* label, __attribute__((unused)) byte *toprint, __attribute__((unused)) int length){
+    #ifdef INSECURE_CTM_DEBUG_PRINT
+    print_bytes(label, toprint, length);
+    #endif
+}
+
+void print_scalar(const char* label, blst_scalar *toprint){
+    byte temp_buffer[32];
+    blst_bendian_from_scalar(temp_buffer, toprint);
+
+    debug_print_bytes(label, temp_buffer, 32);
+}
+
+void debug_print_scalar(__attribute__((unused)) const char* label, __attribute__((unused)) blst_scalar *toprint){
+    #ifdef INSECURE_CTM_DEBUG_PRINT
+    print_scalar(label, toprint);
+    #endif
+}
\ No newline at end of file
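debug_print_bytes() and debug_print_scalar() compile to no-ops unless INSECURE_CTM_DEBUG_PRINT is defined in debugprint.h, while the print_* variants always emit output. A minimal usage sketch (hypothetical test program, not part of the repository; build it together with debugprint.c and libblst.a):

/* Illustrative only: with the #define present, both lines print; without it,
 * only the first one does. */
#include "debugprint.h"

int main(void)
{
    byte secret[4] = {0xde, 0xad, 0xbe, 0xef};

    print_bytes("always printed:          ", secret, 4);
    debug_print_bytes("only when debug enabled: ", secret, 4);
    return 0;
}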
diff --git a/debugprint.h b/debugprint.h
new file mode 100644
index 0000000..fc9a7cc
--- /dev/null
+++ b/debugprint.h
@@ -0,0 +1,13 @@
+#ifndef __DEBUGPRINT_H_
+#define __DEBUGPRINT_H_
+#include "blst/blst.h"
+
+// The line below enables debug prints (including private keys!) - comment it out for anything other than development, leaving it enabled is insecure
+#define INSECURE_CTM_DEBUG_PRINT
+
+void print_bytes(const char* label, byte *toprint, int length);
+void debug_print_bytes(__attribute__((unused)) const char* label, __attribute__((unused)) byte *toprint, __attribute__((unused)) int length);
+void print_scalar(const char* label, blst_scalar *toprint);
+void debug_print_scalar(__attribute__((unused)) const char* label, __attribute__((unused)) blst_scalar *toprint);
+
+#endif
\ No newline at end of file
diff --git a/fstoken.c b/fstoken.c
new file mode 100644
index 0000000..f9dd63c
--- /dev/null
+++ b/fstoken.c
@@ -0,0 +1,25 @@
+#include "fstoken.h"
+
+void fstoken_keygen(byte* ikm, byte* sk_byte, byte* pk_byte){
+    blst_scalar sk;
+    blst_p2 pk;
+    blst_p2_affine pk_affine;
+
+    blst_keygen(&sk, ikm, 32, 0, 0); // generate a secret key from IKM
+    blst_bendian_from_scalar(sk_byte, &sk); // convert it to 32 big-endian bytes in sk_byte to return
+
+    blst_sk_to_pk_in_g2(&pk, &sk); // get a public key from the secret key
+    blst_p2_to_affine(&pk_affine, &pk); // convert it to an affine point, which is what most uses of the public key use
+    blst_p2_affine_compress(pk_byte, &pk_affine); // compress it to 96 bytes in pk_byte to return
+}
+
+void fstoken_get_pk_from_sk(byte* sk_byte, byte* pk_byte){
+    blst_p2 pk;
+    blst_p2_affine pk_affine;
+    blst_scalar sk;
+
+    blst_scalar_from_bendian(&sk, sk_byte);
+    blst_sk_to_pk_in_g2(&pk, &sk);
+    blst_p2_to_affine(&pk_affine, &pk);
+    blst_p2_affine_compress(pk_byte, &pk_affine);
+}
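+
+// Editor's sketch (not part of the original patch): one way these two helpers
+// might be exercised together. FSTOKEN_EXAMPLE_MAIN is a hypothetical,
+// editor-invented flag, so this does not interfere with the real build.
+#ifdef FSTOKEN_EXAMPLE_MAIN
+#include <stdio.h>
+#include <string.h>
+int main(void){
+    byte ikm[32] = {0}; // demo IKM only - real callers must supply unpredictable key material
+    byte sk[32], pk[96], pk_again[96];
+
+    fstoken_keygen(ikm, sk, pk);            // derive sk and compressed pk from the IKM
+    fstoken_get_pk_from_sk(sk, pk_again);   // re-derive the pk from sk alone
+
+    // Both derivations should produce the same compressed public key
+    printf("public keys match: %s\n", memcmp(pk, pk_again, 96) == 0 ? "yes" : "no");
+    return 0;
+}
+#endif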
\ No newline at end of file
diff --git a/fstoken.h b/fstoken.h
new file mode 100644
index 0000000..d7ccb08
--- /dev/null
+++ b/fstoken.h
@@ -0,0 +1,20 @@
+#ifndef __FSTOKEN_H__
+#define __FSTOKEN_H__
+#include "blst/blst.h"
+
+#define IDBITS_DEFAULT 128
+#define IDBITS_MAX 256
+#define HASHBITS_DEFAULT 256
+#define HASHBITS_MAX 256
+
+typedef struct{
+    byte sk[32];
+    byte pk[96];
+    uint8_t idbits;
+    uint8_t hashbits;
+} target_descriptor;
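+
+// Editor's example (not part of the original patch): a descriptor using the
+// default sizes; sk and pk would typically be filled in by fstoken_keygen.
+//     target_descriptor targ = { .idbits = IDBITS_DEFAULT, .hashbits = HASHBITS_DEFAULT };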
+
+void fstoken_keygen(byte* ikm, byte* sk_byte, byte* pk_byte);
+void fstoken_get_pk_from_sk(byte* sk_byte, byte* pk_byte);
+
+#endif
\ No newline at end of file
diff --git a/libblst.a b/libblst.a
new file mode 100644
index 0000000..8a80909
Binary files /dev/null and b/libblst.a differ
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..de2dc15
--- /dev/null
+++ b/main.c
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "blst/blst.h"
+
+const byte dst[] = "MY-DST";
+double time_taken;
+clock_t t;
+
+void printbytes(byte *toprint, int length){
+    for(int i=0;i<length;i++){
+        printf("%.2x ", toprint[i]);
+    }
+    printf("\n");
+}
+
+void signer(byte *compressed_signature, byte *compressed_public_key, byte *msg){
+    blst_scalar sk;
+    blst_p2 pk;
+    blst_p1 hash, signature;
+    byte debug_print_buf[256];
+    byte myikm[32] = {'*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'};
+
+    // On signer's side:
+    printf("IKM: ");
+    printbytes(myikm, 32);
+
+    blst_keygen(&sk, myikm, 32, 0, 0);
+
+    blst_bendian_from_scalar(debug_print_buf, &sk);
+    printf("Secret Key: ");
+    printbytes(debug_print_buf, 32);
+
+    blst_sk_to_pk_in_g2(&pk, &sk);
+
+    blst_p2_compress(compressed_public_key, &pk);
+    printf("Compressed Public Key: ");
+    printbytes(compressed_public_key, 96);
+
+    t = clock();
+    blst_hash_to_g1(&hash, msg, strlen((char *) msg), dst, strlen((char *) dst), 0, 0);
+    t = clock() - t;
+
+    time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0;
+    printf("blst_hash_to_g1 took %f ms\n", time_taken);
+
+    blst_p1_serialize(debug_print_buf, &hash);
+    printf("Message Hash: ");
+    printbytes(debug_print_buf, 96);
+
+    t = clock();
+    blst_sign_pk_in_g2(&signature, &hash, &sk);
+    t = clock() - t;
+
+    time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0;
+    printf("blst_sign_pk_in_g2 took %f ms\n", time_taken);
+
+    blst_p1_serialize(debug_print_buf, &signature);
+    printf("Signature: ");
+    printbytes(debug_print_buf, 96);
+
+    blst_p1_compress(compressed_signature, &signature);
+    printf("Compressed Signature: ");
+    printbytes(compressed_signature, 48);
+}
+
+void verifier(byte *compressed_signature, byte *compressed_public_key, byte *msg){
+    blst_p1_affine sig;
+    blst_p2_affine pk;
+
+    blst_p1_uncompress(&sig, compressed_signature);
+    blst_p2_uncompress(&pk, compressed_public_key);
+
+    BLST_ERROR returned;
+
+    // TODO: check that the signature is in the G1 group and the public key is in the G2 group
+    
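+    // Editor's sketch (not part of the original patch): one way to address the
+    // TODO above, using BLST's subgroup-membership checks before verification.
+    if(!blst_p1_affine_in_g1(&sig) || !blst_p2_affine_in_g2(&pk)){
+        printf("Not verified!\n");
+        return;
+    }
+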
+    t = clock();
+    returned = blst_core_verify_pk_in_g2(&pk, &sig, 1, msg, strlen((char *) msg), dst, strlen((char *) dst), 0, 0);
+    t = clock() - t;
+
+    time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0;
+    printf("blst_core_verify_pk_in_g2 took %f ms\n", time_taken);
+
+    if(returned == BLST_SUCCESS){
+        printf("Verified!\n");
+    }else{
+        printf("Not verified!\n");
+    }
+}
+
+int main(){
+    byte compressed_signature[48];
+    byte compressed_public_key[96];
+    byte msg[] = "assertion";
+
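+    // Baseline: measure the overhead of the timing calls themselves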
+    t = clock();
+    t = clock() - t;
+
+    time_taken = ((double)t)/CLOCKS_PER_SEC*1000.0;
+    printf("Doing nothing took %f ms\n", time_taken);
+
+    printf("msg is now %s\n", msg);
+
+    // Sign the message and get the results back
+    signer(compressed_signature, compressed_public_key, msg);
+
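+    // Uncomment the next line to tamper with the message and confirm that verification then fails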
+    //msg[8] = 'A';
+
+    printf("msg is now %s\n", msg);
+
+    // Now on the verifier's side (after compressed_signature, compressed_public_key, and msg are passed over the network)
+    verifier(compressed_signature, compressed_public_key, msg);
+}
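+
+// Build sketch (editor's note; the exact command is not recorded in this
+// commit): with the prebuilt libblst.a in the repository root, something like
+//     gcc -O2 -o bls_test main.c libblst.a
+// should be enough to produce a runnable test binary.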
\ No newline at end of file
diff --git a/nonblind.c b/nonblind.c
new file mode 100644
index 0000000..873364f
--- /dev/null
+++ b/nonblind.c
@@ -0,0 +1,131 @@
+// This is a (very rough) test of BLST signatures (non-blind variant: the message hash is sent to the signer in the clear), based on run.me from BLST's Python example code
+// Do not trust this to be secure, also this doesn't do a lot of the sanity checking yet
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "blst/blst.h"
+
+const byte dst[] = "MY-DST";
+double time_taken;
+clock_t t;
+
+byte signer_private_key[32];
+byte signer_public_key[96];
+
+void printbytes(byte *toprint, int length){
+    for(int i=0;i<length;i++){
+        printf("%.2x ", toprint[i]);
+    }
+    printf("\n");
+}
+
+void signer_key_setup(){
+    blst_scalar sk;
+    blst_p2 pk;
+    blst_p2_affine pk_affine;
+
+    byte myikm[32] = {'*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*'};
+
+    // On signer's side:
+    printf("IKM: ");
+    printbytes(myikm, 32);
+
+    blst_keygen(&sk, myikm, 32, 0, 0);
+
+    blst_bendian_from_scalar(signer_private_key, &sk);
+    printf("Secret Key: ");
+    printbytes(signer_private_key, 32);
+
+    blst_sk_to_pk_in_g2(&pk, &sk);
+
+    blst_p2_to_affine(&pk_affine, &pk);
+
+    blst_p2_affine_compress(signer_public_key, &pk_affine);
+    printf("Compressed Public Key (affine): ");
+    printbytes(signer_public_key, 96);
+}
+
+void signer(byte *compressed_signature, byte *msg_for_wire){
+    blst_scalar sk;
+    blst_p1 msg, signature;
+    blst_p1_affine msg_affine;
+    byte debug_print_buf[256];
+
+    // get the secret key as a scalar
+    blst_scalar_from_bendian(&sk, signer_private_key);
+
+    // Deserialize the message - it's already a serialized P1 point, we don't need to (literally) rehash it
+    blst_p1_deserialize(&msg_affine, msg_for_wire);
+
+    // Deserialization yields an affine point; convert it to the projective representation used by the signing call
+    blst_p1_from_affine(&msg, &msg_affine);
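+    // Editor's sketch (not part of the original patch): since this point
+    // arrives over the wire, confirm it is in the G1 subgroup before signing.
+    if(!blst_p1_in_g1(&msg)){
+        printf("Refusing to sign: point is not in G1\n");
+        return;
+    }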
+
+    // sign with it
+    blst_sign_pk_in_g2(&signature, &msg, &sk);
+
+    // Serialize and print the signature
+    blst_p1_serialize(debug_print_buf, &signature);
+    printf("Signature: ");
+    printbytes(debug_print_buf, 96);
+
+    // Compress and print the signature
+    blst_p1_compress(compressed_signature, &signature);
+    printf("Compressed Signature: ");
+    printbytes(compressed_signature, 48);
+}
+
+void verifier(byte *compressed_signature, byte *msg){
+    blst_p1_affine sig;
+    blst_p2_affine pk;
+
+    blst_p1_uncompress(&sig, compressed_signature);
+    blst_p2_uncompress(&pk, signer_public_key);
+
+    BLST_ERROR returned;
+
+    // TODO: check that the signature is in the G1 group and the public key is in the G2 group
+    
+    returned = blst_core_verify_pk_in_g2(&pk, &sig, 1, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96);
+
+    if(returned == BLST_SUCCESS){
+        printf("Verified!\n");
+    }else{
+        printf("Not verified!\n");
+    }
+}
+
+// main is the "user" in this test
+int main(){
+    byte compressed_signature[48];
+    byte msg[] = "assertion";
+    blst_p1 hash;
+    byte msg_for_wire_bytes[96];
+
+    printf("msg is now %s\n", msg);
+
+    // Set up the signer's keys first so that we can know its public key
+    signer_key_setup();
+
+    // Hash the message to a G1 point. The signer's public key is passed as the augmentation (aug), which ties the hash to this signer; the verifier passes the same aug to blst_core_verify_pk_in_g2
+    blst_hash_to_g1(&hash, msg, strlen((char *) msg), dst, strlen((char *) dst), signer_public_key, 96);
+
+    // Serialize the hashed message to send it over the wire
+    blst_p1_serialize(msg_for_wire_bytes, &hash);
+
+    printf("Hashed for wire: ");
+    printbytes(msg_for_wire_bytes, 96);
+
+    // Send the message off to be signed and get the results back
+    signer(compressed_signature, msg_for_wire_bytes);
+
+    printf("RETURNED SIGNATURE: ");
+    printbytes(compressed_signature, 48);
+
+    //msg[8] = 'A';
+
+    printf("msg is now %s\n", msg);
+
+    // Now on the verifier's side (after compressed_signature and msg are passed over the network; the signer's compressed public key is already known)
+    verifier(compressed_signature, msg);
+}
\ No newline at end of file
diff --git a/server.o b/server.o
new file mode 100644
index 0000000..6aeaa31
Binary files /dev/null and b/server.o differ
diff --git a/set_token_path.sh b/set_token_path.sh
new file mode 100755
index 0000000..1738612
--- /dev/null
+++ b/set_token_path.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# source this file to set your token path, or add this to your profile
+export FEMTOSTAR_TOKEN_PATH=~/fstokens
\ No newline at end of file
diff --git a/test b/test
new file mode 100755
index 0000000..4469211
Binary files /dev/null and b/test differ