initial stuff

John Doe 2022-09-09 02:47:49 -04:00
commit 943c07066e
99 changed files with 58786 additions and 0 deletions

blst/asm/add_mod_256-armv8.pl Executable file

@@ -0,0 +1,412 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);
@mod=map("x$_",(4..7));
@a=map("x$_",(8..11));
@b=map("x$_",(12..15));
@t=map("x$_",(16,17,1..3));
$code.=<<___;
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,%function
.align 5
add_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
adds @a[0],@a[0],@b[0]
ldp @b[2],@b[3],[$b_ptr,#16]
adcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
adcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@t[3],lo
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size add_mod_256,.-add_mod_256
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,%function
.align 5
mul_by_3_mod_256:
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
adds @a[0],@b[0],@b[0]
ldp @mod[0],@mod[1],[$b_ptr]
adcs @a[1],@b[1],@b[1]
ldp @mod[2],@mod[3],[$b_ptr,#16]
adcs @a[2],@b[2],@b[2]
adcs @a[3],@b[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
csel @a[3],@a[3],@t[3],lo
adds @a[0],@a[0],@b[0]
adcs @a[1],@a[1],@b[1]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@t[3],lo
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,%function
.align 5
lshift_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
.Loop_lshift_mod_256:
adds @a[0],@a[0],@a[0]
sub $b_ptr,$b_ptr,#1
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adc @t[4],xzr,xzr
subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
cbnz $b_ptr,.Loop_lshift_mod_256
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size lshift_mod_256,.-lshift_mod_256
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,%function
.align 5
rshift_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
.Loop_rshift:
adds @b[0],@a[0],@mod[0]
sub $b_ptr,$b_ptr,#1
adcs @b[1],@a[1],@mod[1]
adcs @b[2],@a[2],@mod[2]
adcs @b[3],@a[3],@mod[3]
adc @t[4],xzr,xzr
tst @a[0],#1
csel @b[0],@b[0],@a[0],ne
csel @b[1],@b[1],@a[1],ne
csel @b[2],@b[2],@a[2],ne
csel @b[3],@b[3],@a[3],ne
csel @t[4],@t[4],xzr,ne
extr @a[0],@b[1],@b[0],#1
extr @a[1],@b[2],@b[1],#1
extr @a[2],@b[3],@b[2],#1
extr @a[3],@t[4],@b[3],#1
cbnz $b_ptr,.Loop_rshift
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size rshift_mod_256,.-rshift_mod_256
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,%function
.align 5
cneg_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
subs @b[0],@mod[0],@a[0]
ldp @mod[2],@mod[3],[$n_ptr,#16]
orr @mod[0],@a[0],@a[1]
sbcs @b[1],@mod[1],@a[1]
orr @mod[1],@a[2],@a[3]
sbcs @b[2],@mod[2],@a[2]
orr @t[4],@mod[0],@mod[1]
sbc @b[3],@mod[3],@a[3]
cmp @t[4],#0
csetm @t[4],ne
ands $b_ptr,$b_ptr,@t[4]
csel @a[0],@a[0],@b[0],eq
csel @a[1],@a[1],@b[1],eq
csel @a[2],@a[2],@b[2],eq
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@b[3],eq
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size cneg_mod_256,.-cneg_mod_256
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,%function
.align 5
sub_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
subs @a[0],@a[0],@b[0]
ldp @b[2],@b[3],[$b_ptr,#16]
sbcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
sbcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
sbcs @a[3],@a[3],@b[3]
sbc @t[4],xzr,xzr
and @mod[0],@mod[0],@t[4]
and @mod[1],@mod[1],@t[4]
adds @a[0],@a[0],@mod[0]
and @mod[2],@mod[2],@t[4]
adcs @a[1],@a[1],@mod[1]
and @mod[3],@mod[3],@t[4]
adcs @a[2],@a[2],@mod[2]
stp @a[0],@a[1],[$r_ptr]
adc @a[3],@a[3],@mod[3]
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size sub_mod_256,.-sub_mod_256
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,%function
.align 5
check_mod_256:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif
subs xzr,@a[0],@mod[0]
sbcs xzr,@a[1],@mod[1]
orr @a[0],@a[0],@a[1]
sbcs xzr,@a[2],@mod[2]
orr @a[0],@a[0],@a[2]
sbcs xzr,@a[3],@mod[3]
orr @a[0],@a[0],@a[3]
sbc $a_ptr,xzr,xzr
cmp @a[0],#0
mov x0,#1
csel x0,x0,xzr,ne
and x0,x0,$a_ptr
ret
.size check_mod_256,.-check_mod_256
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,%function
.align 5
add_n_check_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @b[0],@b[0]
rev @a[1],@a[1]
rev @b[1],@b[1]
rev @a[2],@a[2]
rev @b[2],@b[2]
rev @a[3],@a[3]
rev @b[3],@b[3]
#endif
adds @a[0],@a[0],@b[0]
ldp @mod[0],@mod[1],[$n_ptr]
adcs @a[1],@a[1],@b[1]
ldp @mod[2],@mod[3],[$n_ptr,#16]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
csel @a[3],@a[3],@t[3],lo
orr @t[0], @a[0], @a[1]
orr @t[1], @a[2], @a[3]
orr @t[0], @t[0], @t[1]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
mov @t[1], #1
cmp @t[0], #0
csel x0, @t[1], xzr, ne
ret
.size add_n_check_mod_256,.-add_n_check_mod_256
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,%function
.align 5
sub_n_check_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @b[0],@b[0]
rev @a[1],@a[1]
rev @b[1],@b[1]
rev @a[2],@a[2]
rev @b[2],@b[2]
rev @a[3],@a[3]
rev @b[3],@b[3]
#endif
subs @a[0],@a[0],@b[0]
sbcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
sbcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
sbcs @a[3],@a[3],@b[3]
sbc @t[4],xzr,xzr
and @mod[0],@mod[0],@t[4]
and @mod[1],@mod[1],@t[4]
adds @a[0],@a[0],@mod[0]
and @mod[2],@mod[2],@t[4]
adcs @a[1],@a[1],@mod[1]
and @mod[3],@mod[3],@t[4]
adcs @a[2],@a[2],@mod[2]
adc @a[3],@a[3],@mod[3]
orr @t[0], @a[0], @a[1]
orr @t[1], @a[2], @a[3]
orr @t[0], @t[0], @t[1]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
mov @t[1], #1
cmp @t[0], #0
csel x0, @t[1], xzr, ne
ret
.size sub_n_check_mod_256,.-sub_n_check_mod_256
___
print $code;
close STDOUT;

blst/asm/add_mod_256-x86_64.pl Executable file

@@ -0,0 +1,547 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx");
$b_ptr = "%rbx";
{ ############################################################## 256 bits add
my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12));
$code.=<<___;
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,\@function,4,"unwind"
.align 32
add_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
.Loaded_a_add_mod_256:
add 8*0($b_org), @acc[0]
adc 8*1($b_org), @acc[1]
mov @acc[0], @acc[4]
adc 8*2($b_org), @acc[2]
mov @acc[1], @acc[5]
adc 8*3($b_org), @acc[3]
sbb $b_org, $b_org
mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, $b_org
cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
cmovc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
cmovc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size add_mod_256,.-add_mod_256
########################################################################
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,\@function,3,"unwind"
.align 32
mul_by_3_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue
mov $b_org,$n_ptr
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov $a_ptr,$b_org
mov 8*3($a_ptr), @acc[3]
call __lshift_mod_256
mov 0(%rsp),%r12
.cfi_restore %r12
jmp .Loaded_a_add_mod_256
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.type __lshift_mod_256,\@abi-omnipotent
.align 32
__lshift_mod_256:
add @acc[0], @acc[0]
adc @acc[1], @acc[1]
mov @acc[0], @acc[4]
adc @acc[2], @acc[2]
mov @acc[1], @acc[5]
adc @acc[3], @acc[3]
sbb @acc[8], @acc[8]
mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, @acc[8]
cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
cmovc @acc[6], @acc[2]
cmovc @acc[7], @acc[3]
ret
.size __lshift_mod_256,.-__lshift_mod_256
########################################################################
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,\@function,4,"unwind"
.align 32
lshift_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
.Loop_lshift_mod_256:
call __lshift_mod_256
dec %edx
jnz .Loop_lshift_mod_256
mov @acc[0], 8*0($r_ptr)
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 0(%rsp),%r12
.cfi_restore %r12
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size lshift_mod_256,.-lshift_mod_256
########################################################################
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,\@function,4,"unwind"
.align 32
rshift_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[7]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
.Loop_rshift_mod_256:
mov @acc[7], @acc[0]
and \$1, @acc[7]
mov 8*0($n_ptr), @acc[4]
neg @acc[7]
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
and @acc[7], @acc[4]
and @acc[7], @acc[5]
and @acc[7], @acc[6]
and 8*3($n_ptr), @acc[7]
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
sbb @acc[4], @acc[4]
shr \$1, @acc[0]
mov @acc[1], @acc[7]
shr \$1, @acc[1]
mov @acc[2], @acc[6]
shr \$1, @acc[2]
mov @acc[3], @acc[5]
shr \$1, @acc[3]
shl \$63, @acc[7]
shl \$63, @acc[6]
or @acc[0], @acc[7]
shl \$63, @acc[5]
or @acc[6], @acc[1]
shl \$63, @acc[4]
or @acc[5], @acc[2]
or @acc[4], @acc[3]
dec %edx
jnz .Loop_rshift_mod_256
mov @acc[7], 8*0($r_ptr)
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size rshift_mod_256,.-rshift_mod_256
########################################################################
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,\@function,4,"unwind"
.align 32
cneg_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue
mov 8*0($a_ptr), @acc[8] # load a[0:3]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov @acc[8], @acc[0]
mov 8*3($a_ptr), @acc[3]
or @acc[1], @acc[8]
or @acc[2], @acc[8]
or @acc[3], @acc[8]
mov \$-1, @acc[7]
mov 8*0($n_ptr), @acc[4] # load n[0:3]
cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
and @acc[8], @acc[4] # n[0:3] &= mask
mov 8*3($n_ptr), @acc[7]
and @acc[8], @acc[5]
and @acc[8], @acc[6]
and @acc[8], @acc[7]
sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0
sbb @acc[1], @acc[5]
sbb @acc[2], @acc[6]
sbb @acc[3], @acc[7]
or $b_org, $b_org # check condition flag
cmovz @acc[0], @acc[4] # flag ? n[0:3]-a[0:3] : a[0:3]
cmovz @acc[1], @acc[5]
mov @acc[4], 8*0($r_ptr)
cmovz @acc[2], @acc[6]
mov @acc[5], 8*1($r_ptr)
cmovz @acc[3], @acc[7]
mov @acc[6], 8*2($r_ptr)
mov @acc[7], 8*3($r_ptr)
mov 0(%rsp),%r12
.cfi_restore %r12
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size cneg_mod_256,.-cneg_mod_256
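# A minimal reference sketch of the conditional negation above (hypothetical
# Python, for illustration only): when the flag is set and the input is
# non-zero the result is n-a, otherwise the input is returned unchanged, so
# zero never turns into n.
#
#   def cneg_mod_256(a, flag, n):
#       return (n - a) if (flag and a != 0) else a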
########################################################################
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,\@function,4,"unwind"
.align 32
sub_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
sub 8*0($b_org), @acc[0]
mov 8*0($n_ptr), @acc[4]
sbb 8*1($b_org), @acc[1]
mov 8*1($n_ptr), @acc[5]
sbb 8*2($b_org), @acc[2]
mov 8*2($n_ptr), @acc[6]
sbb 8*3($b_org), @acc[3]
mov 8*3($n_ptr), @acc[7]
sbb $b_org, $b_org
and $b_org, @acc[4]
and $b_org, @acc[5]
and $b_org, @acc[6]
and $b_org, @acc[7]
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
adc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
adc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size sub_mod_256,.-sub_mod_256
########################################################################
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,\@function,2,"unwind"
.align 32
check_mod_256:
.cfi_startproc
mov 8*0($r_ptr), %rax
mov 8*1($r_ptr), @acc[1]
mov 8*2($r_ptr), @acc[2]
mov 8*3($r_ptr), @acc[3]
mov %rax, @acc[0] # see if it's zero
or @acc[1], %rax
or @acc[2], %rax
or @acc[3], %rax
sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow?
sbb 8*1($a_ptr), @acc[1]
sbb 8*2($a_ptr), @acc[2]
sbb 8*3($a_ptr), @acc[3]
sbb $a_ptr, $a_ptr
mov \$1, %rdx
cmp \$0, %rax
cmovne %rdx, %rax
and $a_ptr, %rax
.cfi_epilogue
ret
.cfi_endproc
.size check_mod_256,.-check_mod_256
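# Behaviour sketch (hypothetical Python, inferred from the flow above): the
# return value is 1 only for a non-zero input that is fully reduced,
# i.e. 0 < a < n.
#
#   def check_mod_256(a, n):
#       return int(a != 0 and a < n)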
########################################################################
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,\@function,4,"unwind"
.align 32
add_n_check_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
add 8*0($b_org), @acc[0]
adc 8*1($b_org), @acc[1]
mov @acc[0], @acc[4]
adc 8*2($b_org), @acc[2]
mov @acc[1], @acc[5]
adc 8*3($b_org), @acc[3]
sbb $b_org, $b_org
mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, $b_org
cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
cmovc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
cmovc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
or @acc[1], @acc[0]
or @acc[3], @acc[2]
or @acc[2], @acc[0]
mov \$1, %rax
cmovz @acc[0], %rax
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size add_n_check_mod_256,.-add_n_check_mod_256
########################################################################
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,\@function,4,"unwind"
.align 32
sub_n_check_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
sub 8*0($b_org), @acc[0]
mov 8*0($n_ptr), @acc[4]
sbb 8*1($b_org), @acc[1]
mov 8*1($n_ptr), @acc[5]
sbb 8*2($b_org), @acc[2]
mov 8*2($n_ptr), @acc[6]
sbb 8*3($b_org), @acc[3]
mov 8*3($n_ptr), @acc[7]
sbb $b_org, $b_org
and $b_org, @acc[4]
and $b_org, @acc[5]
and $b_org, @acc[6]
and $b_org, @acc[7]
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
adc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
adc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
or @acc[1], @acc[0]
or @acc[3], @acc[2]
or @acc[2], @acc[0]
mov \$1, %rax
cmovz @acc[0], %rax
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size sub_n_check_mod_256,.-sub_n_check_mod_256
___
}
print $code;
close STDOUT;

blst/asm/add_mod_384-armv8.pl Executable file

@@ -0,0 +1,872 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);
@mod=map("x$_",(4..9));
@a=map("x$_",(10..15));
@b=map("x$_",(16,17,19..22));
$carry=$n_ptr;
$code.=<<___;
.text
.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
add_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size add_mod_384,.-add_mod_384
.type __add_mod_384,%function
.align 5
__add_mod_384:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @b[4],@b[5],[$b_ptr,#32]
__add_mod_384_ab_are_loaded:
adds @a[0],@a[0],@b[0]
adcs @a[1],@a[1],@b[1]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adcs @a[4],@a[4],@b[4]
adcs @a[5],@a[5],@b[5]
adc $carry,xzr,xzr
subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs @b[4],@a[4],@mod[4]
sbcs @b[5],@a[5],@mod[5]
sbcs xzr,$carry,xzr
csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
csel @a[4],@a[4],@b[4],lo
csel @a[5],@a[5],@b[5],lo
ret
.size __add_mod_384,.-__add_mod_384
.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
add_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __add_mod_384
stp @a[0],@a[1],[$r_ptr]
add $a_ptr,$a_ptr,#48
stp @a[2],@a[3],[$r_ptr,#16]
add $b_ptr,$b_ptr,#48
stp @a[4],@a[5],[$r_ptr,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size add_mod_384x,.-add_mod_384x
.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
rshift_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
.Loop_rshift_mod_384:
sub $b_ptr,$b_ptr,#1
bl __rshift_mod_384
cbnz $b_ptr,.Loop_rshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size rshift_mod_384,.-rshift_mod_384
.type __rshift_mod_384,%function
.align 5
__rshift_mod_384:
sbfx @b[5],@a[0],#0,#1
and @b[0],@b[5],@mod[0]
and @b[1],@b[5],@mod[1]
adds @a[0],@a[0],@b[0]
and @b[2],@b[5],@mod[2]
adcs @a[1],@a[1],@b[1]
and @b[3],@b[5],@mod[3]
adcs @a[2],@a[2],@b[2]
and @b[4],@b[5],@mod[4]
adcs @a[3],@a[3],@b[3]
and @b[5],@b[5],@mod[5]
adcs @a[4],@a[4],@b[4]
extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1
adcs @a[5],@a[5],@b[5]
extr @a[1],@a[2],@a[1],#1
adc @b[5],xzr,xzr
extr @a[2],@a[3],@a[2],#1
extr @a[3],@a[4],@a[3],#1
extr @a[4],@a[5],@a[4],#1
extr @a[5],@b[5],@a[5],#1
ret
.size __rshift_mod_384,.-__rshift_mod_384
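// Reference sketch of the helper above (hypothetical Python, illustration
// only): for an odd modulus n, division by 2 is "make the value even, then
// shift right by one".
//
//   def div_by_2_mod(a, n):
//       return (a + n) >> 1 if a & 1 else a >> 1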
.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
div_by_2_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __rshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size div_by_2_mod_384,.-div_by_2_mod_384
.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
lshift_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
.Loop_lshift_mod_384:
sub $b_ptr,$b_ptr,#1
bl __lshift_mod_384
cbnz $b_ptr,.Loop_lshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size lshift_mod_384,.-lshift_mod_384
.type __lshift_mod_384,%function
.align 5
__lshift_mod_384:
adds @a[0],@a[0],@a[0]
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc $carry,xzr,xzr
subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs @b[4],@a[4],@mod[4]
sbcs @b[5],@a[5],@mod[5]
sbcs xzr,$carry,xzr
csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
csel @a[4],@a[4],@b[4],lo
csel @a[5],@a[5],@b[5],lo
ret
.size __lshift_mod_384,.-__lshift_mod_384
.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
mul_by_3_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384
.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
mul_by_8_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384
.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
mul_by_3_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
bl __add_mod_384_ab_are_loaded
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]
bl __lshift_mod_384
ldp @b[0],@b[1],[$a_ptr,#48]
ldp @b[2],@b[3],[$a_ptr,#64]
ldp @b[4],@b[5],[$a_ptr,#80]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x
.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
mul_by_8_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x
.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
cneg_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[2],@mod[3],[$n_ptr,#16]
subs @b[0],@mod[0],@a[0]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[4],@mod[5],[$n_ptr,#32]
orr $carry,@a[0],@a[1]
sbcs @b[1],@mod[1],@a[1]
orr $carry,$carry,@a[2]
sbcs @b[2],@mod[2],@a[2]
orr $carry,$carry,@a[3]
sbcs @b[3],@mod[3],@a[3]
orr $carry,$carry,@a[4]
sbcs @b[4],@mod[4],@a[4]
orr $carry,$carry,@a[5]
sbc @b[5],@mod[5],@a[5]
cmp $carry,#0
csetm $carry,ne
ands $b_ptr,$b_ptr,$carry
csel @a[0],@a[0],@b[0],eq
csel @a[1],@a[1],@b[1],eq
csel @a[2],@a[2],@b[2],eq
csel @a[3],@a[3],@b[3],eq
stp @a[0],@a[1],[$r_ptr]
csel @a[4],@a[4],@b[4],eq
stp @a[2],@a[3],[$r_ptr,#16]
csel @a[5],@a[5],@b[5],eq
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size cneg_mod_384,.-cneg_mod_384
.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
sub_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sub_mod_384,.-sub_mod_384
.type __sub_mod_384,%function
.align 5
__sub_mod_384:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @b[4],@b[5],[$b_ptr,#32]
subs @a[0],@a[0],@b[0]
sbcs @a[1],@a[1],@b[1]
sbcs @a[2],@a[2],@b[2]
sbcs @a[3],@a[3],@b[3]
sbcs @a[4],@a[4],@b[4]
sbcs @a[5],@a[5],@b[5]
sbc $carry,xzr,xzr
and @b[0],@mod[0],$carry
and @b[1],@mod[1],$carry
adds @a[0],@a[0],@b[0]
and @b[2],@mod[2],$carry
adcs @a[1],@a[1],@b[1]
and @b[3],@mod[3],$carry
adcs @a[2],@a[2],@b[2]
and @b[4],@mod[4],$carry
adcs @a[3],@a[3],@b[3]
and @b[5],@mod[5],$carry
adcs @a[4],@a[4],@b[4]
adc @a[5],@a[5],@b[5]
ret
.size __sub_mod_384,.-__sub_mod_384
.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
sub_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __sub_mod_384
stp @a[0],@a[1],[$r_ptr]
add $a_ptr,$a_ptr,#48
stp @a[2],@a[3],[$r_ptr,#16]
add $b_ptr,$b_ptr,#48
stp @a[4],@a[5],[$r_ptr,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sub_mod_384x,.-sub_mod_384x
.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
mul_by_1_plus_i_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
add $b_ptr,$a_ptr,#48
bl __sub_mod_384 // a->re - a->im
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]
bl __add_mod_384_ab_are_loaded // a->re + a->im
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
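// Note on the arithmetic above (illustration only): multiplying re + im*i
// by 1 + i gives (re - im) + (re + im)*i, so the real half is a modular
// subtraction and the imaginary half a modular addition.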
.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
sgn0_pty_mod_384:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @a[4],@a[5],[$r_ptr,#32]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
ldp @mod[4],@mod[5],[$a_ptr,#32]
and $r_ptr,@a[0],#1
adds @a[0],@a[0],@a[0]
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc $carry,xzr,xzr
subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc $carry,$carry,xzr
mvn $carry,$carry
and $carry,$carry,#2
orr $r_ptr,$r_ptr,$carry
ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384
.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
sgn0_pty_mod_384x:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @a[4],@a[5],[$r_ptr,#32]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
ldp @mod[4],@mod[5],[$a_ptr,#32]
and $b_ptr,@a[0],#1
orr $n_ptr,@a[0],@a[1]
adds @a[0],@a[0],@a[0]
orr $n_ptr,$n_ptr,@a[2]
adcs @a[1],@a[1],@a[1]
orr $n_ptr,$n_ptr,@a[3]
adcs @a[2],@a[2],@a[2]
orr $n_ptr,$n_ptr,@a[4]
adcs @a[3],@a[3],@a[3]
orr $n_ptr,$n_ptr,@a[5]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc @b[0],xzr,xzr
subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc @b[0],@b[0],xzr
ldp @a[0],@a[1],[$r_ptr,#48]
ldp @a[2],@a[3],[$r_ptr,#64]
ldp @a[4],@a[5],[$r_ptr,#80]
mvn @b[0],@b[0]
and @b[0],@b[0],#2
orr $b_ptr,$b_ptr,@b[0]
and $r_ptr,@a[0],#1
orr $a_ptr,@a[0],@a[1]
adds @a[0],@a[0],@a[0]
orr $a_ptr,$a_ptr,@a[2]
adcs @a[1],@a[1],@a[1]
orr $a_ptr,$a_ptr,@a[3]
adcs @a[2],@a[2],@a[2]
orr $a_ptr,$a_ptr,@a[4]
adcs @a[3],@a[3],@a[3]
orr $a_ptr,$a_ptr,@a[5]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc @b[0],xzr,xzr
subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc @b[0],@b[0],xzr
mvn @b[0],@b[0]
and @b[0],@b[0],#2
orr $r_ptr,$r_ptr,@b[0]
cmp $n_ptr,#0
csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re)
cmp $a_ptr,#0
csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)
and $n_ptr,$n_ptr,#1
and $a_ptr,$a_ptr,#2
orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity
ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
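// Packing sketch (hypothetical Python, illustration only): bit 0 is a
// parity, bit 1 is set when the doubled value is not less than the modulus,
// with the tie-breaking spelled out in the comments above.
//
//   def sgn0_pty_mod_384x(re, im, n):
//       prty = (im & 1) if re == 0 else (re & 1)
//       sgn  = (2*im >= n) if im != 0 else (2*re >= n)
//       return (int(sgn) << 1) | prty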
___
if (1) {
sub vec_select {
my $sz = shift;
my @v=map("v$_",(0..5,16..21));
$code.=<<___;
.globl vec_select_$sz
.hidden vec_select_$sz
.type vec_select_$sz,%function
.align 5
vec_select_$sz:
dup v6.2d, $n_ptr
ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
cmeq v6.2d, v6.2d, #0
ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
___
for($i=0; $i<$sz-48; $i+=48) {
$code.=<<___;
bit @v[0].16b, @v[3].16b, v6.16b
ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
bit @v[1].16b, @v[4].16b, v6.16b
ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
bit @v[2].16b, @v[5].16b, v6.16b
st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
___
@v = @v[6..11,0..5];
}
$code.=<<___;
bit @v[0].16b, @v[3].16b, v6.16b
bit @v[1].16b, @v[4].16b, v6.16b
bit @v[2].16b, @v[5].16b, v6.16b
st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
ret
.size vec_select_$sz,.-vec_select_$sz
___
}
vec_select(48);
vec_select(96);
vec_select(192);
vec_select(144);
vec_select(288);
}
{
my ($inp, $end, $step) = map("x$_", (0..2));
$code.=<<___;
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
vec_prefetch:
add $end, $end, $inp
sub $end, $end, #1
mov $step, #64
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
prfm pldl1keep, [$inp]
ret
.size vec_prefetch,.-vec_prefetch
___
}
print $code;
close STDOUT;

blst/asm/add_mod_384-x86_64.pl Executable file

File diff suppressed because it is too large

@@ -0,0 +1,260 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";
# common accumulator layout
@acc=map("%r$_",(8..15));
############################################################ 384x384 add/sub
# Double-width addition/subtraction modulo n<<384, as opposed to
# naively expected modulo n*n. It works because n<<384 is the actual
# input boundary condition for Montgomery reduction, not n*n.
# Just in case, this is duplicated, but only one module is
# supposed to be linked...
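# A minimal illustration (hypothetical Python, in the spirit of the reference
# snippets used elsewhere in this tree), assuming both inputs are already
# below n<<384 as the Montgomery boundary condition guarantees:
#
#   def add_mod_384x384(a, b, n):
#       t = a + b
#       return t - (n << 384) if t >= (n << 384) else t
#
#   def sub_mod_384x384(a, b, n):
#       t = a - b
#       return t + (n << 384) if t < 0 else t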
{
my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected
# except for $n_ptr and $r_ptr
$code.=<<___;
.text
.type __add_mod_384x384,\@abi-omnipotent
.align 32
__add_mod_384x384:
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov 8*4($a_ptr), @acc[4]
mov 8*5($a_ptr), @acc[5]
mov 8*6($a_ptr), @acc[6]
add 8*0($b_org), @acc[0]
mov 8*7($a_ptr), @acc[7]
adc 8*1($b_org), @acc[1]
mov 8*8($a_ptr), @acc[8]
adc 8*2($b_org), @acc[2]
mov 8*9($a_ptr), @acc[9]
adc 8*3($b_org), @acc[3]
mov 8*10($a_ptr), @acc[10]
adc 8*4($b_org), @acc[4]
mov 8*11($a_ptr), @acc[11]
adc 8*5($b_org), @acc[5]
mov @acc[0], 8*0($r_ptr)
adc 8*6($b_org), @acc[6]
mov @acc[1], 8*1($r_ptr)
adc 8*7($b_org), @acc[7]
mov @acc[2], 8*2($r_ptr)
adc 8*8($b_org), @acc[8]
mov @acc[4], 8*4($r_ptr)
mov @acc[6], @acc[0]
adc 8*9($b_org), @acc[9]
mov @acc[3], 8*3($r_ptr)
mov @acc[7], @acc[1]
adc 8*10($b_org), @acc[10]
mov @acc[5], 8*5($r_ptr)
mov @acc[8], @acc[2]
adc 8*11($b_org), @acc[11]
mov @acc[9], @acc[3]
sbb $b_org, $b_org
sub 8*0($n_ptr), @acc[6]
sbb 8*1($n_ptr), @acc[7]
mov @acc[10], @acc[4]
sbb 8*2($n_ptr), @acc[8]
sbb 8*3($n_ptr), @acc[9]
sbb 8*4($n_ptr), @acc[10]
mov @acc[11], @acc[5]
sbb 8*5($n_ptr), @acc[11]
sbb \$0, $b_org
cmovc @acc[0], @acc[6]
cmovc @acc[1], @acc[7]
cmovc @acc[2], @acc[8]
mov @acc[6], 8*6($r_ptr)
cmovc @acc[3], @acc[9]
mov @acc[7], 8*7($r_ptr)
cmovc @acc[4], @acc[10]
mov @acc[8], 8*8($r_ptr)
cmovc @acc[5], @acc[11]
mov @acc[9], 8*9($r_ptr)
mov @acc[10], 8*10($r_ptr)
mov @acc[11], 8*11($r_ptr)
ret
.size __add_mod_384x384,.-__add_mod_384x384
.type __sub_mod_384x384,\@abi-omnipotent
.align 32
__sub_mod_384x384:
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov 8*4($a_ptr), @acc[4]
mov 8*5($a_ptr), @acc[5]
mov 8*6($a_ptr), @acc[6]
sub 8*0($b_org), @acc[0]
mov 8*7($a_ptr), @acc[7]
sbb 8*1($b_org), @acc[1]
mov 8*8($a_ptr), @acc[8]
sbb 8*2($b_org), @acc[2]
mov 8*9($a_ptr), @acc[9]
sbb 8*3($b_org), @acc[3]
mov 8*10($a_ptr), @acc[10]
sbb 8*4($b_org), @acc[4]
mov 8*11($a_ptr), @acc[11]
sbb 8*5($b_org), @acc[5]
mov @acc[0], 8*0($r_ptr)
sbb 8*6($b_org), @acc[6]
mov 8*0($n_ptr), @acc[0]
mov @acc[1], 8*1($r_ptr)
sbb 8*7($b_org), @acc[7]
mov 8*1($n_ptr), @acc[1]
mov @acc[2], 8*2($r_ptr)
sbb 8*8($b_org), @acc[8]
mov 8*2($n_ptr), @acc[2]
mov @acc[3], 8*3($r_ptr)
sbb 8*9($b_org), @acc[9]
mov 8*3($n_ptr), @acc[3]
mov @acc[4], 8*4($r_ptr)
sbb 8*10($b_org), @acc[10]
mov 8*4($n_ptr), @acc[4]
mov @acc[5], 8*5($r_ptr)
sbb 8*11($b_org), @acc[11]
mov 8*5($n_ptr), @acc[5]
sbb $b_org, $b_org
and $b_org, @acc[0]
and $b_org, @acc[1]
and $b_org, @acc[2]
and $b_org, @acc[3]
and $b_org, @acc[4]
and $b_org, @acc[5]
add @acc[0], @acc[6]
adc @acc[1], @acc[7]
mov @acc[6], 8*6($r_ptr)
adc @acc[2], @acc[8]
mov @acc[7], 8*7($r_ptr)
adc @acc[3], @acc[9]
mov @acc[8], 8*8($r_ptr)
adc @acc[4], @acc[10]
mov @acc[9], 8*9($r_ptr)
adc @acc[5], @acc[11]
mov @acc[10], 8*10($r_ptr)
mov @acc[11], 8*11($r_ptr)
ret
.size __sub_mod_384x384,.-__sub_mod_384x384
.globl add_mod_384x384
.hidden add_mod_384x384
.type add_mod_384x384,\@function,4,"unwind"
.align 32
add_mod_384x384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
call __add_mod_384x384
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size add_mod_384x384,.-add_mod_384x384
.globl sub_mod_384x384
.hidden sub_mod_384x384
.type sub_mod_384x384,\@function,4,"unwind"
.align 32
sub_mod_384x384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
call __sub_mod_384x384
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sub_mod_384x384,.-sub_mod_384x384
___
}
print $code;
close STDOUT;

blst/asm/arm-xlate.pl Executable file

@@ -0,0 +1,381 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ARM assembler distiller/adapter by \@dot-asm.
use strict;
################################################################
# Recognized "flavour"-s are:
#
# linux[32|64] GNU assembler, effectively pass-through
# ios[32|64] global symbols' decorations, PIC tweaks, etc.
# win[32|64] Visual Studio armasm-specific directives
# coff[32|64] e.g. clang --target=arm-windows ...
#
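# Typical invocations (illustrative; the generator scripts in this directory
# normally pipe their output through this adapter automatically):
#
#   perl add_mod_256-armv8.pl linux64 add_mod_256-armv8.S
#   perl arm-xlate.pl coff64 out.S < in.S
#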
my $flavour = shift;
$flavour = "linux" if (!$flavour or $flavour eq "void");
my $output = shift;
open STDOUT,">$output" or die "can't open $output: $!";
my %GLOBALS;
my $dotinlocallabels = ($flavour !~ /ios/) ? 1 : 0;
my $in_proc; # used with 'windows' flavour
################################################################
# directives which need special treatment on different platforms
################################################################
my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch
my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu
my $rodata = sub {
SWITCH: for ($flavour) {
/linux/ && return ".section\t.rodata";
/ios/ && return ".section\t__TEXT,__const";
/coff/ && return ".section\t.rdata,\"dr\"";
/win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8";
last;
}
};
my $hidden = sub {
if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); }
} if ($flavour !~ /linux/);
my $comm = sub {
my @args = split(/,\s*/,shift);
my $name = @args[0];
my $global = \$GLOBALS{$name};
my $ret;
if ($flavour =~ /ios32/) {
$ret = ".comm\t_$name,@args[1]\n";
$ret .= ".non_lazy_symbol_pointer\n";
$ret .= "$name:\n";
$ret .= ".indirect_symbol\t_$name\n";
$ret .= ".long\t0\n";
$ret .= ".previous";
$name = "_$name";
} elsif ($flavour =~ /win/) {
$ret = "\tCOMMON\t|$name|,@args[1]";
} elsif ($flavour =~ /coff/) {
$ret = ".comm\t$name,@args[1]";
} else {
$ret = ".comm\t".join(',',@args);
}
$$global = $name;
$ret;
};
my $globl = sub {
my $name = shift;
my $global = \$GLOBALS{$name};
my $ret;
SWITCH: for ($flavour) {
/ios/ && do { $name = "_$name"; last; };
/win/ && do { $ret = ""; last; };
}
$ret = ".globl $name" if (!defined($ret));
$$global = $name;
$ret;
};
my $global = $globl;
my $extern = sub {
&$globl(@_);
if ($flavour =~ /win/) {
return "\tEXTERN\t@_";
}
return; # return nothing
};
my $type = sub {
my $arg = join(',',@_);
my $ret;
SWITCH: for ($flavour) {
/ios32/ && do { if ($arg =~ /(\w+),\s*%function/) {
$ret = "#ifdef __thumb2__\n" .
".thumb_func $1\n" .
"#endif";
}
last;
};
/win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) {
my $type = "[DATA]";
if ($2 eq "function") {
$in_proc = $1;
$type = "[FUNC]";
}
$ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type"
: "";
}
last;
};
/coff/ && do { if ($arg =~ /(\w+),\s*%function/) {
$ret = ".def $1;\n".
".type 32;\n".
".endef";
}
last;
};
}
return $ret;
} if ($flavour !~ /linux/);
my $size = sub {
if ($in_proc && $flavour =~ /win/) {
$in_proc = undef;
return "\tENDP";
}
} if ($flavour !~ /linux/);
my $inst = sub {
if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); }
else { ".long\t".join(',',@_); }
} if ($flavour !~ /linux/);
my $asciz = sub {
my $line = join(",",@_);
if ($line =~ /^"(.*)"$/)
{ if ($flavour =~ /win/) {
"\tDCB\t$line,0\n\tALIGN\t4";
} else {
".byte " . join(",",unpack("C*",$1),0) . "\n.align 2";
}
} else { ""; }
};
my $align = sub {
"\tALIGN\t".2**@_[0];
} if ($flavour =~ /win/);
$align = sub {
".p2align\t".@_[0];
} if ($flavour =~ /coff/);
my $byte = sub {
"\tDCB\t".join(',',@_);
} if ($flavour =~ /win/);
my $short = sub {
"\tDCWU\t".join(',',@_);
} if ($flavour =~ /win/);
my $word = sub {
"\tDCDU\t".join(',',@_);
} if ($flavour =~ /win/);
my $long = $word if ($flavour =~ /win/);
my $quad = sub {
"\tDCQU\t".join(',',@_);
} if ($flavour =~ /win/);
my $skip = sub {
"\tSPACE\t".shift;
} if ($flavour =~ /win/);
my $code = sub {
"\tCODE@_[0]";
} if ($flavour =~ /win/);
my $thumb = sub { # .thumb should appear prior to .text in source
"# define ARM THUMB\n" .
"\tTHUMB";
} if ($flavour =~ /win/);
my $text = sub {
"\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM");
} if ($flavour =~ /win/);
my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax
my $rva = sub {
# .rva directive comes in handy only on 32-bit Windows, i.e. it can
# be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections.
# However! Corresponding compilers don't seem to bet on PIC, which
# raises the question of why an assembler programmer would have to jump
# through the hoops. But just in case, it would go as follows:
#
# ldr r1,.LOPENSSL_armcap
# ldr r2,.LOPENSSL_armcap+4
# adr r0,.LOPENSSL_armcap
# bic r1,r1,#1 ; de-thumb-ify link.exe's ideas
# sub r0,r0,r1 ; r0 is image base now
# ldr r0,[r0,r2]
# ...
#.LOPENSSL_armcap:
# .rva .LOPENSSL_armcap ; self-reference
# .rva OPENSSL_armcap_P ; real target
#
# Non-position-independent [and ISA-neutral] alternative is so much
# simpler:
#
# ldr r0,.LOPENSSL_armcap
# ldr r0,[r0]
# ...
#.LOPENSSL_armcap:
# .long OPENSSL_armcap_P
#
"\tDCDU\t@_[0]\n\tRELOC\t2"
} if ($flavour =~ /win(?!64)/);
################################################################
# some broken instructions in Visual Studio armasm[64]...
my $it = sub {} if ($flavour =~ /win32/); # omit 'it'
my $ext = sub {
"\text8\t".join(',',@_);
} if ($flavour =~ /win64/);
my $csel = sub {
my ($args,$comment) = split(m|\s*//|,shift);
my @regs = split(m|,\s*|,$args);
my $cond = pop(@regs);
"\tcsel$cond\t".join(',',@regs);
} if ($flavour =~ /win64/);
my $csetm = sub {
my ($args,$comment) = split(m|\s*//|,shift);
my @regs = split(m|,\s*|,$args);
my $cond = pop(@regs);
"\tcsetm$cond\t".join(',',@regs);
} if ($flavour =~ /win64/);
# ... then conditional branch instructions are also broken, but
# maintaining all the variants is tedious, so I kludge-fix it
# elsewhere...
################################################################
my $adrp = sub {
my ($args,$comment) = split(m|\s*//|,shift);
"\tadrp\t$args\@PAGE";
} if ($flavour =~ /ios64/);
my $paciasp = sub {
($flavour =~ /linux/) ? "\t.inst\t0xd503233f"
: &$inst(0xd503233f);
};
my $autiasp = sub {
($flavour =~ /linux/) ? "\t.inst\t0xd50323bf"
: &$inst(0xd50323bf);
};
sub range {
my ($r,$sfx,$start,$end) = @_;
join(",",map("$r$_$sfx",($start..$end)));
}
sub expand_line {
my $line = shift;
my @ret = ();
pos($line)=0;
while ($line =~ m/\G[^@\/\{\"]*/g) {
if ($line =~ m/\G(@|\/\/|$)/gc) {
last;
}
elsif ($line =~ m/\G\{/gc) {
my $saved_pos = pos($line);
$line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
pos($line) = $saved_pos;
$line =~ m/\G[^\}]*\}/g;
}
elsif ($line =~ m/\G\"/gc) {
$line =~ m/\G[^\"]*\"/g;
}
}
$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
if ($flavour =~ /win/) {
# adjust alignment hints, "[rN,:32]" -> "[rN@32]"
$line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/;
# adjust local labels, ".Lwhatever" -> "|$Lwhatever|"
$line =~ s/\.(L\w{2,})/|\$$1|/g;
# omit "#:lo12:" on win64
$line =~ s/#:lo12://;
} elsif ($flavour =~ /coff(?!64)/) {
$line =~ s/\.L(\w{2,})/(\$ML$1)/g;
} elsif ($flavour =~ /ios64/) {
$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
}
return $line;
}
while(my $line=<>) {
# fix up assembler-specific commentary delimiter
$line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/);
if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; }
$line =~ s|/\*.*\*/||; # get rid of C-style comments...
$line =~ s|^\s+||; # ... and skip white spaces in beginning...
$line =~ s|\s+$||; # ... and at the end
{
$line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel
$line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels);
}
{
$line =~ s|(^[\.\w]+)\:\s*||;
my $label = $1;
if ($label) {
$label = ($GLOBALS{$label} or $label);
if ($flavour =~ /win/) {
$label =~ s|^\.L(?=\w)|\$L|;
printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : "");
} else {
$label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/);
printf "%s:", $label;
}
}
}
if ($line !~ m/^[#@;]/) {
$line =~ s|^\s*(\.?)(\S+)\s*||;
my $c = $1; $c = "\t" if ($c eq "");
my $mnemonic = $2;
my $opcode;
if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
$opcode = eval("\$$1_$2");
} else {
$opcode = eval("\$$mnemonic");
}
my $arg=expand_line($line);
if (ref($opcode) eq 'CODE') {
$line = &$opcode($arg);
} elsif ($mnemonic) {
if ($flavour =~ /win64/) {
# "b.cond" -> "bcond", kludge-fix:-(
$mnemonic =~ s/^b\.([a-z]{2}$)/b$1/;
}
$line = $c.$mnemonic;
$line.= "\t$arg" if ($arg ne "");
}
}
print $line if ($line);
print "\n";
}
print "\tEND\n" if ($flavour =~ /win/);
close STDOUT;


@@ -0,0 +1,586 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~4,600 cycles on Apple M1, ~8,900 on
# Cortex-A57.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0
    k = 31
    mask = (1 << k) - 1
    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)
        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
        # __smul_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1
        # __smul_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1
    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1
    mod <<= 512 - mod.bit_length()  # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1 << 512:
        v -= mod
    return v & (2**512 - 1)  # to be reduced % mod
___
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(4..11));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17));
my $cnt = $n_ptr;
my @t = map("x$_",(19..26));
my ($a_lo, $b_lo) = @acc[3,7];
$frame = 16+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
paciasp
stp x29, x30, [sp,#-80]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
sub sp, sp, #$frame
ldp @acc[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
str $out_ptr, [sp]
ldp @acc[4], @acc[5], [$n_ptr,#8*0]
ldp @acc[6], @acc[7], [$n_ptr,#8*2]
stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b|
stp @acc[6], @acc[7], [$in_ptr,#8*6]
////////////////////////////////////////// first iteration
bl .Lab_approximation_31_256_loaded
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
str $f0,[$out_ptr,#8*8] // initialize |u| with |f0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to dst |b|
bl __smul_256_n_shift_by_31
str $f0, [$out_ptr,#8*9] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
ldr @acc[4], [$in_ptr,#8*8] // |u|
ldr @acc[5], [$in_ptr,#8*13] // |v|
madd @acc[0], $f_, @acc[4], xzr // |u|*|f0|
madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0|
str @acc[0], [$out_ptr,#8*4]
asr @acc[1], @acc[0], #63 // sign extension
stp @acc[1], @acc[1], [$out_ptr,#8*5]
stp @acc[1], @acc[1], [$out_ptr,#8*7]
madd @acc[0], $f0, @acc[4], xzr // |u|*|f1|
madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1|
str @acc[0], [$out_ptr,#8*9]
asr @acc[1], @acc[0], #63 // sign extension
stp @acc[1], @acc[1], [$out_ptr,#8*10]
stp @acc[1], @acc[1], [$out_ptr,#8*12]
___
for($i=2; $i<15; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add $out_ptr, $out_ptr, #8*4 // pointer to destination |u|
bl __smul_256x63
adc @t[3], @t[3], @t[4]
str @t[3], [$out_ptr,#8*4]
mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*5 // pointer to destination |v|
bl __smul_256x63
___
$code.=<<___ if ($i>7);
bl __smul_512x63_tail
___
$code.=<<___ if ($i<=7);
adc @t[3], @t[3], @t[4]
stp @t[3], @t[3], [$out_ptr,#8*4]
stp @t[3], @t[3], [$out_ptr,#8*6]
___
}
$code.=<<___;
////////////////////////////////////////// two[!] last iterations
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #47 // 31 + 512 % 31
//bl __ab_approximation_62_256 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
ldr $b_lo, [$in_ptr,#8*4]
bl __inner_loop_62_256
mov $f_, $f1
mov $g_, $g1
ldr $out_ptr, [sp] // original out_ptr
bl __smul_256x63
bl __smul_512x63_tail
ldr x30, [x29,#8]
smulh @t[1], @acc[3], $g_ // figure out top-most limb
ldp @acc[4], @acc[5], [$nx_ptr,#8*0]
adc @t[4], @t[4], @t[6]
ldp @acc[6], @acc[7], [$nx_ptr,#8*2]
add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1
asr @t[0], @t[1], #63 // sign as mask
and @t[4], @acc[4], @t[0] // add mod<<256 conditionally
and @t[5], @acc[5], @t[0]
adds @acc[0], @acc[0], @t[4]
and @t[6], @acc[6], @t[0]
adcs @acc[1], @acc[1], @t[5]
and @t[7], @acc[7], @t[0]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @t[3], @t[7]
adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1
neg @t[0], @t[1]
orr @t[1], @t[1], @t[0] // excess bit or sign as mask
asr @t[0], @t[0], #63 // excess bit as mask
and @acc[4], @acc[4], @t[1] // mask |mod|
and @acc[5], @acc[5], @t[1]
and @acc[6], @acc[6], @t[1]
and @acc[7], @acc[7], @t[1]
eor @acc[4], @acc[4], @t[0] // conditionally negate |mod|
eor @acc[5], @acc[5], @t[0]
adds @acc[4], @acc[4], @t[0], lsr#63
eor @acc[6], @acc[6], @t[0]
adcs @acc[5], @acc[5], xzr
eor @acc[7], @acc[7], @t[0]
adcs @acc[6], @acc[6], xzr
adc @acc[7], @acc[7], xzr
adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
stp @acc[0], @acc[1], [$out_ptr,#8*4]
adc @acc[3], @acc[3], @acc[7]
stp @acc[2], @acc[3], [$out_ptr,#8*6]
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldr x29, [sp],#80
autiasp
ret
.size ct_inverse_mod_256,.-ct_inverse_mod_256
////////////////////////////////////////////////////////////////////////
.type __smul_256x63, %function
.align 5
__smul_256x63:
___
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[4..7] if ($j);
my $k = 8*8+8*5*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldr @t[3+$j], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @t[3+$j], @t[3+$j], $f1
umulh @t[0], @acc[0], $f_
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $f_
adcs @t[3+$j], @t[3+$j], xzr
umulh @t[2], @acc[2], $f_
___
$code.=<<___ if ($j!=0);
adc $g1, xzr, xzr // used in __smul_512x63_tail
___
$code.=<<___;
mul @acc[0], @acc[0], $f_
cmp $f_, #0
mul @acc[1], @acc[1], $f_
csel @t[3+$j], @t[3+$j], xzr, ne
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @t[5+$j], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
adcs @t[5+$j], @t[5+$j], @t[2]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr
adds @acc[0], @acc[0], @acc[4]
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @t[5], @t[5], @t[6]
stp @acc[2], @t[5], [$out_ptr,#8*2]
ret
.size __smul_256x63,.-__smul_256x63
.type __smul_512x63_tail, %function
.align 5
__smul_512x63_tail:
umulh @t[5], @acc[3], $f_
ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v|
adc @t[7], @t[7], xzr
ldr @acc[3], [$in_ptr,#8*20]
and @t[3], @t[3], $f_
umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain
sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain
asr @t[6], @t[5], #63
eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v|
eor @acc[2], @acc[2], $f1
adds @acc[1], @acc[1], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
umulh @t[0], @t[4], $g_
adc @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $g_
add @acc[7], @acc[7], @t[7]
umulh @t[2], @acc[2], $g_
mul @acc[0], @t[4], $g_
mul @acc[1], @acc[1], $g_
adds @acc[0], @acc[0], @acc[7]
mul @acc[2], @acc[2], $g_
adcs @acc[1], @acc[1], @t[0]
mul @t[3], @acc[3], $g_
adcs @acc[2], @acc[2], @t[1]
adcs @t[3], @t[3], @t[2]
adc @t[4], xzr, xzr // used in the final step
adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*4]
adcs @t[3], @t[3], @t[6] // carry is used in the final step
stp @acc[2], @t[3], [$out_ptr,#8*6]
ret
.size __smul_512x63_tail,.-__smul_512x63_tail
.type __smul_256_n_shift_by_31, %function
.align 5
__smul_256_n_shift_by_31:
___
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[4..7] if ($j);
my $k = 8*4*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|)
eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|)
sub @t[6], @t[6], @t[5]
eor @acc[1], @acc[1], @t[5]
adds @acc[0], @acc[0], @t[5], lsr#63
eor @acc[2], @acc[2], @t[5]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[5]
umulh @t[0], @acc[0], @t[6]
adcs @acc[2], @acc[2], xzr
umulh @t[1], @acc[1], @t[6]
adc @acc[3], @acc[3], xzr
umulh @t[2], @acc[2], @t[6]
and @t[5], @t[5], @t[6]
umulh @t[3+$j], @acc[3], @t[6]
neg @t[5], @t[5]
mul @acc[0], @acc[0], @t[6]
mul @acc[1], @acc[1], @t[6]
mul @acc[2], @acc[2], @t[6]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], @t[1]
adcs @acc[3], @acc[3], @t[2]
adc @t[3+$j], @t[3+$j], @t[5]
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[4]
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
adcs @acc[3], @acc[3], @acc[7]
adc @acc[4], @t[3], @t[4]
extr @acc[0], @acc[1], @acc[0], #31
extr @acc[1], @acc[2], @acc[1], #31
extr @acc[2], @acc[3], @acc[2], #31
asr @t[4], @acc[4], #63 // result's sign as mask
extr @acc[3], @acc[4], @acc[3], #31
eor @acc[0], @acc[0], @t[4] // ensure the result is positive
eor @acc[1], @acc[1], @t[4]
adds @acc[0], @acc[0], @t[4], lsr#63
eor @acc[2], @acc[2], @t[4]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[4]
adcs @acc[2], @acc[2], xzr
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adc @acc[3], @acc[3], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
eor $f0, $f0, @t[4] // adjust |f/g| accordingly
eor $g0, $g0, @t[4]
sub $f0, $f0, @t[4]
sub $g0, $g0, @t[4]
ret
.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
___
{
my @a = @acc[0..3];
my @b = @acc[4..7];
my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]);
$code.=<<___;
.type __ab_approximation_31_256, %function
.align 4
__ab_approximation_31_256:
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*6]
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*4]
.Lab_approximation_31_256_loaded:
orr @t[0], @a[3], @b[3] // check top-most limbs, ...
cmp @t[0], #0
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
csel @a[2], @a[2], @a[1], ne
orr @t[0], @a[3], @b[3] // and ones before top-most, ...
csel @b[2], @b[2], @b[1], ne
cmp @t[0], #0
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
csel @a[2], @a[2], @a[0], ne
orr @t[0], @a[3], @b[3] // and one more, ...
csel @b[2], @b[2], @b[0], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
neg @t[1], @t[0]
lslv @a[3], @a[3], @t[0] // align high limbs to the left
lslv @b[3], @b[3], @t[0]
lsrv @a[2], @a[2], @t[1]
lsrv @b[2], @b[2], @t[1]
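// note: @t[1] is -@t[0], so "@t[1], asr#6" is all-ones for shift amounts
// 1..63 and zero when @t[0] is 0; the masks below drop the lower-limb
// fill-in exactly when no alignment shift was needed (an lsrv by 0 would
// otherwise leave it intact and corrupt the orr below)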
and @a[2], @a[2], @t[1], asr#6
and @b[2], @b[2], @t[1], asr#6
orr $a_lo, @a[3], @a[2]
orr $b_lo, @b[3], @b[2]
bfxil $a_lo, @a[0], #0, #31
bfxil $b_lo, @b[0], #0, #31
b __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
mov $cnt, #31
mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov $bias,#0x7FFFFFFF7FFFFFFF
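// f and g are kept packed in a single register each (f in the low half,
// g in the high half); every half carries the 0x7FFFFFFF bias so per-half
// results stay non-negative and subtractions never borrow across the
// 32-bit boundary; the bias is removed when f|g are unpacked after the loop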
.Loop_31_256:
sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
and @t[0], $b_lo, @t[3]
sub @t[1], $b_lo, $a_lo // |b_|-|a_|
subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $fg1
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1|
csel $fg0, $fg0, @t[0], hs
lsr $a_lo, $a_lo, #1
and @t[0], $fg1, @t[3]
and @t[1], $bias, @t[3]
sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add $fg1, $fg1, $fg1 // |f1|<<=1
add $fg0, $fg0, @t[1]
sub $fg1, $fg1, $bias
cbnz $cnt, .Loop_31_256
mov $bias, #0x7FFFFFFF
ubfx $f0, $fg0, #0, #32
ubfx $g0, $fg0, #32, #32
ubfx $f1, $fg1, #0, #32
ubfx $g1, $fg1, #32, #32
sub $f0, $f0, $bias // remove bias
sub $g0, $g0, $bias
sub $f1, $f1, $bias
sub $g1, $g1, $bias
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1
.Loop_62_256:
sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
and @t[0], $b_lo, @t[3]
sub @t[1], $b_lo, $a_lo // |b_|-|a_|
subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
mov @t[1], $g0
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
lsr $a_lo, $a_lo, #1
and @t[0], $f1, @t[3]
and @t[1], $g1, @t[3]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}
foreach(split("\n",$code)) {
s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,837 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0
    k = 31
    mask = (1 << k) - 1
    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)
        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
        # __smulq_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1
        # __smulq_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1
    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1
    mod <<= 512 - mod.bit_length()  # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1<<512:
        v -= mod
    return v & (2**512 - 1)  # to be reduced % mod
___
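#
# A small, illustrative check of the inner-loop matrix (not part of the
# build; the helper name is made up): for a non-negative a0 and an odd b0
# the reference above maintains f0*a0 + g0*b0 == a_<<k and
# f1*a0 + g1*b0 == b_<<k after k steps, which is what the
# __smulq_256_n_shift_by_31 step relies on when it recomputes |a| and |b|
# as (a*f0 + b*g0)>>k and (a*f1 + b*g1)>>k.
#
#   def check_inner_loop_31(a0, b0, k=31):
#       a_, b_, f0, g0, f1, g1 = a0, b0, 1, 0, 0, 1
#       for _ in range(k):
#           if a_ & 1:
#               if a_ < b_:
#                   a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
#               a_, f0, g0 = a_-b_, f0-f1, g0-g1
#           a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
#       assert f0*a0 + g0*b0 == a_ << k
#       assert f1*a0 + g1*b0 == b_ << k
#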
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15));
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edx";
$frame = 8*6+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256,\@function,4,"unwind"
.align 32
ct_inverse_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*0($n_ptr), @acc[4] # load modulus
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
mov 8*3($n_ptr), @acc[7]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax) # copy modulus to |b|
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax)
mov @acc[7], 8*7(%rax)
mov %rax, $in_ptr
################################# first iteration
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
#mov $f0, 8*0(%rsp) # corrected |f0|
#mov $g0, 8*1(%rsp) # corrected |g0|
mov $f0, 8*8($out_ptr) # initialize |u| with |f0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov $f0, 8*9($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov 8*8($in_ptr), @acc[0] # |u|
mov 8*13($in_ptr), @acc[4] # |v|
mov @acc[0], @acc[1]
imulq 8*0(%rsp), @acc[0] # |u|*|f0|
mov @acc[4], @acc[5]
imulq 8*1(%rsp), @acc[4] # |v|*|g0|
add @acc[4], @acc[0]
mov @acc[0], 8*4($out_ptr) # destination |u|
sar \$63, @acc[0] # sign extension
mov @acc[0], 8*5($out_ptr)
mov @acc[0], 8*6($out_ptr)
mov @acc[0], 8*7($out_ptr)
mov @acc[0], 8*8($out_ptr)
lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor
imulq $f0, @acc[1] # |u|*|f1|
imulq $g0, @acc[5] # |v|*|g1|
add @acc[5], @acc[1]
mov @acc[1], 8*9($out_ptr) # destination |v|
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
mov @acc[1], 8*12($out_ptr)
mov @acc[1], 8*13($out_ptr)
___
for($i=2; $i<15; $i++) {
my $smul_512x63 = $i>8 ? "__smulq_512x63"
: "__smulq_256x63";
$code.=<<___;
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
mov $f0, 8*2(%rsp) # corrected |f1|
mov $g0, 8*3(%rsp) # corrected |g1|
mov 8*0(%rsp), $f0 # |f0|
mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*4($out_ptr), $out_ptr # pointer to destination |u|
call __smulq_256x63
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*5($out_ptr),$out_ptr # pointer to destination |v|
call $smul_512x63
___
$code.=<<___ if ($i==8);
sar \$63, %rbp # sign extension
mov %rbp, 8*5($out_ptr)
mov %rbp, 8*6($out_ptr)
mov %rbp, 8*7($out_ptr)
___
}
$code.=<<___;
################################# two[!] last iterations in one go
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$47, $cnt # 31 + 512 % 31
#call __ab_approximation_31 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
#xor @acc[1], @acc[1] # |a_hi|
mov 8*4($in_ptr), @acc[2] # |b_lo|
#xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
#mov $f1, 8*2(%rsp)
#mov $g1, 8*3(%rsp)
#mov 8*0(%rsp), $f0 # |f0|
#mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulq_256x63
#mov 8*2(%rsp), $f0 # |f1|
#mov 8*3(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original |out_ptr|
call __smulq_512x63
adc %rbp, %rdx # the excess limb of the result
mov 8*5(%rsp), $in_ptr # original |nx_ptr|
mov %rdx, %rax
sar \$63, %rdx # result's sign as mask
mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx
add @acc[0], @acc[4] # conditionally add |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
adc \$0, %rax
mov %rax, %rdx
neg %rax
or %rax, %rdx # excess bit or sign as mask
sar \$63, %rax # excess bit as mask
mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx
xor %rax, @acc[0] # conditionally negate |modulus|
xor %rcx, %rcx
xor %rax, @acc[1]
sub %rax, %rcx
xor %rax, @acc[2]
xor %rax, %rdx
add %rcx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, %rdx
add @acc[0], @acc[4] # final adjustment for |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
mov @acc[4], 8*4($out_ptr) # store absolute value
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_inverse_mod_256,.-ct_inverse_mod_256
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
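#
# For illustration (not part of the build, helper name is made up): one
# such |u|*|f0| + |v|*|g0| step boils down to the following Python model,
# where the multiplier's sign becomes an all-ones/zero mask, the
# multiplicand is conditionally negated with it, and the now non-negative
# multiplier feeds the unsigned widening multiplies coded below.
#
#   def smul(x, f):
#       s = -1 if f < 0 else 0                 # f's sign as mask
#       return ((x ^ s) - s) * ((f ^ s) - s)   # (+-x) * |f| == x*f
#
#   acc = smul(u, f0) + smul(v, g0)
#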
$code.=<<___;
.type __smulq_512x63,\@abi-omnipotent
.align 32
__smulq_512x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), %rbp # sign limb
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit
xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |u|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, %rbp
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, %rbp
mulq %rbx # |u|*|f0|
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov @acc[$i], 8*$i($out_ptr)
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, %rbp
neg %rbp
mulq %rbx
add %rax, @acc[3]
adc %rdx, %rbp
mov @acc[3], 8*3($out_ptr)
mov 8*5($in_ptr), @acc[0] # load |v|
mov 8*6($in_ptr), @acc[1]
mov 8*7($in_ptr), @acc[2]
mov 8*8($in_ptr), @acc[3]
mov 8*9($in_ptr), @acc[4]
mov 8*10($in_ptr), @acc[5]
mov 8*11($in_ptr), @acc[6]
mov 8*12($in_ptr), @acc[7]
mov $g0, $f0
sar \$63, $f0 # |g0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |g0|'s sign as bit
xor $f0, $g0 # conditionally negate |g0|
add %rax, $g0
xor $f0, @acc[0] # conditionally negate |v|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
xor $f0, @acc[6]
xor $f0, @acc[7]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
mulq $g0
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<7; $i++) {
$code.=<<___;
mulq $g0
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
imulq $g0
add %rax, @acc[7]
adc \$0, %rdx # used in the final step
mov %rbp, %rbx
sar \$63, %rbp # sign extension
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc %rbx, @acc[4]
adc %rbp, @acc[5]
adc %rbp, @acc[6]
adc %rbp, @acc[7]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)
ret
.size __smulq_512x63,.-__smulq_512x63
.type __smulq_256x63,\@abi-omnipotent
.align 32
__smulq_256x63:
___
for($j=0; $j<2; $j++) {
my $k = 8*5*$j;
my @acc=@acc; @acc=@acc[4..7] if($j);
my $top="%rbp"; $top=$g0 if($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), $top # sign/excess limb
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)
xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |u| (or |v|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, $top
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, $top
mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, $top
neg $top
mulq %rbx
add %rax, @acc[3]
adc %rdx, $top
___
$code.=<<___ if ($j==0);
mov $g0, $f0
___
}
$code.=<<___;
add @acc[4], @acc[0] # accumulate |u|*|f0|
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc %rcx, %rbp
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov %rbp, 8*4($out_ptr)
ret
.size __smulq_256x63,.-__smulq_256x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
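#
# For illustration (not part of the build, helper name is made up): on
# exact integers the net effect of one such subroutine matches the
# reference model at the top of this file -- form the signed combination,
# shift it right by k, and if it came out negative fold the sign back
# into the multipliers so the value stays non-negative:
#
#   def smul_n_shift(a, f0, b, g0, k=31):
#       r = (a*f0 + b*g0) >> k
#       if r < 0:
#           r, f0, g0 = -r, -f0, -g0
#       return r, f0, g0
#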
{
$code.=<<___;
.type __smulq_256_n_shift_by_31,\@abi-omnipotent
.align 32
__smulq_256_n_shift_by_31:
mov $f0, 8*0($out_ptr) # offload |f0|
mov $g0, 8*1($out_ptr) # offload |g0|
mov $f0, %rbp
___
for($j=0; $j<2; $j++) {
my $k = 8*4*$j;
my @acc=@acc; @acc=@acc[4..7] if ($j);
my $f0="%rbp"; $f0=$g0 if ($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)
xor $f0, %rbx # conditionally negate |f0| (or |g0|)
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |a| (or |b|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
and %rbx, $f0
neg $f0
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
mulq %rbx
add %rax, @acc[3]
adc %rdx, $f0
___
}
$code.=<<___;
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc $g0, %rbp
mov 8*0($out_ptr), $f0 # restore original |f0|
mov 8*1($out_ptr), $g0 # restore original |g0|
shrd \$31, @acc[1], @acc[0]
shrd \$31, @acc[2], @acc[1]
shrd \$31, @acc[3], @acc[2]
shrd \$31, %rbp, @acc[3]
sar \$63, %rbp # sign as mask
xor %rax, %rax
sub %rbp, %rax # sign as bit
xor %rbp, @acc[0] # conditionally negate the result
xor %rbp, @acc[1]
xor %rbp, @acc[2]
xor %rbp, @acc[3]
add %rax, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
xor %rbp, $f0 # conditionally negate |f0|
xor %rbp, $g0 # conditionally negate |g0|
add %rax, $f0
add %rax, $g0
ret
.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31
___
}
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
$code.=<<___;
.type __ab_approximation_31_256,\@abi-omnipotent
.align 32
__ab_approximation_31_256:
mov 8*3($in_ptr), @a[2] # load |a| in reverse order
mov 8*7($in_ptr), @b[2] # load |b| in reverse order
mov 8*2($in_ptr), @a[1]
mov 8*6($in_ptr), @b[1]
mov 8*1($in_ptr), @a[0]
mov 8*5($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*0($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*4($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[2]
cmovz @b[0], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
mov \$0x7FFFFFFF, %eax
and %rax, @a[0]
and %rax, @b[0]
not %rax
and %rax, @a[2]
and %rax, @b[2]
or @a[2], @a[0]
or @b[2], @b[0]
jmp __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
___
}
$code.=<<___;
.type __inner_loop_31_256,\@abi-omnipotent
.align 32 # comment and punish Coffee Lake by up to 40%
__inner_loop_31_256: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
mov \$0x7FFFFFFF7FFFFFFF, $bias
.Loop_31_256:
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
mov $fg0, $t2
mov $fg1, $t3
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
shr \$1, $a_ # |a_|>>=1
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_31_256
shr \$32, $bias
mov %ecx, %edx # $fg0, $f0
mov ${fg1}d, ${f1}d
shr \$32, $g0
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256,\@abi-omnipotent
.align 32
__inner_loop_62_256:
mov $cnt, %r15d
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov $f0, $g1 # |g1|=1
mov $f0, %r14
.Loop_62_256:
xor $t0, $t0
test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t1
cmovnz $b_lo, $t0
sub $a_lo, $t1 # |b_|-|a_|
mov $a_lo, $t2
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t2, $b_lo # |b_| = |a_|
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shr \$1, $a_lo
test %r14, $t2 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, %r15d
jnz .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}
print $code;
close STDOUT;


@ -0,0 +1,610 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0
    k = 62
    w = 64
    mask = (1 << w) - 1
    for i in range(0, 766 // k):
        # __ab_approximation_62
        n = max(a.bit_length(), b.bit_length())
        if n < 128:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
        # __inner_loop_62
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
        # __smul_383_n_shift_by_62
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1
        # __smul_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1
    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1
    if v < 0:
        v += mod << (768 - mod.bit_length())  # left aligned
    return v & (2**768 - 1)  # to be reduced % mod
___
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(3..14));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21));
my $cnt = $n_ptr;
my @t = map("x$_",(22..28,2));
my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11];
$frame = 16+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #$frame
ldp @t[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
ldp @acc[4], @acc[5], [$in_ptr,#8*4]
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
stp $out_ptr, $nx_ptr, [sp]
ldp @acc[6], @acc[7], [$n_ptr,#8*0]
ldp @acc[8], @acc[9], [$n_ptr,#8*2]
ldp @acc[10], @acc[11], [$n_ptr,#8*4]
stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4]
stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b|
stp @acc[8], @acc[9], [$in_ptr,#8*8]
stp @acc[10], @acc[11], [$in_ptr,#8*10]
////////////////////////////////////////// first iteration
mov $cnt, #62
bl .Lab_approximation_62_loaded
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str $f0,[$out_ptr,#8*12] // initialize |u| with |f0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str $f0, [$out_ptr,#8*12] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
ldr @acc[4], [$in_ptr,#8*12] // |u|
ldr @acc[5], [$in_ptr,#8*18] // |v|
mul @acc[0], $f_, @acc[4] // |u|*|f0|
smulh @acc[1], $f_, @acc[4]
mul @acc[2], $g_, @acc[5] // |v|*|g0|
smulh @acc[3], $g_, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*8]
stp @acc[2], @acc[2], [$out_ptr,#8*10]
mul @acc[0], $f0, @acc[4] // |u|*|f1|
smulh @acc[1], $f0, @acc[4]
mul @acc[2], $g0, @acc[5] // |v|*|g1|
smulh @acc[3], $g0, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*12]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*14]
stp @acc[2], @acc[2], [$out_ptr,#8*16]
___
for($i=2; $i<11; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add $out_ptr, $out_ptr, #8*6 // pointer to destination |u|
bl __smul_383x63
mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |v|
bl __smul_383x63
___
$code.=<<___ if ($i>5);
bl __smul_767x63_tail
___
$code.=<<___ if ($i==5);
asr @t[5], @t[5], #63 // sign extension
stp @t[5], @t[5], [$out_ptr,#8*6]
stp @t[5], @t[5], [$out_ptr,#8*8]
stp @t[5], @t[5], [$out_ptr,#8*10]
___
}
$code.=<<___;
////////////////////////////////////////// iteration before last
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load
ldp $b_lo, $b_hi, [$in_ptr,#8*6]
bl __inner_loop_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
str $a_lo, [$out_ptr,#8*0]
str $b_lo, [$out_ptr,#8*6]
mov $f_, $f0 // exact |f0|
mov $g_, $g0 // exact |g0|
mov $f0, $f1
mov $g0, $g1
add $out_ptr, $out_ptr, #8*12 // pointer to dst |u|
bl __smul_383x63
mov $f_, $f0 // exact |f1|
mov $g_, $g0 // exact |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
eor $a_hi, $a_hi, $a_hi
ldr $b_lo, [$in_ptr,#8*6]
eor $b_hi, $b_hi, $b_hi
bl __inner_loop_62
mov $f_, $f1
mov $g_, $g1
ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
asr @t[0], @acc[5], #63 // sign as mask
ldp @acc[6], @acc[7], [$f0,#8*0]
ldp @acc[8], @acc[9], [$f0,#8*2]
ldp @acc[10], @acc[11], [$f0,#8*4]
and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally
and @acc[7], @acc[7], @t[0]
adds @acc[0], @acc[0], @acc[6]
and @acc[8], @acc[8], @t[0]
adcs @acc[1], @acc[1], @acc[7]
and @acc[9], @acc[9], @t[0]
adcs @acc[2], @acc[2], @acc[8]
and @acc[10], @acc[10], @t[0]
adcs @acc[3], @acc[3], @acc[9]
and @acc[11], @acc[11], @t[0]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @acc[11]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
autiasp
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
.type __smul_383x63, %function
.align 5
__smul_383x63:
___
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*12+8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $f_
eor @acc[5], @acc[5], $f1
umulh @t[1], @acc[1], $f_
adcs @acc[4], @acc[4], xzr
umulh @t[2], @acc[2], $f_
adcs @acc[5], @acc[5], xzr
umulh @t[3], @acc[3], $f_
___
$code.=<<___ if ($j);
adc $g1, xzr, xzr // used in __smul_767x63_tail
___
$code.=<<___;
umulh @t[4], @acc[4], $f_
mul @acc[0], @acc[0], $f_
mul @acc[1], @acc[1], $f_
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], $f_
adcs @acc[3], @acc[3], @t[2]
mul @t[5+$j],@acc[5], $f_
adcs @acc[4], @acc[4], @t[3]
adcs @t[5+$j],@t[5+$j],@t[4]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adcs @t[5], @t[5], @t[6]
stp @acc[4], @t[5], [$out_ptr,#8*4]
adc @t[6], @t[7], xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh @t[5], @acc[5], $f_
ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v|
umulh @acc[11],@acc[11], $g_
ldp @acc[2], @acc[3], [$in_ptr,#8*26]
ldp @acc[4], @acc[5], [$in_ptr,#8*28]
eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v|
eor @acc[1], @acc[1], $f1
eor @acc[2], @acc[2], $f1
adds @acc[0], @acc[0], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[5], @acc[5], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $g_
adcs @acc[4], @acc[4], xzr
umulh @t[1], @acc[1], $g_
adc @acc[5], @acc[5], xzr
umulh @t[2], @acc[2], $g_
add @acc[11], @acc[11], @t[6]
umulh @t[3], @acc[3], $g_
asr @t[6], @t[5], #63
umulh @t[4], @acc[4], $g_
mul @acc[0], @acc[0], $g_
mul @acc[1], @acc[1], $g_
mul @acc[2], @acc[2], $g_
adds @acc[0], @acc[0], @acc[11]
mul @acc[3], @acc[3], $g_
adcs @acc[1], @acc[1], @t[0]
mul @acc[4], @acc[4], $g_
adcs @acc[2], @acc[2], @t[1]
mul @acc[5], @acc[5], $g_
adcs @acc[3], @acc[3], @t[2]
adcs @acc[4], @acc[4], @t[3]
adc @acc[5], @acc[5], @t[4]
adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @acc[3], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @t[6]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @t[6]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
___
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|)
sub @t[7], @t[7], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
umulh @t[0], @acc[0], @t[7]
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], @t[7]
eor @acc[5], @acc[5], @t[6]
umulh @t[2], @acc[2], @t[7]
adcs @acc[4], @acc[4], xzr
umulh @t[3], @acc[3], @t[7]
adc @acc[5], @acc[5], xzr
umulh @t[4], @acc[4], @t[7]
smulh @t[5+$j], @acc[5], @t[7]
mul @acc[0], @acc[0], @t[7]
mul @acc[1], @acc[1], @t[7]
mul @acc[2], @acc[2], @t[7]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[7]
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], @t[7]
adcs @acc[3], @acc[3], @t[2]
mul @acc[5], @acc[5], @t[7]
adcs @acc[4], @acc[4], @t[3]
adcs @acc[5], @acc[5] ,@t[4]
adc @t[5+$j], @t[5+$j], xzr
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
adcs @acc[4], @acc[4], @acc[10]
adcs @acc[5], @acc[5], @acc[11]
adc @acc[6], @t[5], @t[6]
extr @acc[0], @acc[1], @acc[0], #62
extr @acc[1], @acc[2], @acc[1], #62
extr @acc[2], @acc[3], @acc[2], #62
asr @t[6], @acc[6], #63
extr @acc[3], @acc[4], @acc[3], #62
extr @acc[4], @acc[5], @acc[4], #62
extr @acc[5], @acc[6], @acc[5], #62
eor @acc[0], @acc[0], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
adcs @acc[3], @acc[3], xzr
eor @acc[5], @acc[5], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adc @acc[5], @acc[5], xzr
stp @acc[4], @acc[5], [$out_ptr,#8*4]
eor $f0, $f0, @t[6]
eor $g0, $g0, @t[6]
sub $f0, $f0, @t[6]
sub $g0, $g0, @t[6]
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
___
{
my @a = @acc[0..5];
my @b = @acc[6..11];
$code.=<<___;
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp @a[4], @a[5], [$in_ptr,#8*4]
ldp @b[4], @b[5], [$in_ptr,#8*10]
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*8]
.Lab_approximation_62_loaded:
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[3], ne
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
csel @b[4], @b[4], @b[3], ne
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*6]
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[2], ne
orr @t[0], @a[5], @b[5] // ... and ones before that ...
csel @b[4], @b[4], @b[2], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[1], ne
orr @t[0], @a[5], @b[5]
csel @b[4], @b[4], @b[1], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
neg @t[1], @t[0]
lslv @a[5], @a[5], @t[0] // align high limbs to the left
lslv @b[5], @b[5], @t[0]
lsrv @a[4], @a[4], @t[1]
lsrv @b[4], @b[4], @t[1]
and @a[4], @a[4], @t[1], asr#6
and @b[4], @b[4], @t[1], asr#6
orr @a[5], @a[5], @a[4]
orr @b[5], @b[5], @b[4]
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
___
}
$code.=<<___;
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1
.Loop_62:
sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
subs @t[2], $b_lo, $a_lo // |b_|-|a_|
and @t[0], $b_lo, @t[6]
sbc @t[3], $b_hi, $a_hi
and @t[1], $b_hi, @t[6]
subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
sbcs @t[5], $a_hi, @t[1]
mov @t[1], $g0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $b_hi, $b_hi, $a_hi, hs
csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $a_hi, @t[5], @t[3], hs
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
extr $a_lo, $a_hi, $a_lo, #1
lsr $a_hi, $a_hi, #1
and @t[0], $f1, @t[6]
and @t[1], $g1, @t[6]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
___
print $code;
close STDOUT;


@ -0,0 +1,398 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast quadratic residue test as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change
    k = 30
    w = 32
    mask = (1 << w) - 1
    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                    L += (a_ & b_) >> 1  # |a| and |b| are both odd, second bits
                                         # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1  # |b| is always odd, the second bit
                               # tells the whole story
    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                    L += (a & b) >> 1  # |a| and |b| are both odd, second bits
                                       # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
    return (L & 1) ^ 1
___
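#
# The two parity updates above are the classical Jacobi-symbol rules,
# tracked only modulo 2:
#
#   with both operands odd, (a_ & b_) >> 1 is odd exactly when
#   a_ % 4 == 3 and b_ % 4 == 3, i.e. when swapping the two operands
#   flips the symbol (quadratic reciprocity);
#
#   (b_ + 2) >> 2 is odd exactly when b_ % 8 is 3 or 5, i.e. when the
#   factor of 2 pulled out of |a| is a non-residue mod |b|.
#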
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2));
my @acc=map("x$_",(3..14));
my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20));
my @t = map("x$_",(21..28));
my ($a_, $b_) = @acc[5,11];
$frame = 2*256;
$code.=<<___;
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384, %function
.align 5
ct_is_square_mod_384:
paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #$frame
ldp @acc[0], @acc[1], [x0,#8*0] // load input
ldp @acc[2], @acc[3], [x0,#8*2]
ldp @acc[4], @acc[5], [x0,#8*4]
add $in_ptr, sp, #255 // find closest 256-byte-aligned spot
and $in_ptr, $in_ptr, #-256 // in the frame...
ldp @acc[6], @acc[7], [x1,#8*0] // load modulus
ldp @acc[8], @acc[9], [x1,#8*2]
ldp @acc[10], @acc[11], [x1,#8*4]
stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*8]
stp @acc[4], @acc[5], [$in_ptr,#8*10]
stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b|
stp @acc[8], @acc[9], [$in_ptr,#8*2]
stp @acc[10], @acc[11], [$in_ptr,#8*4]
eor $L, $L, $L // init the Legendre symbol
mov $cnt, #24 // 24 is 768/30-1
b .Loop_is_square
.align 4
.Loop_is_square:
bl __ab_approximation_30
sub $cnt, $cnt, #1
eor $out_ptr, $in_ptr, #128 // pointer to dst |b|
bl __smul_384_n_shift_by_30
mov $f1, $f0 // |f0|
mov $g1, $g0 // |g0|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |a|
bl __smul_384_n_shift_by_30
ldp @acc[6], @acc[7], [$out_ptr,#-8*6]
eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b|
and @t[6], @t[6], @acc[6] // if |a| was negative,
add $L, $L, @t[6], lsr#1 // adjust |L|
cbnz $cnt, .Loop_is_square
////////////////////////////////////////// last iteration
//bl __ab_approximation_30 // |a| and |b| are exact,
//ldr $a_, [$in_ptr,#8*6] // just load
mov $b_, @acc[6] // ldr $b_, [$in_ptr,#8*0]
mov $cnt, #48 // 48 is 768%30 + 30
bl __inner_loop_48
ldr x30, [x29,#8]
and x0, $L, #1
eor x0, x0, #1
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
autiasp
ret
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smul_384_n_shift_by_30, %function
.align 5
__smul_384_n_shift_by_30:
___
for($j=0; $j<2; $j++) {
my $fx = $g1; $fx = $f1 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|)
asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|)
sub $fx, $fx, @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
umulh @t[0], @acc[0], $fx
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $fx
eor @acc[5], @acc[5], @t[6]
umulh @t[2], @acc[2], $fx
adcs @acc[4], @acc[4], xzr
umulh @t[3], @acc[3], $fx
adc @acc[5], @acc[5], xzr
umulh @t[4], @acc[4], $fx
and @t[7], $fx, @t[6]
umulh @t[5+$j], @acc[5], $fx
neg @t[7], @t[7]
mul @acc[0], @acc[0], $fx
mul @acc[1], @acc[1], $fx
mul @acc[2], @acc[2], $fx
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], $fx
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], $fx
adcs @acc[3], @acc[3], @t[2]
mul @acc[5], @acc[5], $fx
adcs @acc[4], @acc[4], @t[3]
adcs @acc[5], @acc[5] ,@t[4]
adc @t[5+$j], @t[5+$j], @t[7]
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
adcs @acc[4], @acc[4], @acc[10]
adcs @acc[5], @acc[5], @acc[11]
adc @acc[6], @t[5], @t[6]
extr @acc[0], @acc[1], @acc[0], #30
extr @acc[1], @acc[2], @acc[1], #30
extr @acc[2], @acc[3], @acc[2], #30
asr @t[6], @acc[6], #63
extr @acc[3], @acc[4], @acc[3], #30
extr @acc[4], @acc[5], @acc[4], #30
extr @acc[5], @acc[6], @acc[5], #30
eor @acc[0], @acc[0], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
adcs @acc[3], @acc[3], xzr
eor @acc[5], @acc[5], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adc @acc[5], @acc[5], xzr
stp @acc[4], @acc[5], [$out_ptr,#8*4]
ret
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
___
{
my @a = @acc[0..5];
my @b = @acc[6..11];
my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]);
$code.=<<___;
.type __ab_approximation_30, %function
.align 4
__ab_approximation_30:
ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers
ldp @b[2], @b[3], [$in_ptr,#8*2]
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[3], ne
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
csel @b[4], @b[4], @b[3], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[2], ne
orr @t[0], @a[5], @b[5] // ... and ones before that ...
csel @b[4], @b[4], @b[2], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[1], ne
orr @t[0], @a[5], @b[5] // and one more, ...
csel @b[4], @b[4], @b[1], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[0], ne
orr @t[0], @a[5], @b[5]
csel @b[4], @b[4], @b[0], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
neg @t[1], @t[0]
lslv @a[5], @a[5], @t[0] // align high limbs to the left
lslv @b[5], @b[5], @t[0]
lsrv @a[4], @a[4], @t[1]
lsrv @b[4], @b[4], @t[1]
and @a[4], @a[4], @t[1], asr#6
and @b[4], @b[4], @t[1], asr#6
orr $a_, @a[5], @a[4]
orr $b_, @b[5], @b[4]
bfxil $a_, @a[0], #0, #32
bfxil $b_, @b[0], #0, #32
b __inner_loop_30
ret
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30, %function
.align 4
__inner_loop_30:
mov $cnt, #30
mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov $bias,#0x7FFFFFFF7FFFFFFF
.Loop_30:
sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting
and @t[4], $a_, $b_
sub $cnt, $cnt, #1
and @t[0], $b_, @t[3]
sub @t[1], $b_, $a_ // |b_|-|a_|
subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1
mov @t[0], $fg1
csel $b_, $b_, $a_, hs // |b_| = |a_|
csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1|
csel $fg0, $fg0, @t[0], hs
csel $L, $L, @t[4], hs
lsr $a_, $a_, #1
and @t[0], $fg1, @t[3]
and @t[1], $bias, @t[3]
add $t[2], $b_, #2
sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add $fg1, $fg1, $fg1 // |f1|<<=1
add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5
add $fg0, $fg0, @t[1]
sub $fg1, $fg1, $bias
cbnz $cnt, .Loop_30
mov $bias, #0x7FFFFFFF
ubfx $f0, $fg0, #0, #32
ubfx $g0, $fg0, #32, #32
ubfx $f1, $fg1, #0, #32
ubfx $g1, $fg1, #32, #32
sub $f0, $f0, $bias // remove the bias
sub $g0, $g0, $bias
sub $f1, $f1, $bias
sub $g1, $g1, $bias
ret
.size __inner_loop_30,.-__inner_loop_30
___
}
$code.=<<___;
.type __inner_loop_48, %function
.align 4
__inner_loop_48:
.Loop_48:
sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting
and @t[4], $a_, $b_
sub $cnt, $cnt, #1
and @t[0], $b_, @t[3]
sub @t[1], $b_, $a_ // |b_|-|a_|
subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
add @t[4], $L, @t[4], lsr#1
csel $b_, $b_, $a_, hs // |b_| = |a_|
csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $L, $L, @t[4], hs
add $t[2], $b_, #2
lsr $a_, $a_, #1
add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5
cbnz $cnt, .Loop_48
ret
.size __inner_loop_48,.-__inner_loop_48
___
print $code;
close STDOUT;


@ -0,0 +1,494 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast quadratic residue test as suggested in
# https://eprint.iacr.org/2020/972. Performance is >5x better than
# modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change
    k = 30
    w = 32
    mask = (1 << w) - 1
    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                    L += (a_ & b_) >> 1  # |a| and |b| are both odd, second bits
                                         # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1  # |b| is always odd, the second bit
                               # tells the whole story
    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                    L += (a & b) >> 1  # |a| and |b| are both odd, second bits
                                       # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
    return (L & 1) ^ 1
___
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr) = ("%rdi", "%rsi");
my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx");
my @acc=map("%r$_",(8..15));
my $L = "%rbp";
$frame = 8*3+2*256;
$code.=<<___;
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384,\@function,2,"unwind"
.align 32
ct_is_square_mod_384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot
and \$-256, %rax # in the frame...
mov 8*0(%rdi), @acc[0] # load input
mov 8*1(%rdi), @acc[1]
mov 8*2(%rdi), @acc[2]
mov 8*3(%rdi), @acc[3]
mov 8*4(%rdi), @acc[4]
mov 8*5(%rdi), @acc[5]
mov 8*0(%rsi), @acc[6] # load modulus
mov 8*1(%rsi), @acc[7]
mov 8*2(%rsi), %rbx
mov 8*3(%rsi), %rcx
mov 8*4(%rsi), %rdx
mov 8*5(%rsi), %rdi
mov %rax, $in_ptr # pointer to source |a|b|
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax)
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax) # copy modulus to |b|
mov @acc[7], 8*7(%rax)
mov %rbx, 8*8(%rax)
mov %rcx, 8*9(%rax)
mov %rdx, 8*10(%rax)
mov %rdi, 8*11(%rax)
xor $L, $L # initialize the Legendre symbol
mov \$24, %ecx # 24 is 768/30-1
jmp .Loop_is_square
.align 32
.Loop_is_square:
mov %ecx, 8*2(%rsp) # offload loop counter
call __ab_approximation_30
mov $f0, 8*0(%rsp) # offload |f0| and |g0|
mov $g0, 8*1(%rsp)
mov \$128+8*6, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |b|
call __smulq_384_n_shift_by_30
mov 8*0(%rsp), $f1 # pop |f0| and |g0|
mov 8*1(%rsp), $g1
lea -8*6($out_ptr),$out_ptr # pointer to destination |a|
call __smulq_384_n_shift_by_30
mov 8*2(%rsp), %ecx # re-load loop counter
xor \$128, $in_ptr # flip-flop pointer to source |a|b|
and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L|
shr \$1, @acc[6]
add @acc[6], $L
sub \$1, %ecx
jnz .Loop_is_square
################################# last iteration
#call __ab_approximation_30 # |a| and |b| are exact, just load
#mov 8*0($in_ptr), @acc[0] # |a_|
mov 8*6($in_ptr), @acc[1] # |b_|
call __inner_loop_48 # 48 is 768%30+30
mov \$1, %rax
and $L, %rax
xor \$1, %rax # return value
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smulq_384_n_shift_by_30,\@abi-omnipotent
.align 32
__smulq_384_n_shift_by_30:
___
for($j=0; $j<2; $j++) {
$code.=<<___;
mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov %rdx, %rbx # |f1| (or |g1|)
sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s)
xor %rax, %rax
sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s)
xor %rdx, %rbx # conditionally negate |f1| (or |g1|)
add %rax, %rbx
xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
xor %rdx, @acc[1]
xor %rdx, @acc[2]
xor %rdx, @acc[3]
xor %rdx, @acc[4]
xor %rdx, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov %rdx, @acc[6+$j]
and %rbx, @acc[6+$j]
mulq %rbx # |a|*|f1| (or |b|*|g1|)
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
neg @acc[6+$j]
mulq %rbx
add %rax, @acc[5]
adc %rdx, @acc[6+$j]
___
$code.=<<___ if ($j==0);
lea 8*6($in_ptr), $in_ptr # pointer to |b|
mov $g1, %rdx
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
add 8*0($out_ptr), @acc[0]
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
adc @acc[7], @acc[6]
shrd \$30, @acc[1], @acc[0]
shrd \$30, @acc[2], @acc[1]
shrd \$30, @acc[3], @acc[2]
shrd \$30, @acc[4], @acc[3]
shrd \$30, @acc[5], @acc[4]
shrd \$30, @acc[6], @acc[5]
sar \$63, @acc[6] # sign as mask
xor %rbx, %rbx
sub @acc[6], %rbx # sign as bit
xor @acc[6], @acc[0] # conditionally negate the result
xor @acc[6], @acc[1]
xor @acc[6], @acc[2]
xor @acc[6], @acc[3]
xor @acc[6], @acc[4]
xor @acc[6], @acc[5]
add %rbx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
ret
.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
___
{
my ($a_, $b_) = @acc[0..1];
my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15));
my ($fg0, $fg1, $bias) = ($g0, $g1, $t5);
my $cnt = "%edi";
{
my @a = @acc[0..5];
my @b = (@a[1..3], $t4, $t5, $g0);
$code.=<<___;
.type __ab_approximation_30,\@abi-omnipotent
.align 32
__ab_approximation_30:
mov 8*11($in_ptr), @b[5] # load |b| in reverse order
mov 8*10($in_ptr), @b[4]
mov 8*9($in_ptr), @b[3]
mov @a[5], %rax
or @b[5], %rax # check top-most limbs, ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[3], @a[4]
mov 8*8($in_ptr), @b[2]
cmovz @b[3], @b[4]
mov @a[5], %rax
or @b[5], %rax # ... ones before top-most, ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[2], @a[4]
mov 8*7($in_ptr), @b[1]
cmovz @b[2], @b[4]
mov @a[5], %rax
or @b[5], %rax # ... and ones before that ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[1], @a[4]
mov 8*6($in_ptr), @b[0]
cmovz @b[1], @b[4]
mov @a[5], %rax
or @b[5], %rax # ... and ones before that ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[0], @a[4]
cmovz @b[0], @b[4]
mov @a[5], %rax
or @b[5], %rax
bsr %rax, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[5]
cmovz @b[0], @b[5]
cmovz %rax, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[4], @a[5] # align second limb to the left
shldq %cl, @b[4], @b[5]
mov \$0xFFFFFFFF00000000, %rax
mov @a[0]d, ${a_}d
mov @b[0]d, ${b_}d
and %rax, @a[5]
and %rax, @b[5]
or @a[5], ${a_}
or @b[5], ${b_}
jmp __inner_loop_30
ret
.size __ab_approximation_30,.-__ab_approximation_30
___
}
$code.=<<___;
.type __inner_loop_30,\@abi-omnipotent
.align 32
__inner_loop_30: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF
mov \$30, $cnt
.Loop_30:
mov $a_, %rax
and $b_, %rax
shr \$1, %rax # (a_ & b_) >> 1
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
lea (%rax,$L), %rax # pre-"negate" |L|
mov $fg0, $t2
mov $fg1, $t3
mov $L, $t4
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
cmovb %rax, $L
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
cmovz $t4, $L
lea 2($b_), %rax
shr \$1, $a_ # |a_|>>=1
shr \$2, %rax
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_30
shr \$32, $bias
mov %ebx, %eax # $fg0 -> $f0
shr \$32, $g0
mov %ecx, %edx # $fg1 -> $f1
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_30,.-__inner_loop_30
.type __inner_loop_48,\@abi-omnipotent
.align 32
__inner_loop_48:
mov \$48, $cnt # 48 is 768%30+30
.Loop_48:
mov $a_, %rax
and $b_, %rax
shr \$1, %rax # (a_ & b_) >> 1
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
lea (%rax,$L), %rax
mov $L, $t2
cmovb $b_, $a_
cmovb $t0, $b_
cmovb %rax, $L
sub $b_, $a_ # |a_|-|b_|
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $L
lea 2($b_), %rax
shr \$1, $a_ # |a_|>>=1
shr \$2, %rax
add %rax, $L # "negate" |L| if |b|%8 is 3 or 5
sub \$1, $cnt
jnz .Loop_48
ret
.size __inner_loop_48,.-__inner_loop_48
___
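# A small model of the packed |f|/|g| encoding used by __inner_loop_30 above:
# each signed factor is stored biased by 0x7FFFFFFF in one half of a 64-bit
# register, so a single add/sub updates both halves without the low half
# borrowing into the high one; the shr/sub sequence at the end of the loop is
# the unpack() below. Illustrative sketch in a hypothetical $python_demo holder.
$python_demo.=<<'___';
BIAS = 0x7FFFFFFF
BIAS64 = (BIAS << 32) | BIAS           # the loop's $bias constant
MASK64 = (1 << 64) - 1

def pack(f, g):                        # two small signed values -> one 64-bit word
    return (((g + BIAS) << 32) | (f + BIAS)) & MASK64

def unpack(fg):                        # remove the bias again
    return (fg & 0xFFFFFFFF) - BIAS, (fg >> 32) - BIAS

fg0, fg1 = pack(1, 0), pack(0, 1)      # the loop's initial |f0|,|g0| and |f1|,|g1|
assert fg0 == 0x7FFFFFFF80000000 and fg1 == 0x800000007FFFFFFF

# "sub $fg1,$fg0; add $bias,$fg0" subtracts both halves at once:
assert unpack((fg0 - fg1 + BIAS64) & MASK64) == (1 - 0, 0 - 1)
# "add $fg1,$fg1; sub $bias,$fg1" doubles both halves at once:
assert unpack((fg1 + fg1 - BIAS64) & MASK64) == (0 * 2, 1 * 2)
___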
}
print $code;
close STDOUT;

View file

@ -0,0 +1,886 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >5x better than
# modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
a, u = inp, 1
b, v = mod, 0
k = 62
w = 64
mask = (1 << w) - 1
for i in range(0, 766 // k):
# __ab_approximation_62
n = max(a.bit_length(), b.bit_length())
if n < 128:
a_, b_ = a, b
else:
a_ = (a & mask) | ((a >> (n-w)) << w)
b_ = (b & mask) | ((b >> (n-w)) << w)
# __inner_loop_62
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, k):
if a_ & 1:
if a_ < b_:
a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
a_, f0, g0 = a_-b_, f0-f1, g0-g1
a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
# __smulq_383_n_shift_by_62
a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
if a < 0:
a, f0, g0 = -a, -f0, -g0
if b < 0:
b, f1, g1 = -b, -f1, -g1
# __smulq_767x63
u, v = u*f0 + v*g0, u*f1 + v*g1
if 766 % k:
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, 766 % k):
if a & 1:
if a < b:
a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
a, f0, g0 = a-b, f0-f1, g0-g1
a, f1, g1 = a >> 1, f1 << 1, g1 << 1
v = u*f1 + v*g1
if v < 0:
v += mod << (768 - mod.bit_length()) # left aligned
return v & (2**768 - 1) # to be reduced % mod
___
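# For cross-checking small cases: the variable-time binary extended GCD that the
# fixed-iteration reference above is derived from (illustrative only, in a
# hypothetical $python_demo holder; assumes an odd modulus and gcd(inp, mod) == 1).
$python_demo.=<<'___';
def binary_eea_inverse(inp, mod):
    a, u = inp % mod, 1                # invariant: a == u*inp (mod mod)
    b, v = mod, 0                      # invariant: b == v*inp (mod mod)
    while a:
        if a & 1 == 0:
            a >>= 1
            u = u >> 1 if u & 1 == 0 else (u + mod) >> 1   # halve u mod `mod`
        elif a < b:
            a, b, u, v = b, a, v, u
        else:
            a, u = a - b, (u - v) % mod
    assert b == 1                      # gcd(inp, mod) must be 1
    return v % mod

p = 2**127 - 1                         # any odd modulus works for the check
x = 0x1234567890abcdef
assert binary_eea_inverse(x, p) * x % p == 1
___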
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edi";
$frame = 8*11+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383,\@function,4,"unwind"
.align 32
ct_inverse_mod_383:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*0($n_ptr), @acc[6] # load modulus
mov 8*1($n_ptr), @acc[7]
mov 8*2($n_ptr), @acc[8]
mov 8*3($n_ptr), @acc[9]
mov 8*4($n_ptr), @acc[10]
mov 8*5($n_ptr), @acc[11]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax)
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax) # copy modulus to |b|
mov @acc[7], 8*7(%rax)
mov @acc[8], 8*8(%rax)
mov @acc[9], 8*9(%rax)
mov @acc[10], 8*10(%rax)
mov %rax, $in_ptr # pointer to source |a|b|1|0|
mov @acc[11], 8*11(%rax)
################################# first iteration
mov \$62, $cnt
call __ab_approximation_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_383_n_shift_by_62
#mov $f0, 8*7(%rsp) # corrected |f0|
#mov $g0, 8*8(%rsp) # corrected |g0|
mov $f0, 8*12($out_ptr) # initialize |u| with |f0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_383_n_shift_by_62
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov $f0, 8*12($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$62, $cnt
call __ab_approximation_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_383_n_shift_by_62
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_383_n_shift_by_62
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*12($in_ptr), %rax # |u|
mov 8*18($in_ptr), @acc[3] # |v|
mov $f0, %rbx
mov %rax, @acc[2]
imulq 8*7(%rsp) # |u|*|f0|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq 8*8(%rsp) # |v|*|g0|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*6($out_ptr) # destination |u|
mov @acc[1], 8*7($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*8($out_ptr)
mov @acc[1], 8*9($out_ptr)
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor
mov @acc[2], %rax
imulq %rbx # |u|*|f1|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq %rcx # |v|*|g1|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*12($out_ptr) # destination |v|
mov @acc[1], 8*13($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*14($out_ptr)
mov @acc[1], 8*15($out_ptr)
mov @acc[1], 8*16($out_ptr)
mov @acc[1], 8*17($out_ptr)
___
for($i=2; $i<11; $i++) {
my $smul_767x63 = $i>5 ? "__smulq_767x63"
: "__smulq_383x63";
$code.=<<___;
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$62, $cnt
call __ab_approximation_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_383_n_shift_by_62
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_383_n_shift_by_62
mov $f0, 8*9(%rsp) # corrected |f1|
mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*7(%rsp), $f0 # |f0|
mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
call __smulq_383x63
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
call $smul_767x63
___
$code.=<<___ if ($i==5);
sar \$63, @acc[5] # sign extension
mov @acc[5], 8*6($out_ptr)
mov @acc[5], 8*7($out_ptr)
mov @acc[5], 8*8($out_ptr)
mov @acc[5], 8*9($out_ptr)
mov @acc[5], 8*10($out_ptr)
mov @acc[5], 8*11($out_ptr)
___
}
$code.=<<___;
################################# iteration before last
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$62, $cnt
#call __ab_approximation_62 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
mov 8*1($in_ptr), @acc[1] # |a_hi|
mov 8*6($in_ptr), @acc[2] # |b_lo|
mov 8*7($in_ptr), @acc[3] # |b_hi|
call __inner_loop_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
mov @acc[0], 8*0($out_ptr)
mov @acc[2], 8*6($out_ptr)
#mov 8*7(%rsp), $f0 # |f0|
#mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*12($out_ptr),$out_ptr # pointer to destination |u|
call __smulq_383x63
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
call __smulq_767x63
################################# last iteration
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$22, $cnt # 766 % 62
#call __ab_approximation_62 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
xor @acc[1], @acc[1] # |a_hi|
mov 8*6($in_ptr), @acc[2] # |b_lo|
xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
#mov $f1, 8*9(%rsp)
#mov $g1, 8*10(%rsp)
#mov 8*7(%rsp), $f0 # |f0|
#mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulq_383x63
#mov 8*9(%rsp), $f0 # |f1|
#mov 8*10(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original out_ptr
call __smulq_767x63
mov 8*5(%rsp), $in_ptr # original n_ptr
mov %rax, %rdx # top limb of the result
sar \$63, %rax # result's sign as mask
mov %rax, @acc[0] # mask |modulus|
mov %rax, @acc[1]
mov %rax, @acc[2]
and 8*0($in_ptr), @acc[0]
and 8*1($in_ptr), @acc[1]
mov %rax, @acc[3]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), @acc[3]
mov %rax, @acc[4]
and 8*4($in_ptr), @acc[4]
and 8*5($in_ptr), %rax
add @acc[0], @acc[6] # conditionally add |modulus|<<384
adc @acc[1], @acc[7]
adc @acc[2], @acc[8]
adc @acc[3], @acc[9]
adc @acc[4], %rcx
adc %rax, %rdx
mov @acc[6], 8*6($out_ptr) # store absolute value
mov @acc[7], 8*7($out_ptr)
mov @acc[8], 8*8($out_ptr)
mov @acc[9], 8*9($out_ptr)
mov %rcx, 8*10($out_ptr)
mov %rdx, 8*11($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_inverse_mod_383,.-ct_inverse_mod_383
___
########################################################################
# see corresponding commentary in ctx_inverse_mod_384-x86_64...
{
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
my $fx = @acc[9];
$code.=<<___;
.type __smulq_767x63,\@abi-omnipotent
.align 32
__smulq_767x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov $f0, $fx
sar \$63, $f0 # |f0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit
mov $out_ptr, 8*1(%rsp)
mov $in_ptr, 8*2(%rsp)
lea 8*6($in_ptr), $in_ptr # pointer to |v|
xor $f0, $fx # conditionally negate |f0|
add %rax, $fx
xor $f0, @acc[0] # conditionally negate |u|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulq $fx # |u|*|f0|
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
mov @acc[$i], 8*$i($out_ptr)
___
}
$code.=<<___;
imulq $fx
add %rax, @acc[$i]
adc \$0, %rdx
mov @acc[5], 8*5($out_ptr)
mov %rdx, 8*6($out_ptr)
sar \$63, %rdx # sign extension
mov %rdx, 8*7($out_ptr)
___
{
my $fx=$in_ptr;
$code.=<<___;
mov $g0, $f0 # load |g0|
mov 8*0($in_ptr), @acc[0] # load |v|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*6($in_ptr), @acc[6]
mov 8*7($in_ptr), @acc[7]
mov 8*8($in_ptr), @acc[8]
mov 8*9($in_ptr), @acc[9]
mov 8*10($in_ptr), @acc[10]
mov 8*11($in_ptr), @acc[11]
mov $f0, $fx # overrides in_ptr
sar \$63, $f0 # |g0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |g0|'s sign as bit
xor $f0, $fx # conditionally negate |g0|
add %rax, $fx
xor $f0, @acc[0] # conditionally negate |v|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
xor $f0, @acc[6]
xor $f0, @acc[7]
xor $f0, @acc[8]
xor $f0, @acc[9]
xor $f0, @acc[10]
xor $f0, @acc[11]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
adc \$0, @acc[8]
adc \$0, @acc[9]
adc \$0, @acc[10]
adc \$0, @acc[11]
mulq $fx # |v|*|g0|
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<11; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
mov 8*1(%rsp), %rdx # out_ptr
imulq $fx, %rax
mov 8*2(%rsp), $in_ptr # restore original in_ptr
add @acc[11], %rax
add 8*0(%rdx), @acc[0] # accumulate |u|*|f0|
adc 8*1(%rdx), @acc[1]
adc 8*2(%rdx), @acc[2]
adc 8*3(%rdx), @acc[3]
adc 8*4(%rdx), @acc[4]
adc 8*5(%rdx), @acc[5]
adc 8*6(%rdx), @acc[6]
mov 8*7(%rdx), @acc[11] # sign extension
adc @acc[11], @acc[7]
adc @acc[11], @acc[8]
adc @acc[11], @acc[9]
adc @acc[11], @acc[10]
adc @acc[11], %rax
mov %rdx, $out_ptr # restore original out_ptr
mov @acc[0], 8*0(%rdx)
mov @acc[1], 8*1(%rdx)
mov @acc[2], 8*2(%rdx)
mov @acc[3], 8*3(%rdx)
mov @acc[4], 8*4(%rdx)
mov @acc[5], 8*5(%rdx)
mov @acc[6], 8*6(%rdx)
mov @acc[7], 8*7(%rdx)
mov @acc[8], 8*8(%rdx)
mov @acc[9], 8*9(%rdx)
mov @acc[10], 8*10(%rdx)
mov %rax, 8*11(%rdx)
ret
.size __smulq_767x63,.-__smulq_767x63
___
}
$code.=<<___;
.type __smulq_383x63,\@abi-omnipotent
.align 32
__smulq_383x63:
___
for($j=0; $j<2; $j++) {
$code.=<<___;
mov 8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov %rdx, $fx
sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)
xor %rdx, $fx # conditionally negate |f0|
add %rax, $fx
xor %rdx, @acc[0] # conditionally negate |u| (or |v|)
xor %rdx, @acc[1]
xor %rdx, @acc[2]
xor %rdx, @acc[3]
xor %rdx, @acc[4]
xor %rdx, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulq $fx # |u|*|f0| (or |v|*|g0|)
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___ if ($j==0);
imulq $fx, %rax
add %rax, @acc[$i]
lea 8*6($in_ptr), $in_ptr # pointer to |v|
mov $g0, %rdx
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
imulq $fx, %rax
add %rax, @acc[$i]
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
ret
.size __smulq_383x63,.-__smulq_383x63
___
{
$code.=<<___;
.type __smulq_383_n_shift_by_62,\@abi-omnipotent
.align 32
__smulq_383_n_shift_by_62:
mov $f0, @acc[8]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
$code.=<<___;
mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov %rdx, $fx
sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)
xor %rdx, $fx # conditionally negate |f0| (or |g0|)
add %rax, $fx
xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
xor %rdx, @acc[1]
xor %rdx, @acc[2]
xor %rdx, @acc[3]
xor %rdx, @acc[4]
xor %rdx, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulq $fx # |a|*|f0| (or |b|*|g0|)
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___ if ($j==0);
imulq $fx
add %rax, @acc[$i]
adc \$0, %rdx
lea 8*6($in_ptr), $in_ptr # pointer to |b|
mov %rdx, @acc[6]
mov $g0, %rdx
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
imulq $fx
add %rax, @acc[$i]
adc \$0, %rdx
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
add 8*0($out_ptr), @acc[0]
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
adc %rdx, @acc[6]
mov $f0, %rdx
shrd \$62, @acc[1], @acc[0]
shrd \$62, @acc[2], @acc[1]
shrd \$62, @acc[3], @acc[2]
shrd \$62, @acc[4], @acc[3]
shrd \$62, @acc[5], @acc[4]
shrd \$62, @acc[6], @acc[5]
sar \$63, @acc[6] # sign as mask
xor $fx, $fx
sub @acc[6], $fx # sign as bit
xor @acc[6], @acc[0] # conditionally negate the result
xor @acc[6], @acc[1]
xor @acc[6], @acc[2]
xor @acc[6], @acc[3]
xor @acc[6], @acc[4]
xor @acc[6], @acc[5]
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
xor @acc[6], %rdx # conditionally negate |f0|
xor @acc[6], $g0 # conditionally negate |g0|
add $fx, %rdx
add $fx, $g0
ret
.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62
___
} }
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi");
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
$code.=<<___;
.type __ab_approximation_62,\@abi-omnipotent
.align 32
__ab_approximation_62:
mov 8*5($in_ptr), @a[2] # load |a| in reverse order
mov 8*11($in_ptr), @b[2] # load |b| in reverse order
mov 8*4($in_ptr), @a[1]
mov 8*10($in_ptr), @b[1]
mov 8*3($in_ptr), @a[0]
mov 8*9($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov 8*2($in_ptr), @a[0]
mov 8*8($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... ones before top-most, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov 8*1($in_ptr), @a[0]
mov 8*7($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov 8*0($in_ptr), @a[0]
mov 8*6($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
jmp __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
___
}
$code.=<<___;
.type __inner_loop_62,\@abi-omnipotent
.align 8
.long 0
__inner_loop_62:
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov \$1, $g1 # |g1|=1
mov $in_ptr, 8(%rsp)
.Loop_62:
xor $t0, $t0
xor $t1, $t1
test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t2
mov $b_hi, $t3
cmovnz $b_lo, $t0
cmovnz $b_hi, $t1
sub $a_lo, $t2 # |b_|-|a_|
sbb $a_hi, $t3
mov $a_lo, $t4
mov $a_hi, $t5
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
sbb $t1, $a_hi
cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t3, $a_hi
cmovc $t4, $b_lo # |b_| = |a_|
cmovc $t5, $b_hi
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shrd \$1, $a_hi, $a_lo
shr \$1, $a_hi
test \$1, $t4 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, $cnt
jnz .Loop_62
mov 8(%rsp), $in_ptr
ret
.size __inner_loop_62,.-__inner_loop_62
___
}
print $code;
close STDOUT;

View file

@ -0,0 +1,995 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >4x better than
# modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
a, u = inp, 1
b, v = mod, 0
k = 31
mask = (1 << k) - 1
for i in range(0, 766 // k):
# __ab_approximation_31
n = max(a.bit_length(), b.bit_length())
if n < 64:
a_, b_ = a, b
else:
a_ = (a & mask) | ((a >> (n-k-2)) << k)
b_ = (b & mask) | ((b >> (n-k-2)) << k)
# __inner_loop_31
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, k):
if a_ & 1:
if a_ < b_:
a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
a_, f0, g0 = a_-b_, f0-f1, g0-g1
a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
# __smulx_383_n_shift_by_31
a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
if a < 0:
a, f0, g0 = -a, -f0, -g0
if b < 0:
b, f1, g1 = -b, -f1, -g1
# __smulx_767x63
u, v = u*f0 + v*g0, u*f1 + v*g1
if 766 % k:
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, 766 % k):
if a & 1:
if a < b:
a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
a, f0, g0 = a-b, f0-f1, g0-g1
a, f1, g1 = a >> 1, f1 << 1, g1 << 1
v = u*f1 + v*g1
if v < 0:
v += mod << (768 - mod.bit_length()) # left aligned
return v & (2**768 - 1) # to be reduced % mod
___
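# Unlike the k=62 reference earlier, this variant approximates |a| and |b| with
# the k low bits kept exactly plus the k+2 = 33 most significant bits spliced on
# top, so one 64-bit register feeds 31 inner-loop iterations. Standalone
# restatement of that packing (illustrative, hypothetical $python_demo holder):
$python_demo.=<<'___';
def approximate_31(x, n, k=31):
    # n is max(a.bit_length(), b.bit_length()) of the pair being approximated
    if n < 64:
        return x
    mask = (1 << k) - 1
    return (x & mask) | ((x >> (n - k - 2)) << k)

a = (1 << 383) | 0x55aa55aa            # e.g. a 384-bit value
ax = approximate_31(a, a.bit_length())
assert ax.bit_length() <= 64
assert ax & ((1 << 31) - 1) == a & ((1 << 31) - 1)     # low 31 bits preserved
assert ax >> 31 == a >> (a.bit_length() - 33)          # top 33 bits preserved
___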
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edi";
$frame = 8*11+2*512;
$code.=<<___;
.text
.globl ctx_inverse_mod_383
.type ctx_inverse_mod_383,\@function,4,"unwind"
.align 32
ctx_inverse_mod_383:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*0($n_ptr), @acc[6] # load modulus
mov 8*1($n_ptr), @acc[7]
mov 8*2($n_ptr), @acc[8]
mov 8*3($n_ptr), @acc[9]
mov 8*4($n_ptr), @acc[10]
mov 8*5($n_ptr), @acc[11]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax)
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax) # copy modulus to |b|
mov @acc[7], 8*7(%rax)
mov @acc[8], 8*8(%rax)
mov @acc[9], 8*9(%rax)
mov @acc[10], 8*10(%rax)
mov %rax, $in_ptr
mov @acc[11], 8*11(%rax)
################################# first iteration
mov \$31, $cnt
call __ab_approximation_31
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulx_383_n_shift_by_31
#mov $f0, 8*7(%rsp) # corrected |f0|
#mov $g0, 8*8(%rsp) # corrected |g0|
mov $f0, 8*12($out_ptr) # initialize |u| with |f0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulx_383_n_shift_by_31
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov $f0, 8*12($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulx_383_n_shift_by_31
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulx_383_n_shift_by_31
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*12($in_ptr), %rax # |u|
mov 8*18($in_ptr), @acc[3] # |v|
mov $f0, %rbx
mov %rax, @acc[2]
imulq 8*7(%rsp) # |u|*|f0|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq 8*8(%rsp) # |v|*|g0|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*6($out_ptr) # destination |u|
mov @acc[1], 8*7($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*8($out_ptr)
mov @acc[1], 8*9($out_ptr)
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor
mov @acc[2], %rax
imulq %rbx # |u|*|f1|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq %rcx # |v|*|g1|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*12($out_ptr) # destination |v|
mov @acc[1], 8*13($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*14($out_ptr)
mov @acc[1], 8*15($out_ptr)
mov @acc[1], 8*16($out_ptr)
mov @acc[1], 8*17($out_ptr)
___
for($i=2; $i<23; $i++) {
my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31"
: "__smulx_191_n_shift_by_31";
my $smul_767x63 = $i>11 ? "__smulx_767x63"
: "__smulx_383x63";
$code.=<<___;
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call $smul_n_shift
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call $smul_n_shift
mov $f0, 8*9(%rsp) # corrected |f1|
mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*7(%rsp), $f0 # |f0|
mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
call __smulx_383x63
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
call $smul_767x63
___
$code.=<<___ if ($i==11);
sar \$63, @acc[5] # sign extension
mov @acc[5], 8*6($out_ptr)
mov @acc[5], 8*7($out_ptr)
mov @acc[5], 8*8($out_ptr)
mov @acc[5], 8*9($out_ptr)
mov @acc[5], 8*10($out_ptr)
mov @acc[5], 8*11($out_ptr)
___
}
$code.=<<___;
################################# two[!] last iterations in one go
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$53, $cnt # 31 + 766 % 31
#call __ab_approximation_31 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
#xor @acc[1], @acc[1] # |a_hi|
mov 8*6($in_ptr), @acc[2] # |b_lo|
#xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
#mov $f1, 8*9(%rsp)
#mov $g1, 8*10(%rsp)
#mov 8*7(%rsp), $f0 # |f0|
#mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulx_383x63
#mov 8*9(%rsp), $f0 # |f1|
#mov 8*10(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original out_ptr
call __smulx_767x63
mov 8*5(%rsp), $in_ptr # original n_ptr
mov %rax, %rdx # top limb of the result
sar \$63, %rax # result's sign as mask
mov %rax, @acc[0] # mask |modulus|
mov %rax, @acc[1]
mov %rax, @acc[2]
and 8*0($in_ptr), @acc[0]
and 8*1($in_ptr), @acc[1]
mov %rax, @acc[3]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), @acc[3]
mov %rax, @acc[4]
and 8*4($in_ptr), @acc[4]
and 8*5($in_ptr), %rax
add @acc[0], @acc[6] # conditionally add |modulus|<<384
adc @acc[1], @acc[7]
adc @acc[2], @acc[8]
adc @acc[3], @acc[9]
adc @acc[4], %rcx
adc %rax, %rdx
mov @acc[6], 8*6($out_ptr) # store absolute value
mov @acc[7], 8*7($out_ptr)
mov @acc[8], 8*8($out_ptr)
mov @acc[9], 8*9($out_ptr)
mov %rcx, 8*10($out_ptr)
mov %rdx, 8*11($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ctx_inverse_mod_383,.-ctx_inverse_mod_383
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
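# At word level the contract of these helpers is simply a signed u*f + v*g,
# with each factor's sign folded into its operand first (the sar/xor/add
# sequences in the subroutines that follow), so the limb-wise multiplication
# itself is unsigned. Value-level sketch (illustrative, hypothetical
# $python_demo holder):
$python_demo.=<<'___';
def smul_acc(u, f, v, g):
    if f < 0:
        u, f = -u, -f                  # fold |f|'s sign into |u|
    if g < 0:
        v, g = -v, -g                  # fold |g|'s sign into |v|
    return u*f + v*g

assert smul_acc(7, -3, 5, 2) == 7*-3 + 5*2
___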
{
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
my $fx = @acc[9];
$code.=<<___;
.type __smulx_767x63,\@abi-omnipotent
.align 32
__smulx_767x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov $f0, %rax
sar \$63, %rax # |f0|'s sign as mask
xor $fx, $fx # overrides in_ptr
sub %rax, $fx # |f0|'s sign as bit
mov $out_ptr, 8*1(%rsp)
mov $in_ptr, 8*2(%rsp)
lea 8*6($in_ptr), $in_ptr # pointer to |v|
xor %rax, $f0 # conditionally negate |f0|
add $fx, $f0
xor %rax, @acc[0] # conditionally negate |u|
xor %rax, @acc[1]
xor %rax, @acc[2]
xor %rax, @acc[3]
xor %rax, @acc[4]
xor @acc[5], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, %rax
mulx @acc[0], @acc[0], $fx # |u|*|f0|
mulx @acc[1], @acc[1], @acc[5]
add $fx, @acc[1]
___
for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___;
adc \$0, $fx
imulq %rdx
add $fx, %rax
adc \$0, %rdx
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov %rax, 8*5($out_ptr)
mov %rdx, 8*6($out_ptr)
sar \$63, %rdx # sign extension
mov %rdx, 8*7($out_ptr)
___
{
my $fx=$in_ptr;
$code.=<<___;
mov $g0, $f0 # load |g0|
mov $g0, %rax
mov 8*0($in_ptr), @acc[0] # load |v|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*6($in_ptr), @acc[6]
mov 8*7($in_ptr), @acc[7]
mov 8*8($in_ptr), @acc[8]
mov 8*9($in_ptr), @acc[9]
mov 8*10($in_ptr), @acc[10]
mov 8*11($in_ptr), @acc[11]
sar \$63, %rax # |g0|'s sign as mask
xor $fx, $fx # overrides in_ptr
sub %rax, $fx # |g0|'s sign as bit
xor %rax, $f0 # conditionally negate |g0|
add $fx, $f0
xor %rax, @acc[0] # conditionally negate |v|
xor %rax, @acc[1]
xor %rax, @acc[2]
xor %rax, @acc[3]
xor %rax, @acc[4]
xor %rax, @acc[5]
xor %rax, @acc[6]
xor %rax, @acc[7]
xor %rax, @acc[8]
xor %rax, @acc[9]
xor %rax, @acc[10]
xor %rax, @acc[11]
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
adc \$0, @acc[8]
adc \$0, @acc[9]
adc \$0, @acc[10]
adc \$0, @acc[11]
mulx @acc[0], @acc[0], %rax # |v|*|g0|
mulx @acc[1], @acc[1], $fx
add %rax, @acc[1]
___
for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___;
mulx @acc[11], @acc[11], $fx
mov 8*1(%rsp), %rdx # out_ptr
mov 8*2(%rsp), $in_ptr # restore original in_ptr
adc @acc[11], %rax
add 8*0(%rdx), @acc[0] # accumulate |u|*|f0|
adc 8*1(%rdx), @acc[1]
adc 8*2(%rdx), @acc[2]
adc 8*3(%rdx), @acc[3]
adc 8*4(%rdx), @acc[4]
adc 8*5(%rdx), @acc[5]
adc 8*6(%rdx), @acc[6]
mov 8*7(%rdx), @acc[11] # sign extension
adc @acc[11], @acc[7]
adc @acc[11], @acc[8]
adc @acc[11], @acc[9]
adc @acc[11], @acc[10]
adc @acc[11], %rax
mov %rdx, $out_ptr # restore original out_ptr
mov @acc[0], 8*0(%rdx)
mov @acc[1], 8*1(%rdx)
mov @acc[2], 8*2(%rdx)
mov @acc[3], 8*3(%rdx)
mov @acc[4], 8*4(%rdx)
mov @acc[5], 8*5(%rdx)
mov @acc[6], 8*6(%rdx)
mov @acc[7], 8*7(%rdx)
mov @acc[8], 8*8(%rdx)
mov @acc[9], 8*9(%rdx)
mov @acc[10], 8*10(%rdx)
mov %rax, 8*11(%rdx)
ret
.size __smulx_767x63,.-__smulx_767x63
___
}
$code.=<<___;
.type __smulx_383x63,\@abi-omnipotent
.align 32
__smulx_383x63:
___
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), @acc[4]
mov $k+8*5($in_ptr), @acc[5]
mov $f0, $fx
sar \$63, $fx # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $fx, %rax # |f0|'s sign as bit (or |g0|'s)
xor $fx, $f0 # conditionally negate |f0|
add %rax, $f0
xor $fx, @acc[0] # conditionally negate |u| (or |v|)
xor $fx, @acc[1]
xor $fx, @acc[2]
xor $fx, @acc[3]
xor $fx, @acc[4]
xor $fx, @acc[5]
add %rax, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|)
mulx @acc[1], @acc[1], %rax
add $fx, @acc[1]
___
for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___ if ($j==0);
mulx @acc[$i], @acc[$i], %rax
mov $g0, $f0
adc $fx, @acc[$i]
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
mulx @acc[$i], @acc[$i], %rax
adc $fx, @acc[$i]
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
ret
.size __smulx_383x63,.-__smulx_383x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
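# Value-level contract, matching the "a, b = (a*f0 + b*g0) >> k, ..." lines of
# the Python reference at the top of this file: shift the signed combination
# right by k, take the absolute value, and flip the returned |f|,|g| so the
# caller keeps working with non-negative |a|,|b|. Illustrative sketch
# (hypothetical $python_demo holder):
$python_demo.=<<'___';
def smul_n_shift(a, b, f, g, k=31):
    t = (a*f + b*g) >> k
    if t < 0:
        return -t, -f, -g              # negate the result and the factors together
    return t, f, g

assert smul_n_shift(100 << 31, 3 << 31, -1, 0) == (100, 1, 0)
___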
{
$code.=<<___;
.type __smulx_383_n_shift_by_31,\@abi-omnipotent
.align 32
__smulx_383_n_shift_by_31:
mov $f0, @acc[8]
xor @acc[6], @acc[6]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), @acc[4]
mov $k+8*5($in_ptr), @acc[5]
mov %rdx, %rax
sar \$63, %rax # |f0|'s sign as mask (or |g0|'s)
xor $fx, $fx
sub %rax, $fx # |f0|'s sign as bit (or |g0|'s)
xor %rax, %rdx # conditionally negate |f0| (or |g0|)
add $fx, %rdx
xor %rax, @acc[0] # conditionally negate |a| (or |b|)
xor %rax, @acc[1]
xor %rax, @acc[2]
xor %rax, @acc[3]
xor %rax, @acc[4]
xor @acc[5], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, %rax
mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|)
mulx @acc[1], @acc[1], @acc[5]
add $fx, @acc[1]
___
for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___ if ($j==0);
adc \$0, $fx
imulq %rdx
add $fx, %rax
adc %rdx, @acc[6]
mov $g0, %rdx
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov %rax, 8*5($out_ptr)
___
}
$code.=<<___;
adc \$0, $fx
imulq %rdx
add $fx, %rax
adc \$0, %rdx
add 8*0($out_ptr), @acc[0]
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), %rax
adc %rdx, @acc[6]
mov $f0, %rdx
shrd \$31, @acc[1], @acc[0]
shrd \$31, @acc[2], @acc[1]
shrd \$31, @acc[3], @acc[2]
shrd \$31, @acc[4], @acc[3]
shrd \$31, %rax, @acc[4]
shrd \$31, @acc[6], %rax
sar \$63, @acc[6] # sign as mask
xor $fx, $fx
sub @acc[6], $fx # sign as bit
xor @acc[6], @acc[0] # conditionally negate the result
xor @acc[6], @acc[1]
xor @acc[6], @acc[2]
xor @acc[6], @acc[3]
xor @acc[6], @acc[4]
xor @acc[6], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, %rax
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov %rax, 8*5($out_ptr)
xor @acc[6], %rdx # conditionally negate |f0|
xor @acc[6], $g0 # conditionally negate |g0|
add $fx, %rdx
add $fx, $g0
ret
.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31
___
} {
$code.=<<___;
.type __smulx_191_n_shift_by_31,\@abi-omnipotent
.align 32
__smulx_191_n_shift_by_31:
mov $f0, @acc[8]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
my @acc=@acc;
@acc=@acc[3..5] if ($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov %rdx, %rax
sar \$63, %rax # |f0|'s sign as mask (or |g0|'s)
xor $fx, $fx
sub %rax, $fx # |f0|'s sign as bit (or |g0|'s)
xor %rax, %rdx # conditionally negate |f0| (or |g0|)
add $fx, %rdx
xor %rax, @acc[0] # conditionally negate |a| (or |b|)
xor %rax, @acc[1]
xor @acc[2], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, %rax
mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|)
mulx @acc[1], @acc[1], @acc[2]
add $fx, @acc[1]
adc \$0, @acc[2]
imulq %rdx
add %rax, @acc[2]
adc \$0, %rdx
___
$code.=<<___ if ($j==0);
mov %rdx, @acc[6]
mov $g0, %rdx
___
}
$code.=<<___;
add @acc[0], @acc[3]
adc @acc[1], @acc[4]
adc @acc[2], @acc[5]
adc %rdx, @acc[6]
mov $f0, %rdx
shrd \$31, @acc[4], @acc[3]
shrd \$31, @acc[5], @acc[4]
shrd \$31, @acc[6], @acc[5]
sar \$63, @acc[6] # sign as mask
xor $fx, $fx
sub @acc[6], $fx # sign as bit
xor @acc[6], @acc[3] # conditionally negate the result
xor @acc[6], @acc[4]
xor @acc[6], @acc[5]
add $fx, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov @acc[3], 8*0($out_ptr)
mov @acc[4], 8*1($out_ptr)
mov @acc[5], 8*2($out_ptr)
xor @acc[6], %rdx # conditionally negate |f0|
xor @acc[6], $g0 # conditionally negate |g0|
add $fx, %rdx
add $fx, $g0
ret
.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31
___
} }
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
$code.=<<___;
.type __ab_approximation_31,\@abi-omnipotent
.align 32
__ab_approximation_31:
mov 8*5($in_ptr), @a[2] # load |a| in reverse order
mov 8*11($in_ptr), @b[2] # load |b| in reverse order
mov 8*4($in_ptr), @a[1]
mov 8*10($in_ptr), @b[1]
mov 8*3($in_ptr), @a[0]
mov 8*9($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*2($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*8($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... ones before top-most, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*1($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*7($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*0($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*6($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[2]
cmovz @b[0], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
mov \$0x7FFFFFFF, %eax
and %rax, @a[0]
and %rax, @b[0]
andn @a[2], %rax, @a[2]
andn @b[2], %rax, @b[2]
or @a[2], @a[0]
or @b[2], @b[0]
jmp __inner_loop_31
ret
.size __ab_approximation_31,.-__ab_approximation_31
___
}
$code.=<<___;
.type __inner_loop_31,\@abi-omnipotent
.align 32
__inner_loop_31: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
mov \$0x7FFFFFFF7FFFFFFF, $bias
.Loop_31:
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
mov $fg0, $t2
mov $fg1, $t3
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
shr \$1, $a_ # |a_|>>=1
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_31
shr \$32, $bias
mov %ecx, %edx # $fg0 -> $f0
mov ${fg1}d, ${f1}d
shr \$32, $g0
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_31,.-__inner_loop_31
.type __inner_loop_62,\@abi-omnipotent
.align 32
__inner_loop_62:
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov \$1, $g1 # |g1|=1
.Loop_62:
xor $t0, $t0
test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t1
cmovnz $b_lo, $t0
sub $a_lo, $t1 # |b_|-|a_|
mov $a_lo, $t2
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t2, $b_lo # |b_| = |a_|
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shr \$1, $a_lo
test \$1, $t2 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, $cnt
jnz .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
___
}
print $code;
close STDOUT;

122
blst/asm/div3w-armv8.pl Executable file
View file

@ -0,0 +1,122 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$code.=<<___;
.text
.globl div_3_limbs
.type div_3_limbs,%function
.align 5
div_3_limbs:
ldp x4,x5,[x0] // load R
eor x0,x0,x0 // Q = 0
mov x3,#64 // loop counter
nop
.Loop:
subs x6,x4,x1 // R - D
add x0,x0,x0 // Q <<= 1
sbcs x7,x5,x2
add x0,x0,#1 // Q + speculative bit
csel x4,x4,x6,lo // select between R and R - D
extr x1,x2,x1,#1 // D >>= 1
csel x5,x5,x7,lo
lsr x2,x2,#1
sbc x0,x0,xzr // subtract speculative bit
sub x3,x3,#1
cbnz x3,.Loop
asr x3,x0,#63 // top bit -> mask
add x0,x0,x0 // Q <<= 1
subs x6,x4,x1 // R - D
add x0,x0,#1 // Q + speculative bit
sbcs x7,x5,x2
sbc x0,x0,xzr // subtract speculative bit
orr x0,x0,x3 // all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
___
{
my ($div_rem, $divisor, $quot) = map("x$_",(0..2));
my @div = map("x$_",(3..4));
my @acc = map("x$_",(5..7));
my @t = map("x$_",(8..11));
$code.=<<___;
.globl quot_rem_128
.type quot_rem_128,%function
.align 5
quot_rem_128:
ldp @div[0],@div[1],[$divisor]
mul @acc[0],@div[0],$quot // divisor[0:1] * quotient
umulh @acc[1],@div[0],$quot
mul @t[3], @div[1],$quot
umulh @acc[2],@div[1],$quot
ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend
ldr @t[2],[$div_rem,#16]
adds @acc[1],@acc[1],@t[3]
adc @acc[2],@acc[2],xzr
subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient
sbcs @t[1],@t[1],@acc[1]
sbcs @t[2],@t[2],@acc[2]
sbc @acc[0],xzr,xzr // borrow -> mask
add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ...
and @div[0],@div[0],@acc[0]
and @div[1],@div[1],@acc[0]
adds @t[0],@t[0],@div[0] // ... and add divisor
adc @t[1],@t[1],@div[1]
stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder
str $quot,[$div_rem,#16] // and one limb of the quotient
mov x0,$quot // return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
.globl quot_rem_64
.type quot_rem_64,%function
.align 5
quot_rem_64:
ldr @div[0],[$divisor]
ldr @t[0],[$div_rem] // load 1 limb of the dividend
mul @acc[0],@div[0],$quot // divisor * quotient
sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient
stp @t[0],$quot,[$div_rem] // save remainder and quotient
mov x0,$quot // return quotient
ret
.size quot_rem_64,.-quot_rem_64
___
}
print $code;
close STDOUT;

184
blst/asm/div3w-x86_64.pl Executable file
View file

@ -0,0 +1,184 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
$c_ref=<<'___';
/*
* |div_top| points at two most significant limbs of the dividend, |d_hi|
* and |d_lo| are two most significant limbs of the divisor. If divisor
* is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
* The divisor is required to be "bitwise left-aligned," and the dividend's
* top limbs must not be larger than the divisor's. The latter limitation
* can be problematic in the first iteration of multi-precision division,
* where in the most general case the condition would have to be "smaller."
* The subroutine considers four limbs, two of which are "overlapping,"
* hence the name... Another way to look at it is to think of the pair
* of the dividend's limbs being suffixed with a zero:
* +-------+-------+-------+
* R | | | 0 |
* +-------+-------+-------+
* +-------+-------+
* D | | |
* +-------+-------+
*/
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
limb_t Q = 0, mask;
size_t i;
for (i = 0; i < LIMB_BITS; i++) {
Q <<= 1;
mask = (R >= D);
Q |= mask;
R -= (D & ((llimb_t)0 - mask));
D >>= 1;
}
mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */
Q <<= 1;
Q |= (R >= D);
return (Q | mask);
}
___
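# A direct Python transliteration of the C reference above (64-bit limbs
# assumed; illustrative, hypothetical $python_demo holder), handy for poking at
# the estimate on the host:
$python_demo.=<<'___';
LIMB_BITS = 64
LIMB_MASK = (1 << LIMB_BITS) - 1

def div_3_limbs(div_top, d_lo, d_hi):
    R = (div_top[1] << LIMB_BITS) | div_top[0]
    D = (d_hi << LIMB_BITS) | d_lo
    Q = 0
    for _ in range(LIMB_BITS):
        Q <<= 1
        if R >= D:
            Q |= 1
            R -= D
        D >>= 1
    mask = LIMB_MASK if Q >> (LIMB_BITS - 1) else 0   # would the last shift overflow?
    Q = ((Q << 1) | (R >= D)) & LIMB_MASK
    return Q | mask                                   # all ones on overflow

# e.g. top limbs {0, 1} (R = 2^64) over the left-aligned divisor 2^127:
assert div_3_limbs([0, 1], 0, 1 << 63) == 2
___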
$code.=<<___;
.text
.globl div_3_limbs
.hidden div_3_limbs
.type div_3_limbs,\@function,3
.align 32
div_3_limbs:
mov (%rdi),%r8 # load R.lo
mov 8(%rdi),%r9 # load R.hi
xor %rax,%rax # Q = 0
mov \$64,%ecx # loop counter
.Loop:
mov %r8,%r10 # put aside R
sub %rsi,%r8 # R -= D
mov %r9,%r11
sbb %rdx,%r9
lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit
mov %rdx,%rdi
cmovc %r10,%r8 # restore R if R - D borrowed
cmovc %r11,%r9
sbb \$0,%rax # subtract speculative bit
shl \$63,%rdi
shr \$1,%rsi
shr \$1,%rdx
or %rdi,%rsi # D >>= 1
sub \$1,%ecx
jnz .Loop
lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit
sar \$63,%rax # top bit -> mask
sub %rsi,%r8 # R -= D
sbb %rdx,%r9
sbb \$0,%rcx # subtract speculative bit
or %rcx,%rax # all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
___
########################################################################
# Calculate the remainder and adjust the quotient, which can be off by one.
# Then save the quotient in the limb next to the top limb of the remainder.
# There is room for it, because the remainder (the next iteration's dividend)
# gets shorter by one limb.
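# Value-level model of quot_rem_128 (64-bit limbs assumed; illustrative,
# hypothetical $python_demo holder): exact remainder for a 3-limb chunk of the
# dividend, with the estimate corrected down by one when the subtraction
# borrows. Returned in the same layout the assembly stores back.
$python_demo.=<<'___';
def quot_rem_128(div_rem, divisor, quot):
    dividend = div_rem[0] | (div_rem[1] << 64) | (div_rem[2] << 128)
    d = divisor[0] | (divisor[1] << 64)
    r = dividend - d*quot
    if r < 0:                          # borrowed: the estimate was one too large
        quot -= 1
        r += d
    return [r & ((1 << 64) - 1), r >> 64, quot]   # 2 remainder limbs, then quotient

d = [0, 1 << 63]                       # divisor = 2^127, left-aligned
assert quot_rem_128([5, 0, 1], d, 2) == [5, 0, 2]   # 2^128 + 5 == 2*2^127 + 5
assert quot_rem_128([5, 0, 1], d, 3) == [5, 0, 2]   # off-by-one estimate corrected
___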
{
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
my @acc = ("%r8", "%r9", "%rdx");
my @tmp = ("%r10", "%r11", "%rax");
$code.=<<___;
.globl quot_rem_128
.hidden quot_rem_128
.type quot_rem_128,\@function,3
.align 32
quot_rem_128:
mov %rdx, %rax
mov %rdx, $quotient
mulq 0($divisor) # divisor[0:1] * quotient
mov %rax, @acc[0]
mov $quotient, %rax
mov %rdx, @acc[1]
mulq 8($divisor)
add %rax, @acc[1]
adc \$0, %rdx # %rdx is @acc[2]
mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend
mov 8($div_rem), @tmp[1]
mov 16($div_rem), @tmp[2]
sub @acc[0], @tmp[0] # dividend - divisor * quotient
sbb @acc[1], @tmp[1]
sbb @acc[2], @tmp[2]
sbb @acc[0], @acc[0] # borrow -> mask
add @acc[0], $quotient # if borrowed, adjust the quotient ...
mov @acc[0], @acc[1]
and 0($divisor), @acc[0]
and 8($divisor), @acc[1]
add @acc[0], @tmp[0] # ... and add divisor
adc @acc[1], @tmp[1]
mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ...
mov @tmp[1], 8($div_rem)
mov $quotient, 16($div_rem) # ... and 1 limb of the quotient
mov $quotient, %rax # return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
########################################################################
# Unlike the 128-bit case above, the quotient is exact. As a result just one
# limb of the dividend is sufficient to calculate the remainder...
.globl quot_rem_64
.hidden quot_rem_64
.type quot_rem_64,\@function,3
.align 32
quot_rem_64:
mov %rdx, %rax # return quotient
imulq 0($divisor), %rdx # divisor[0] * quotient
mov 0($div_rem), @tmp[0] # load 1 limb of the dividend
sub %rdx, @tmp[0] # dividend - divisor * quotient
mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ...
mov %rax, 8($div_rem) # ... and 1 limb of the quotient
ret
.size quot_rem_64,.-quot_rem_64
___
}
print $code;
close STDOUT;

409
blst/asm/mul_mont_256-armv8.pl Executable file

@ -0,0 +1,409 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# As for "sparse" in subroutine names, see commentary in the
# asm/mulx_mont_256-x86_64.pl module.
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);
@mod=map("x$_",(5..8));
$bi="x9";
@a=map("x$_",(10..13));
@tmp=map("x$_",(14..17));
@acc=map("x$_",(19..24));
$m0=$n_ptr;
$code.=<<___;
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,%function
.align 5
mul_mont_sparse_256:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldp @a[0],@a[1],[$a_ptr]
ldr $bi, [$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
mul @acc[0],@a[0],$bi
ldp @mod[0],@mod[1],[$n_ptr]
mul @acc[1],@a[1],$bi
ldp @mod[2],@mod[3],[$n_ptr,#16]
mul @acc[2],@a[2],$bi
mul @acc[3],@a[3],$bi
umulh @tmp[0],@a[0],$bi
umulh @tmp[1],@a[1],$bi
mul $m0,$n0,@acc[0]
umulh @tmp[2],@a[2],$bi
umulh @tmp[3],@a[3],$bi
adds @acc[1],@acc[1],@tmp[0]
//mul @tmp[0],@mod[0],$m0
adcs @acc[2],@acc[2],@tmp[1]
mul @tmp[1],@mod[1],$m0
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@mod[2],$m0
adc @acc[4],xzr, @tmp[3]
mul @tmp[3],@mod[3],$m0
___
for ($i=1;$i<4;$i++) {
$code.=<<___;
ldr $bi,[$b_ptr,8*$i]
subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @acc[1],@acc[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @acc[2],@acc[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @acc[3],@acc[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @acc[4],@acc[4],xzr
adds @acc[0],@acc[1],@tmp[0]
mul @tmp[0],@a[0],$bi
adcs @acc[1],@acc[2],@tmp[1]
mul @tmp[1],@a[1],$bi
adcs @acc[2],@acc[3],@tmp[2]
mul @tmp[2],@a[2],$bi
adcs @acc[3],@acc[4],@tmp[3]
mul @tmp[3],@a[3],$bi
adc @acc[4],xzr,xzr
adds @acc[0],@acc[0],@tmp[0]
umulh @tmp[0],@a[0],$bi
adcs @acc[1],@acc[1],@tmp[1]
umulh @tmp[1],@a[1],$bi
adcs @acc[2],@acc[2],@tmp[2]
mul $m0,$n0,@acc[0]
umulh @tmp[2],@a[2],$bi
adcs @acc[3],@acc[3],@tmp[3]
umulh @tmp[3],@a[3],$bi
adc @acc[4],@acc[4],xzr
adds @acc[1],@acc[1],@tmp[0]
//mul @tmp[0],@mod[0],$m0
adcs @acc[2],@acc[2],@tmp[1]
mul @tmp[1],@mod[1],$m0
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@mod[2],$m0
adc @acc[4],@acc[4],@tmp[3]
mul @tmp[3],@mod[3],$m0
___
}
$code.=<<___;
subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @acc[1],@acc[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @acc[2],@acc[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @acc[3],@acc[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @acc[4],@acc[4],xzr
adds @acc[0],@acc[1],@tmp[0]
adcs @acc[1],@acc[2],@tmp[1]
adcs @acc[2],@acc[3],@tmp[2]
adcs @acc[3],@acc[4],@tmp[3]
adc @acc[4],xzr,xzr
subs @tmp[0],@acc[0],@mod[0]
sbcs @tmp[1],@acc[1],@mod[1]
sbcs @tmp[2],@acc[2],@mod[2]
sbcs @tmp[3],@acc[3],@mod[3]
sbcs xzr, @acc[4],xzr
csel @acc[0],@acc[0],@tmp[0],lo
csel @acc[1],@acc[1],@tmp[1],lo
csel @acc[2],@acc[2],@tmp[2],lo
csel @acc[3],@acc[3],@tmp[3],lo
stp @acc[0],@acc[1],[$r_ptr]
stp @acc[2],@acc[3],[$r_ptr,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.size mul_mont_sparse_256,.-mul_mont_sparse_256
___
{
my @acc = (@a,@acc[0..3]);
my @a = @mod;
$code.=<<___;
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,%function
.align 5
sqr_mont_sparse_256:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
mov $n0,$n_ptr
////////////////////////////////////////////////////////////////
// | | | | | |a1*a0| |
// | | | | |a2*a0| | |
// | |a3*a2|a3*a0| | | |
// | | | |a2*a1| | | |
// | | |a3*a1| | | | |
// *| | | | | | | | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
// |--+--+--+--+--+--+--+--|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x]
//
// "can't overflow" below mark carrying into high part of
// multiplication result, which can't overflow, because it
// can never be all ones.
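// In other words, with a = a0 + a1*2^64 + a2*2^128 + a3*2^192:
//   a^2 = sum_i a_i^2 * 2^(128*i) + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
// The doubling of the cross products is the "* 2" row above, and the
// squares a_i^2 are folded in afterwards.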
mul @acc[1],@a[1],@a[0] // a[1]*a[0]
umulh @tmp[1],@a[1],@a[0]
mul @acc[2],@a[2],@a[0] // a[2]*a[0]
umulh @tmp[2],@a[2],@a[0]
mul @acc[3],@a[3],@a[0] // a[3]*a[0]
umulh @acc[4],@a[3],@a[0]
adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication
mul @tmp[0],@a[2],@a[1] // a[2]*a[1]
umulh @tmp[1],@a[2],@a[1]
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@a[3],@a[1] // a[3]*a[1]
umulh @tmp[3],@a[3],@a[1]
adc @acc[4],@acc[4],xzr // can't overflow
mul @acc[5],@a[3],@a[2] // a[3]*a[2]
umulh @acc[6],@a[3],@a[2]
adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication
mul @acc[0],@a[0],@a[0] // a[0]*a[0]
adc @tmp[2],@tmp[3],xzr // can't overflow
adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication
umulh @a[0],@a[0],@a[0]
adcs @acc[4],@acc[4],@tmp[1]
mul @tmp[1],@a[1],@a[1] // a[1]*a[1]
adcs @acc[5],@acc[5],@tmp[2]
umulh @a[1],@a[1],@a[1]
adc @acc[6],@acc[6],xzr // can't overflow
adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2
mul @tmp[2],@a[2],@a[2] // a[2]*a[2]
adcs @acc[2],@acc[2],@acc[2]
umulh @a[2],@a[2],@a[2]
adcs @acc[3],@acc[3],@acc[3]
mul @tmp[3],@a[3],@a[3] // a[3]*a[3]
adcs @acc[4],@acc[4],@acc[4]
umulh @a[3],@a[3],@a[3]
adcs @acc[5],@acc[5],@acc[5]
adcs @acc[6],@acc[6],@acc[6]
adc @acc[7],xzr,xzr
adds @acc[1],@acc[1],@a[0] // +a[i]*a[i]
adcs @acc[2],@acc[2],@tmp[1]
adcs @acc[3],@acc[3],@a[1]
adcs @acc[4],@acc[4],@tmp[2]
adcs @acc[5],@acc[5],@a[2]
adcs @acc[6],@acc[6],@tmp[3]
adc @acc[7],@acc[7],@a[3]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
adds @acc[0],@acc[0],@acc[4] // accumulate upper half
adcs @acc[1],@acc[1],@acc[5]
adcs @acc[2],@acc[2],@acc[6]
adcs @acc[3],@acc[3],@acc[7]
adc @acc[4],xzr,xzr
subs @tmp[0],@acc[0],@mod[0]
sbcs @tmp[1],@acc[1],@mod[1]
sbcs @tmp[2],@acc[2],@mod[2]
sbcs @tmp[3],@acc[3],@mod[3]
sbcs xzr, @acc[4],xzr
csel @acc[0],@acc[0],@tmp[0],lo
csel @acc[1],@acc[1],@tmp[1],lo
csel @acc[2],@acc[2],@tmp[2],lo
csel @acc[3],@acc[3],@tmp[3],lo
stp @acc[0],@acc[1],[$r_ptr]
stp @acc[2],@acc[3],[$r_ptr,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
___
}
{
my @a = (@a, $bi);
$code.=<<___;
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,%function
.align 5
from_mont_256:
paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov $n0,$n_ptr
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
subs @tmp[0],@a[0],@mod[0]
sbcs @tmp[1],@a[1],@mod[1]
sbcs @tmp[2],@a[2],@mod[2]
sbcs @tmp[3],@a[3],@mod[3]
csel @a[0],@a[0],@tmp[0],lo
csel @a[1],@a[1],@tmp[1],lo
csel @a[2],@a[2],@tmp[2],lo
csel @a[3],@a[3],@tmp[3],lo
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ldr x29,[sp],#16
autiasp
ret
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,%function
.align 5
redc_mont_256:
paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov $n0,$n_ptr
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
ldp @tmp[0],@tmp[1],[$a_ptr,#32]
ldp @tmp[2],@tmp[3],[$a_ptr,#48]
adds @a[0],@a[0],@tmp[0]
adcs @a[1],@a[1],@tmp[1]
adcs @a[2],@a[2],@tmp[2]
adcs @a[3],@a[3],@tmp[3]
adc @a[4],xzr,xzr
subs @tmp[0],@a[0],@mod[0]
sbcs @tmp[1],@a[1],@mod[1]
sbcs @tmp[2],@a[2],@mod[2]
sbcs @tmp[3],@a[3],@mod[3]
sbcs xzr, @a[4],xzr
csel @a[0],@a[0],@tmp[0],lo
csel @a[1],@a[1],@tmp[1],lo
csel @a[2],@a[2],@tmp[2],lo
csel @a[3],@a[3],@tmp[3],lo
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ldr x29,[sp],#16
autiasp
ret
.size redc_mont_256,.-redc_mont_256
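// __mul_by_1_mont_256 below is a word-by-word Montgomery reduction: it
// returns a * 2^-256 mod n, possibly one multiple of n too high, which the
// callers settle with a final conditional subtraction.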
.type __mul_by_1_mont_256,%function
.align 5
__mul_by_1_mont_256:
mul $m0,$n0,@a[0]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
___
for ($i=1;$i<4;$i++) {
$code.=<<___;
//mul @tmp[0],@mod[0],$m0
mul @tmp[1],@mod[1],$m0
mul @tmp[2],@mod[2],$m0
mul @tmp[3],@mod[3],$m0
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @a[1],@a[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @a[2],@a[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @a[3],@a[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @a[4],xzr,xzr
adds @a[0],@a[1],@tmp[0]
adcs @a[1],@a[2],@tmp[1]
adcs @a[2],@a[3],@tmp[2]
mul $m0,$n0,@a[0]
adc @a[3],@a[4],@tmp[3]
___
}
$code.=<<___;
//mul @tmp[0],@mod[0],$m0
mul @tmp[1],@mod[1],$m0
mul @tmp[2],@mod[2],$m0
mul @tmp[3],@mod[3],$m0
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @a[1],@a[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @a[2],@a[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @a[3],@a[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @a[4],xzr,xzr
adds @a[0],@a[1],@tmp[0]
adcs @a[1],@a[2],@tmp[1]
adcs @a[2],@a[3],@tmp[2]
adc @a[3],@a[4],@tmp[3]
ret
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256
___
}
print $code;
close STDOUT;

2015
blst/asm/mul_mont_384-armv8.pl Executable file

File diff suppressed because it is too large

513
blst/asm/mulq_mont_256-x86_64.pl Executable file

@ -0,0 +1,513 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# As for "sparse" in subroutine names, see commentary in the
# asm/mulx_mont_256-x86_64.pl module.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";
{ ############################################################## 256 bits
my @acc=map("%r$_",(9..15));
{ ############################################################## mulq
my ($hi, $a0) = ("%rbp", $r_ptr);
$code.=<<___;
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,\@function,5,"unwind"
.align 32
mul_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
push $r_ptr
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($b_org), %rax
mov 8*0($a_ptr), @acc[4]
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), @acc[3]
mov 8*3($a_ptr), $hi
mov $b_org, $b_ptr # evacuate from %rdx
mov %rax, @acc[6]
mulq @acc[4] # a[0]*b[0]
mov %rax, @acc[0]
mov @acc[6], %rax
mov %rdx, @acc[1]
call __mulq_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size mul_mont_sparse_256,.-mul_mont_sparse_256
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,\@function,4,"unwind"
.align 32
sqr_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
push $r_ptr
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), %rax
mov $n_ptr, $n0
mov 8*1($a_ptr), @acc[5]
mov $b_org, $n_ptr
mov 8*2($a_ptr), @acc[3]
lea ($a_ptr), $b_ptr
mov 8*3($a_ptr), $hi
mov %rax, @acc[6]
mulq %rax # a[0]*a[0]
mov %rax, @acc[0]
mov @acc[6], %rax
mov %rdx, @acc[1]
call __mulq_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulq_mont_sparse_256,\@abi-omnipotent
.align 32
__mulq_mont_sparse_256:
mulq @acc[5] # a[1]*b[0]
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, @acc[2]
mulq @acc[3] # a[2]*b[0]
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, @acc[3]
mulq $hi # a[3]*b[0]
add %rax, @acc[3]
mov 8($b_ptr), %rax
adc \$0, %rdx
xor @acc[5], @acc[5]
mov %rdx, @acc[4]
___
for (my $i=1; $i<4; $i++) {
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1];
$code.=<<___;
mov @acc[0], $a0
imulq $n0, @acc[0]
################################# Multiply by b[$i]
mov %rax, @acc[6]
mulq 8*0($a_ptr)
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, $hi
mulq 8*1($a_ptr)
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($a_ptr)
add %rax, @acc[3]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($a_ptr)
add %rax, @acc[4]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[4]
adc %rdx, @acc[5] # can't overflow
xor @acc[6], @acc[6]
################################# reduction
mulq 8*0($n_ptr)
add %rax, $a0 # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, $a0
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add $a0, @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
add %rax, @acc[3]
mov $b_next, %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
add %rdx, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
___
push(@acc,shift(@acc));
}
$code.=<<___;
imulq $n0, %rax
mov 8(%rsp), $a_ptr # restore $r_ptr
################################# last reduction
mov %rax, @acc[6]
mulq 8*0($n_ptr)
add %rax, @acc[0] # guaranteed to be zero
mov @acc[6], %rax
adc %rdx, @acc[0]
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
add @acc[0], @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
mov @acc[2], $b_ptr
add $hi, @acc[3]
adc \$0, %rdx
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add %rdx, @acc[4]
adc \$0, @acc[5]
#################################
# Branch-less conditional subtraction of modulus
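# A hedged C view of what follows (illustrative only):
#   tmp = acc - modulus;          /* borrow lands in CF        */
#   acc = borrow ? acc : tmp;     /* selected with cmovc below */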
mov @acc[3], @acc[0]
sub 8*0($n_ptr), @acc[1]
sbb 8*1($n_ptr), @acc[2]
sbb 8*2($n_ptr), @acc[3]
mov @acc[4], $hi
sbb 8*3($n_ptr), @acc[4]
sbb \$0, @acc[5]
cmovc %rax, @acc[1]
cmovc $b_ptr, @acc[2]
cmovc @acc[0], @acc[3]
mov @acc[1], 8*0($a_ptr)
cmovc $hi, @acc[4]
mov @acc[2], 8*1($a_ptr)
mov @acc[3], 8*2($a_ptr)
mov @acc[4], 8*3($a_ptr)
ret
.cfi_endproc
.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256
___
} }
{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted"
$code.=<<___;
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,\@function,4,"unwind"
.align 32
from_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulq_by_1_mont_256
#################################
# Branch-less conditional acc[0:3] - modulus
#mov @acc[4], %rax # __mulq_by_1_mont_256 does it
mov @acc[5], @acc[1]
mov @acc[6], @acc[2]
mov @acc[0], @acc[3]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[6]
sbb 8*3($n_ptr), @acc[0]
cmovnc @acc[4], %rax
cmovnc @acc[5], @acc[1]
cmovnc @acc[6], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[0], @acc[3]
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,\@function,4,"unwind"
.align 32
redc_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulq_by_1_mont_256
add 8*4($a_ptr), @acc[4] # accumulate upper half
adc 8*5($a_ptr), @acc[5]
mov @acc[4], %rax
adc 8*6($a_ptr), @acc[6]
mov @acc[5], @acc[1]
adc 8*7($a_ptr), @acc[0]
sbb $a_ptr, $a_ptr
#################################
# Branch-less conditional acc[0:4] - modulus
mov @acc[6], @acc[2]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[6]
mov @acc[0], @acc[3]
sbb 8*3($n_ptr), @acc[0]
sbb \$0, $a_ptr
cmovnc @acc[4], %rax
cmovnc @acc[5], @acc[1]
cmovnc @acc[6], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[0], @acc[3]
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size redc_mont_256,.-redc_mont_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulq_by_1_mont_256,\@abi-omnipotent
.align 32
__mulq_by_1_mont_256:
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov %rax, @acc[4]
imulq $n0, %rax
mov %rax, @acc[0]
___
for (my $i=0; $i<4; $i++) {
my $hi = @acc[4];
$code.=<<___;
################################# reduction $i
mulq 8*0($n_ptr)
add %rax, @acc[4] # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, @acc[4]
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add @acc[4], @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
___
$code.=<<___ if ($i<3);
mov @acc[1], @acc[5]
imulq $n0, @acc[1]
___
$code.=<<___;
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, @acc[4]
___
push(@acc,shift(@acc));
}
$code.=<<___;
ret
.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256
___
} } }
print $code;
close STDOUT;

2675
blst/asm/mulq_mont_384-x86_64.pl Executable file

File diff suppressed because it is too large

486
blst/asm/mulx_mont_256-x86_64.pl Executable file

@ -0,0 +1,486 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# "Sparse" in subroutine names refers to most significant limb of the
# modulus. Though "sparse" is a bit of misnomer, because limitation is
# just not-all-ones. Or in other words not larger than 2^256-2^192-1.
# In general Montgomery multiplication algorithm can handle one of the
# inputs being non-reduced and capped by 1<<radix_width, 1<<256 in this
# case, rather than the modulus. Whether or not mul_mont_sparse_256, a
# *tailored* implementation of the algorithm, can handle such input is
# circumstantial. For example, in the most general case it depends on
# similar "bit sparsity" of the individual limbs of the second, fully reduced
# multiplicand. If you can't make such an assumption about the limbs, then
# non-reduced value shouldn't be larger than "same old" 2^256-2^192-1.
# This requirement can be met by conditionally subtracting "bitwise
# left-aligned" modulus. For example, if modulus is 200 bits wide, you
# would need to conditionally subtract the value of modulus<<56. Common
# source of non-reduced values is redc_mont_256 treating 512-bit inputs.
# Well, more specifically ones with upper half not smaller than modulus.
# Just in case, why a limitation at all and not general-purpose 256-bit
# subroutines? Unlike the 384-bit case, accounting for the additional carry
# has a disproportionate impact on performance, especially in the adcx/adox
# implementation.
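#
# As a hedged illustration of the conditional subtraction mentioned above
# (not part of this module; the helper name and signature are made up, the
# limb_t/llimb_t/LIMB_BITS conventions follow the C reference comments used
# elsewhere in these modules):
#
#   /* mod_aligned is the modulus shifted left so that its top bit occupies
#    * bit 255, e.g. modulus<<56 for a 200-bit modulus. */
#   static void cap_non_reduced_256(limb_t val[4], const limb_t mod_aligned[4])
#   {
#       limb_t tmp[4], borrow = 0, mask;
#       size_t i;
#
#       for (i = 0; i < 4; i++) {
#           llimb_t t = (llimb_t)val[i] - mod_aligned[i] - borrow;
#           tmp[i] = (limb_t)t;
#           borrow = (limb_t)(t >> LIMB_BITS) & 1;
#       }
#       mask = borrow - 1;          /* all ones if the subtraction held */
#       for (i = 0; i < 4; i++)
#           val[i] = (tmp[i] & mask) | (val[i] & ~mask);
#   }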
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";
{ ############################################################## 255 bits
my @acc=map("%r$_",(10..15));
{ ############################################################## mulq
my ($lo,$hi)=("%rbp","%r9");
$code.=<<___;
.text
.globl mulx_mont_sparse_256
.hidden mulx_mont_sparse_256
.type mulx_mont_sparse_256,\@function,5,"unwind"
.align 32
mulx_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8,%rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $b_ptr # evacuate from %rdx
mov 8*0($b_org), %rdx
mov 8*0($a_ptr), @acc[4]
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), $lo
mov 8*3($a_ptr), $hi
lea -128($a_ptr), $a_ptr # control u-op density
lea -128($n_ptr), $n_ptr # control u-op density
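# (the -128 bias turns the 8*n+128 displacements below into 4-byte ones,
#  i.e. it pads instruction length, which is what "control u-op density"
#  appears to refer to)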
mulx @acc[4], %rax, @acc[1] # a[0]*b[0]
call __mulx_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size mulx_mont_sparse_256,.-mulx_mont_sparse_256
.globl sqrx_mont_sparse_256
.hidden sqrx_mont_sparse_256
.type sqrx_mont_sparse_256,\@function,4,"unwind"
.align 32
sqrx_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8,%rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $a_ptr, $b_ptr
mov $n_ptr, $n0
mov $b_org, $n_ptr
mov 8*0($a_ptr), %rdx
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), $lo
mov 8*3($a_ptr), $hi
lea -128($b_ptr), $a_ptr # control u-op density
lea -128($n_ptr), $n_ptr # control u-op density
mulx %rdx, %rax, @acc[1] # a[0]*a[0]
call __mulx_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulx_mont_sparse_256,\@abi-omnipotent
.align 32
__mulx_mont_sparse_256:
mulx @acc[5], @acc[5], @acc[2]
mulx $lo, $lo, @acc[3]
add @acc[5], @acc[1]
mulx $hi, $hi, @acc[4]
mov 8($b_ptr), %rdx
adc $lo, @acc[2]
adc $hi, @acc[3]
adc \$0, @acc[4]
___
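# The loop below keeps two independent carry chains alive: adcx accumulates
# through CF while adox accumulates through OF, so the additions of the mulx
# low halves and high halves proceed on separate flags instead of serializing
# on a single carry chain.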
for (my $i=1; $i<4; $i++) {
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : "%rax";
my $a5 = $i==1 ? @acc[5] : $lo;
$code.=<<___;
mov %rax, @acc[0]
imulq $n0, %rax
################################# Multiply by b[$i]
xor $a5, $a5 # [@acc[5]=0,] cf=0, of=0
mulx 8*0+128($a_ptr), $lo, $hi
adox $lo, @acc[1]
adcx $hi, @acc[2]
mulx 8*1+128($a_ptr), $lo, $hi
adox $lo, @acc[2]
adcx $hi, @acc[3]
mulx 8*2+128($a_ptr), $lo, $hi
adox $lo, @acc[3]
adcx $hi, @acc[4]
mulx 8*3+128($a_ptr), $lo, $hi
mov %rax, %rdx
adox $lo, @acc[4]
adcx @acc[5], $hi # cf=0
adox $hi, @acc[5] # of=0
################################# reduction
mulx 8*0+128($n_ptr), $lo, %rax
adcx $lo, @acc[0] # guaranteed to be zero
adox @acc[1], %rax
mulx 8*1+128($n_ptr), $lo, $hi
adcx $lo, %rax # @acc[1]
adox $hi, @acc[2]
mulx 8*2+128($n_ptr), $lo, $hi
adcx $lo, @acc[2]
adox $hi, @acc[3]
mulx 8*3+128($n_ptr), $lo, $hi
mov $b_next, %rdx
adcx $lo, @acc[3]
adox $hi, @acc[4]
adcx @acc[0], @acc[4]
adox @acc[0], @acc[5]
adcx @acc[0], @acc[5]
adox @acc[0], @acc[0] # acc[5] in next iteration
adc \$0, @acc[0] # cf=0, of=0
___
push(@acc,shift(@acc));
}
$code.=<<___;
imulq $n0, %rdx
################################# last reduction
xor $lo, $lo # cf=0, of=0
mulx 8*0+128($n_ptr), @acc[0], $hi
adcx %rax, @acc[0] # guaranteed to be zero
adox $hi, @acc[1]
mulx 8*1+128($n_ptr), $lo, $hi
adcx $lo, @acc[1]
adox $hi, @acc[2]
mulx 8*2+128($n_ptr), $lo, $hi
adcx $lo, @acc[2]
adox $hi, @acc[3]
mulx 8*3+128($n_ptr), $lo, $hi
mov @acc[1], %rdx
lea 128($n_ptr), $n_ptr
adcx $lo, @acc[3]
adox $hi, @acc[4]
mov @acc[2], %rax
adcx @acc[0], @acc[4]
adox @acc[0], @acc[5]
adc \$0, @acc[5]
#################################
# Branch-less conditional acc[1:5] - modulus
mov @acc[3], $lo
sub 8*0($n_ptr), @acc[1]
sbb 8*1($n_ptr), @acc[2]
sbb 8*2($n_ptr), @acc[3]
mov @acc[4], $hi
sbb 8*3($n_ptr), @acc[4]
sbb \$0, @acc[5]
cmovc %rdx, @acc[1]
cmovc %rax, @acc[2]
cmovc $lo, @acc[3]
mov @acc[1], 8*0($r_ptr)
cmovc $hi, @acc[4]
mov @acc[2], 8*1($r_ptr)
mov @acc[3], 8*2($r_ptr)
mov @acc[4], 8*3($r_ptr)
ret
.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256
___
} }
{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted"
$code.=<<___;
.globl fromx_mont_256
.hidden fromx_mont_256
.type fromx_mont_256,\@function,4,"unwind"
.align 32
fromx_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulx_by_1_mont_256
#################################
# Branch-less conditional acc[0:3] - modulus
#mov @acc[4], %rax # __mulq_by_1_mont_256 does it
mov @acc[5], %rdx
mov @acc[0], @acc[2]
mov @acc[1], @acc[3]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[0]
sbb 8*3($n_ptr), @acc[1]
cmovnc @acc[4], %rax
cmovnc @acc[5], %rdx
cmovnc @acc[0], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[1], @acc[3]
mov %rdx, 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size fromx_mont_256,.-fromx_mont_256
.globl redcx_mont_256
.hidden redcx_mont_256
.type redcx_mont_256,\@function,4,"unwind"
.align 32
redcx_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulx_by_1_mont_256
add 8*4($a_ptr), @acc[4] # accumulate upper half
adc 8*5($a_ptr), @acc[5]
mov @acc[4], %rax
adc 8*6($a_ptr), @acc[0]
mov @acc[5], %rdx
adc 8*7($a_ptr), @acc[1]
sbb $a_ptr, $a_ptr
#################################
# Branch-less conditional acc[0:4] - modulus
mov @acc[0], @acc[2]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[0]
mov @acc[1], @acc[3]
sbb 8*3($n_ptr), @acc[1]
sbb \$0, $a_ptr
cmovnc @acc[4], %rax
cmovnc @acc[5], %rdx
cmovnc @acc[0], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[1], @acc[3]
mov %rdx, 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size redcx_mont_256,.-redcx_mont_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulx_by_1_mont_256,\@abi-omnipotent
.align 32
__mulx_by_1_mont_256:
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov %rax, @acc[4]
imulq $n0, %rax
mov %rax, @acc[0]
___
for (my $i=0; $i<4; $i++) {
my $hi = @acc[4];
$code.=<<___;
################################# reduction $i
mulq 8*0($n_ptr)
add %rax, @acc[4] # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, @acc[4]
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add @acc[4], @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
___
$code.=<<___ if ($i<3);
mov @acc[1], @acc[5]
imulq $n0, @acc[1]
___
$code.=<<___;
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, @acc[4]
___
push(@acc,shift(@acc));
}
$code.=<<___;
ret
.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256
___
} } }
print $code;
close STDOUT;

2384
blst/asm/mulx_mont_384-x86_64.pl Executable file

File diff suppressed because it is too large

541
blst/asm/sha256-armv8.pl Executable file

@ -0,0 +1,541 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for ARMv8.
#
# This module is stripped of scalar code paths, with the rationale that all
# known processors are NEON-capable.
#
# See original module at CRYPTOGAMS for further details.
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
$pre="blst_";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
$code.=<<___;
.text
.align 6
.type .LK$BITS,%object
.LK$BITS:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.globl ${pre}sha256_block_armv8
.type ${pre}sha256_block_armv8,%function
.align 6
${pre}sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8
___
}
if ($SZ==4) { ######################################### NEON stuff #
# You'll surely note a lot of similarities with the sha256-armv4 module,
# and of course it's not a coincidence. sha256-armv4 was used as the
# initial template, but was adapted for the ARMv8 instruction set and
# extensively re-tuned for all-round performance.
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
my $Ktbl="x16";
my $Xfer="x17";
my @X = map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
my $j=0;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
&ushr_32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
eval(shift(@insns));
&sli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T4,$T7,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T4,$T7,32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T5,$T7,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T7,$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_u32 ($T3,$T7,32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T6,@X[0],$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T7,@X[0],$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T6,@X[0],32-$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T5,@X[0],$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T7,$T7,$T6);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T5,@X[0],32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl], #16");
eval(shift(@insns));
&eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T5);
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dhi($T5), &Dlo($T7));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
while($#insns>=1) { eval(shift(@insns)); }
&st1_32 ("{$T0}","[$Xfer], #16");
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
&ld1_8 ("{@X[0]}","[$inp],#16");
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl],#16");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&rev32 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&st1_32 ("{$T0}","[$Xfer], #16");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
'&and ($t1,$f,$e)',
'&bic ($t4,$g,$e)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&orr ($t1,$t1,$t4)', # Ch(e,f,g)
'&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ror ($t0,$t0,"#$Sigma1[0]")',
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t0)', # h+=Sigma1(e)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&ror ($t4,$t4,"#$Sigma0[0]")',
'&add ($d,$d,$h)', # d+=h
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
.globl ${pre}sha256_block_data_order
.type ${pre}sha256_block_data_order,%function
.align 4
${pre}sha256_block_data_order:
stp x29, x30, [sp, #-16]!
mov x29, sp
sub sp,sp,#16*4
adr $Ktbl,.LK256
add $num,$inp,$num,lsl#6 // len to point at the end of inp
ld1.8 {@X[0]},[$inp], #16
ld1.8 {@X[1]},[$inp], #16
ld1.8 {@X[2]},[$inp], #16
ld1.8 {@X[3]},[$inp], #16
ld1.32 {$T0},[$Ktbl], #16
ld1.32 {$T1},[$Ktbl], #16
ld1.32 {$T2},[$Ktbl], #16
ld1.32 {$T3},[$Ktbl], #16
rev32 @X[0],@X[0] // yes, even on
rev32 @X[1],@X[1] // big-endian
rev32 @X[2],@X[2]
rev32 @X[3],@X[3]
mov $Xfer,sp
add.32 $T0,$T0,@X[0]
add.32 $T1,$T1,@X[1]
add.32 $T2,$T2,@X[2]
st1.32 {$T0-$T1},[$Xfer], #32
add.32 $T3,$T3,@X[3]
st1.32 {$T2-$T3},[$Xfer]
sub $Xfer,$Xfer,#32
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldp $E,$F,[$ctx,#16]
ldp $G,$H,[$ctx,#24]
ldr $t1,[sp,#0]
mov $t2,wzr
eor $t3,$B,$C
mov $t4,wzr
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
cmp $t1,#0 // check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
cmp $inp,$num
mov $Xfer, #64
csel $Xfer, $Xfer, xzr, eq
sub $inp,$inp,$Xfer // avoid SEGV
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
add $A,$A,$t4 // h+=Sigma0(a) from the past
ldp $t0,$t1,[$ctx,#0]
add $A,$A,$t2 // h+=Maj(a,b,c) from the past
ldp $t2,$t3,[$ctx,#8]
add $A,$A,$t0 // accumulate
add $B,$B,$t1
ldp $t0,$t1,[$ctx,#16]
add $C,$C,$t2
add $D,$D,$t3
ldp $t2,$t3,[$ctx,#24]
add $E,$E,$t0
add $F,$F,$t1
ldr $t1,[sp,#0]
stp $A,$B,[$ctx,#0]
add $G,$G,$t2
mov $t2,wzr
stp $C,$D,[$ctx,#8]
add $H,$H,$t3
stp $E,$F,[$ctx,#16]
eor $t3,$B,$C
stp $G,$H,[$ctx,#24]
mov $t4,wzr
mov $Xfer,sp
b.ne .L_00_48
ldr x29,[x29]
add sp,sp,#16*4+16
ret
.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order
___
}
{
my ($out,$inp,$len) = map("x$_",(0..2));
$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,%function
.align 4
${pre}sha256_emit:
ldp x4,x5,[$inp]
ldp x6,x7,[$inp,#16]
#ifndef __AARCH64EB__
rev x4,x4
rev x5,x5
rev x6,x6
rev x7,x7
#endif
str w4,[$out,#4]
lsr x4,x4,#32
str w5,[$out,#12]
lsr x5,x5,#32
str w6,[$out,#20]
lsr x6,x6,#32
str w7,[$out,#28]
lsr x7,x7,#32
str w4,[$out,#0]
str w5,[$out,#8]
str w6,[$out,#16]
str w7,[$out,#24]
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit
.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,%function
.align 4
${pre}sha256_bcopy:
.Loop_bcopy:
ldrb w3,[$inp],#1
sub $len,$len,#1
strb w3,[$out],#1
cbnz $len,.Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy
.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,%function
.align 4
${pre}sha256_hcopy:
ldp x4,x5,[$inp]
ldp x6,x7,[$inp,#16]
stp x4,x5,[$out]
stp x6,x7,[$out,#16]
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
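# For instance, "sha256h v0.16b,v1.16b,v16.4s" (the first round above) is
# rewritten by the loop below as ".inst 0x5e104020 //sha256h ...", i.e.
# 0x5e004000 with rd=0, rn=1 placed in bits 5-9 and rm=16 in bits 16-20,
# so the module does not depend on assembler support for the SHA-2 mnemonics.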
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
s/\.[ui]?8(\s)/$1/;
s/\.\w?64\b// and s/\.16b/\.2d/g or
s/\.\w?32\b// and s/\.16b/\.4s/g;
m/\bext\b/ and s/\.2d/\.16b/g or
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,337 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for x86_64.
#
# Scalar-only version with a minor twist minimizing 'lea' instructions.
$flavour = shift;
$output = pop;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
$pre="blst_";
$func="${pre}sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$ctx="%rdi"; # 1st arg, zapped by $a3
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$framesz="16*$SZ+3*8";
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
my $STRIDE=$SZ;
# $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
$code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
xor $e,$a0
ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $g,$a2 # f^g
mov $T1,`$SZ*($i&0xf)`(%rsp)
xor $a,$a1
and $e,$a2 # (f^g)&e
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
add $h,$T1 # T1+=h
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
xor $e,$a0
add $a2,$T1 # T1+=Ch(e,f,g)
mov $a,$a2
add `$SZ*$i`($Tbl),$T1 # T1+=K[round]
xor $a,$a1
xor $b,$a2 # a^b, b^c in next round
ror \$$Sigma1[0],$a0 # Sigma1(e)
mov $b,$h
and $a2,$a3
ror \$$Sigma0[0],$a1 # Sigma0(a)
add $a0,$T1 # T1+=Sigma1(e)
xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
add $T1,$d # d+=T1
add $T1,$h # h+=T1
___
$code.=<<___ if ($i==31);
lea `16*$SZ`($Tbl),$Tbl # round+=16
___
$code.=<<___ if ($i<15);
add $a1,$h # h+=Sigma0(a)
___
($a2,$a3) = ($a3,$a2);
}
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
mov $a0,$T1
ror \$`$sigma0[1]-$sigma0[0]`,$a0
add $a1,$a # modulo-scheduled h+=Sigma0(a)
mov $a2,$a1
ror \$`$sigma1[1]-$sigma1[0]`,$a2
xor $T1,$a0
shr \$$sigma0[2],$T1
ror \$$sigma0[0],$a0
xor $a1,$a2
shr \$$sigma1[2],$a1
ror \$$sigma1[0],$a2
xor $a0,$T1 # sigma0(X[(i+1)&0xf])
xor $a1,$a2 # sigma1(X[(i+14)&0xf])
add `$SZ*(($i+9)&0xf)`(%rsp),$T1
add `$SZ*($i&0xf)`(%rsp),$T1
mov $e,$a0
add $a2,$T1
mov $a,$a1
___
&ROUND_00_15(@_);
}
$code=<<___;
.text
.globl $func
.type $func,\@function,3,"unwind"
.align 16
$func:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
.cfi_adjust_cfa_offset $framesz
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
.cfi_end_prologue
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
jmp .Lloop
.align 16
.Lloop:
mov $B,$a3
lea $TABLE(%rip),$Tbl
xor $C,$a3 # magic
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
$code.=" mov @ROT[4],$a0\n";
$code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT));
}
$code.=<<___;
jmp .Lrounds_16_xx
.align 16
.Lrounds_16_xx:
___
for(;$i<32;$i++) {
&ROUND_16_XX($i,@ROT);
unshift(@ROT,pop(@ROT));
}
$code.=<<___;
cmpb \$0x19,`$SZ-1`($Tbl)
jnz .Lrounds_16_xx
mov $_ctx,$ctx
add $a1,$A # modulo-scheduled h+=Sigma0(a)
lea 16*$SZ($inp),$inp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop
lea $framesz+6*8(%rsp),%r11
.cfi_def_cfa %r11,8
mov $framesz(%rsp),%r15
.cfi_restore %r15
mov -40(%r11),%r14
.cfi_restore %r14
mov -32(%r11),%r13
.cfi_restore %r13
mov -24(%r11),%r12
.cfi_restore %r12
mov -16(%r11),%rbp
.cfi_restore %rbp
mov -8(%r11),%rbx
.cfi_restore %rbx
.cfi_epilogue
lea (%r11),%rsp
ret
.cfi_endproc
.size $func,.-$func
.align 64
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") : # Win64 order
("%rdi","%rsi","%rdx"); # Unix order
$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,\@abi-omnipotent
.align 16
${pre}sha256_emit:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
bswap %r8
mov 24($inp), %r11
bswap %r9
mov %r8d, 4($out)
bswap %r10
mov %r9d, 12($out)
bswap %r11
mov %r10d, 20($out)
shr \$32, %r8
mov %r11d, 28($out)
shr \$32, %r9
mov %r8d, 0($out)
shr \$32, %r10
mov %r9d, 8($out)
shr \$32, %r11
mov %r10d, 16($out)
mov %r11d, 24($out)
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit
.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,\@abi-omnipotent
.align 16
${pre}sha256_bcopy:
sub $inp, $out
.Loop_bcopy:
movzb ($inp), %eax
lea 1($inp), $inp
mov %al, -1($out,$inp)
dec $len
jnz .Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy
.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,\@abi-omnipotent
.align 16
${pre}sha256_hcopy:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
mov 24($inp), %r11
mov %r8, 0($out)
mov %r9, 8($out)
mov %r10, 16($out)
mov %r11, 24($out)
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
print $_,"\n";
}
close STDOUT;

789
blst/asm/sha256-x86_64.pl Executable file

@ -0,0 +1,789 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for x86_64.
#
# This module is stripped of AVX and even scalar code paths, with
# rationale that
#
# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one*
# processor, venerable Sandy Bridge;
# b) AVX2 incurs costly power transitions, which would be justifiable
# if AVX2 code was executing most of the time, which is not the
# case in this context;
# c) all contemporary processors support SSSE3, so that nobody would
# actually use scalar code path anyway;
#
# See original module at CRYPTOGAMS for further details.
$flavour = shift;
$output = pop;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
$pre="blst_";
$func="${pre}sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$ctx="%rdi"; # 1st arg, zapped by $a3
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$framesz="16*$SZ+3*8";
$code=<<___;
.text
.align 64
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___
######################################################################
# SIMD code paths
#
{{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));
$code.=<<___;
.globl ${pre}sha256_block_data_order_shaext
.hidden ${pre}sha256_block_data_order_shaext
.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind"
.align 64
${pre}sha256_block_data_order_shaext:
.cfi_startproc
___
$code.=<<___ if ($win64);
sub \$0x58,%rsp
.cfi_adjust_cfa_offset 0x58
movaps %xmm6,-0x58(%r11)
.cfi_offset %xmm6,-0x60
movaps %xmm7,-0x48(%r11)
.cfi_offset %xmm7,-0x50
movaps %xmm8,-0x38(%r11)
.cfi_offset %xmm8,-0x40
movaps %xmm9,-0x28(%r11)
.cfi_offset %xmm9,-0x30
movaps %xmm10,-0x18(%r11)
.cfi_offset %xmm10,-0x20
.cfi_end_prologue
___
$code.=<<___;
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x100-0x80($Tbl),$TMP # byte swap mask
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
jmp .Loop_shaext
.align 16
.Loop_shaext:
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
pshufb $TMP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
movdqa 0*16-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
pshufb $TMP,@MSG[1]
movdqa $CDGH,$CDGH_SAVE # offload
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
nop
movdqa $ABEF,$ABEF_SAVE # offload
sha256rnds2 $CDGH,$ABEF
movdqa 1*16-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
pshufb $TMP,@MSG[2]
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
lea 0x40($inp),$inp
sha256msg1 @MSG[1],@MSG[0]
sha256rnds2 $CDGH,$ABEF
movdqa 2*16-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
pshufb $TMP,@MSG[3]
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[3],$TMP
palignr \$4,@MSG[2],$TMP
nop
paddd $TMP,@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
sha256rnds2 $CDGH,$ABEF
movdqa 3*16-0x80($Tbl),$Wi
paddd @MSG[3],$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[0],$TMP
palignr \$4,@MSG[3],$TMP
nop
paddd $TMP,@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
movdqa $i*16-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
nop
paddd $TMP,@MSG[2]
sha256msg1 @MSG[0],@MSG[3]
sha256rnds2 $CDGH,$ABEF
___
push(@MSG,shift(@MSG));
}
$code.=<<___;
movdqa 13*16-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
sha256rnds2 $CDGH,$ABEF
paddd $TMP,@MSG[2]
movdqa 14*16-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
sha256msg2 @MSG[1],@MSG[2]
movdqa $BSWAP,$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 15*16-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
nop
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
dec $num
nop
sha256rnds2 $CDGH,$ABEF
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
jnz .Loop_shaext
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movdqu $ABEF,($ctx)
movdqu $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
movaps -0x58(%r11),%xmm6
movaps -0x48(%r11),%xmm7
movaps -0x38(%r11),%xmm8
movaps -0x28(%r11),%xmm9
movaps -0x18(%r11),%xmm10
mov %r11,%rsp
.cfi_def_cfa %r11,8
.cfi_epilogue
___
$code.=<<___;
ret
.cfi_endproc
.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext
___
}}}
{{{
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
my $arg = pop;
$arg = "\$$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
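# Any &mnemonic() call without an explicit sub (the &ror/&mov/&xor/... calls
# in body_00_15 below) lands in AUTOLOAD, which appends one line of assembly
# to $code: arguments are written destination-first on the perl side, emitted
# in reversed (AT&T) operand order, and bare numeric arguments are turned
# into immediates.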
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
'&mov ($a,$a1)',
'&mov ($a4,$f)',
'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
'&xor ($a0,$e)',
'&xor ($a4,$g)', # f^g
'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
'&xor ($a1,$a)',
'&and ($a4,$e)', # (f^g)&e
'&xor ($a0,$e)',
'&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
'&mov ($a2,$a)',
'&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
'&xor ($a2,$b)', # a^b, b^c in next round
'&add ($h,$a4)', # h+=Ch(e,f,g)
'&ror ($a0,$Sigma1[0])', # Sigma1(e)
'&and ($a3,$a2)', # (b^c)&(a^b)
'&xor ($a1,$a)',
'&add ($h,$a0)', # h+=Sigma1(e)
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&ror ($a1,$Sigma0[0])', # Sigma0(a)
'&add ($d,$h)', # d+=h
'&add ($h,$a3)', # h+=Maj(a,b,c)
'&mov ($a0,$d)',
'&add ($a1,$h);'. # h+=Sigma0(a)
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
);
}
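# body_00_15() returns one SHA-256 round as a list of strings, one
# instruction per element. The SIMD code path below eval()s these strings one
# at a time so that scalar round instructions can be interleaved with the
# vector message-schedule instructions; roughly (illustrative pattern only,
# not the exact driver code):
#
#	my @insns = (&body_00_15(),&body_00_15());	# two rounds' worth
#	&palignr	($t0,@X[0],$SZ);		# one vector insn
#	eval(shift(@insns));				# one scalar insn
#	eval(shift(@insns));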
######################################################################
# SSSE3 code path
#
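# The sixteen message words of the current block live in four XMM registers
# (@X[0..3], four words each). Each pass over .Lssse3_00_47 emits sixteen
# rounds while producing the schedule for the following sixteen: sigma0 and
# sigma1 are synthesized from psrld/pslld/psrlq/pxor sequences, since SSSE3
# has no vector rotate, and the X[i]+K[i] sums for the upcoming rounds are
# pre-added with paddd and parked on the stack so the scalar rounds only need
# a single memory add (the "h+=X[i]+K[i]" step in body_00_15 above).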
{
my $Tbl = $inp;
my $_ctx="0(%rbp)";
my $_inp="8(%rbp)";
my $_end="16(%rbp)";
my $framesz=4*8+$win64*16*4+8;
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___;
.globl ${func}
.hidden ${func}
.type ${func},\@function,3,"unwind"
.align 64
${func}:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
.cfi_adjust_cfa_offset $framesz
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
mov $ctx,0(%rsp) # save ctx, 1st arg
#mov $inp,8(%rsp) # save inp, 2nd arg
mov %rdx,16(%rsp) # save end pointer, "3rd" arg
___
$code.=<<___ if ($win64);
movaps %xmm6,0x20(%rsp)
.cfi_offset %xmm6,-0x78
movaps %xmm7,0x30(%rsp)
.cfi_offset %xmm7,-0x68
movaps %xmm8,0x40(%rsp)
.cfi_offset %xmm8,-0x58
movaps %xmm9,0x50(%rsp)
.cfi_offset %xmm9,-0x48
___
$code.=<<___;
mov %rsp,%rbp
.cfi_def_cfa_register %rbp
.cfi_end_prologue
lea -16*$SZ(%rsp),%rsp
mov $SZ*0($ctx),$A
and \$-64,%rsp # align stack
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
$code.=<<___;
#movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4
#movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5
jmp .Lloop_ssse3
.align 16
.Lloop_ssse3:
movdqa $TABLE+`$SZ*$rounds`(%rip),$t3
mov $inp,$_inp # offload $inp
movdqu 0x00($inp),@X[0]
movdqu 0x10($inp),@X[1]
movdqu 0x20($inp),@X[2]
pshufb $t3,@X[0]
movdqu 0x30($inp),@X[3]
lea $TABLE(%rip),$Tbl
pshufb $t3,@X[1]
movdqa 0x00($Tbl),$t0
movdqa 0x10($Tbl),$t1
pshufb $t3,@X[2]
paddd @X[0],$t0
movdqa 0x20($Tbl),$t2
pshufb $t3,@X[3]
movdqa 0x30($Tbl),$t3
paddd @X[1],$t1
paddd @X[2],$t2
paddd @X[3],$t3
movdqa $t0,0x00(%rsp)
mov $A,$a1
movdqa $t1,0x10(%rsp)
mov $B,$a3
movdqa $t2,0x20(%rsp)
xor $C,$a3 # magic
movdqa $t3,0x30(%rsp)
mov $E,$a0
jmp .Lssse3_00_47
.align 16
.Lssse3_00_47:
sub \$`-16*$SZ`,$Tbl # size optimization
___
sub Xupdate_256_SSSE3 () {
(
'&movdqa ($t0,@X[1]);',
'&movdqa ($t3,@X[3])',
'&palignr ($t0,@X[0],$SZ)', # X[1..4]
'&palignr ($t3,@X[2],$SZ);', # X[9..12]
'&movdqa ($t1,$t0)',
'&movdqa ($t2,$t0);',
'&psrld ($t0,$sigma0[2])',
'&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
'&psrld ($t2,$sigma0[0])',
'&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
'&pslld ($t1,8*$SZ-$sigma0[1]);'.
'&pxor ($t0,$t2)',
'&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
'&pxor ($t0,$t1)',
'&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
'&pxor ($t0,$t2);',
'&movdqa ($t2,$t3)',
'&pxor ($t0,$t1);', # sigma0(X[1..4])
'&psrld ($t3,$sigma1[2])',
'&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
'&psrlq ($t2,$sigma1[0])',
'&pxor ($t3,$t2);',
'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
'&pxor ($t3,$t2)',
'&pshufb ($t3,$t4)', # sigma1(X[14..15])
'&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
'&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
'&movdqa ($t2,$t3);',
'&psrld ($t3,$sigma1[2])',
'&psrlq ($t2,$sigma1[0])',
'&pxor ($t3,$t2);',
'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
'&pxor ($t3,$t2);',
'&movdqa ($t2,16*$j."($Tbl)")',
'&pshufb ($t3,$t5)',
'&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
);
}
sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
if (0) {
foreach (Xupdate_256_SSSE3()) { # 36 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
} else { # squeeze extra 4% on Westmere and 19% on Atom
eval(shift(@insns)); #@
&movdqa ($t0,@X[1]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa ($t3,@X[3]);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
&palignr ($t0,@X[0],$SZ); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
&palignr ($t3,@X[2],$SZ); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&movdqa ($t1,$t0);
eval(shift(@insns));
eval(shift(@insns));
&movdqa ($t2,$t0);
eval(shift(@insns)); #@
eval(shift(@insns));
&psrld ($t0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[0],$t3); # X[0..3] += X[9..12]
eval(shift(@insns)); #@
eval(shift(@insns));
&psrld ($t2,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
eval(shift(@insns));
eval(shift(@insns)); #@
&pslld ($t1,8*$SZ-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t0,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&psrld ($t2,$sigma0[1]-$sigma0[0]);
eval(shift(@insns));
&pxor ($t0,$t1);
eval(shift(@insns));
eval(shift(@insns));
&pslld ($t1,$sigma0[1]-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t0,$t2);
eval(shift(@insns));
eval(shift(@insns)); #@
&movdqa ($t2,$t3);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t0,$t1); # sigma0(X[1..4])
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
&psrld ($t3,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns)); #@
eval(shift(@insns));
&psrlq ($t2,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&psrlq ($t2,$sigma1[1]-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
#&pshufb ($t3,$t4); # sigma1(X[14..15])
&pshufd ($t3,$t3,0b10000000);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psrldq ($t3,8);
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pshufd ($t3,@X[0],0b01010000); # X[16..17]
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
&movdqa ($t2,$t3);
eval(shift(@insns));
eval(shift(@insns));
&psrld ($t3,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns)); #@
&psrlq ($t2,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
&psrlq ($t2,$sigma1[1]-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
#&pshufb ($t3,$t5);
&pshufd ($t3,$t3,0b00001000);
eval(shift(@insns));
eval(shift(@insns));
&movdqa ($t2,16*$j."($Tbl)");
eval(shift(@insns)); #@
eval(shift(@insns));
&pslldq ($t3,8);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
}
&paddd ($t2,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&movdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
&SSSE3_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
&jne (".Lssse3_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
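# The .Lssse3_00_47 loop above covers the rounds for which fresh schedule
# words still have to be produced; the sixteen rounds emitted here consume
# the X[i]+K[i] values already sitting on the stack, so no further vector
# work is needed before the state words are folded back into the context.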
$code.=<<___;
mov $_ctx,$ctx
mov $a1,$A
mov $_inp,$inp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
lea 16*$SZ($inp),$inp
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop_ssse3
xorps %xmm0, %xmm0
lea $framesz+6*8(%rbp),%r11
.cfi_def_cfa %r11,8
movaps %xmm0, 0x00(%rsp) # scrub the stack
movaps %xmm0, 0x10(%rsp)
movaps %xmm0, 0x20(%rsp)
movaps %xmm0, 0x30(%rsp)
___
$code.=<<___ if ($win64);
movaps 0x20(%rbp),%xmm6
movaps 0x30(%rbp),%xmm7
movaps 0x40(%rbp),%xmm8
movaps 0x50(%rbp),%xmm9
___
$code.=<<___;
mov $framesz(%rbp),%r15
.cfi_restore %r15
mov -40(%r11),%r14
.cfi_restore %r14
mov -32(%r11),%r13
.cfi_restore %r13
mov -24(%r11),%r12
.cfi_restore %r12
mov -16(%r11),%rbx
.cfi_restore %rbx
mov -8(%r11),%rbp
.cfi_restore %rbp
.cfi_epilogue
lea (%r11),%rsp
ret
.cfi_endproc
.size ${func},.-${func}
___
}
}}}
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") : # Win64 order
("%rdi","%rsi","%rdx"); # Unix order
$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,\@abi-omnipotent
.align 16
${pre}sha256_emit:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
bswap %r8
mov 24($inp), %r11
bswap %r9
mov %r8d, 4($out)
bswap %r10
mov %r9d, 12($out)
bswap %r11
mov %r10d, 20($out)
shr \$32, %r8
mov %r11d, 28($out)
shr \$32, %r9
mov %r8d, 0($out)
shr \$32, %r10
mov %r9d, 8($out)
shr \$32, %r11
mov %r10d, 16($out)
mov %r11d, 24($out)
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit
.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,\@abi-omnipotent
.align 16
${pre}sha256_bcopy:
sub $inp, $out
.Loop_bcopy:
movzb ($inp), %eax
lea 1($inp), $inp
mov %al, -1($out,$inp)
dec $len
jnz .Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy
.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,\@abi-omnipotent
.align 16
${pre}sha256_hcopy:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
mov 24($inp), %r11
mov %r8, 0($out)
mov %r9, 8($out)
mov %r10, 16($out)
mov %r11, 24($out)
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}
sub sha256op38 {
my $instr = shift;
my %opcodelet = (
"sha256rnds2" => 0xcb,
"sha256msg1" => 0xcc,
"sha256msg2" => 0xcd );
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
my @opcode=(0x0f,0x38);
push @opcode,$opcodelet{$instr};
push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
} else {
	return $instr."\t".$_[0];
}
}
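# sha256op38() rewrites the three SHA-NI mnemonics into explicit ".byte"
# sequences (0x0f,0x38 followed by the opcode byte and a register-register
# ModR/M byte) so the output assembles even with toolchains that predate the
# SHA extensions; anything that does not match the %xmm0-%xmm7,%xmm0-%xmm7
# form is passed through verbatim.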
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
print $_,"\n";
}
close STDOUT;
