initial stuff

commit 943c07066e
John Doe, 2022-09-09 02:47:49 -04:00
99 changed files with 58786 additions and 0 deletions


@@ -0,0 +1,379 @@
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,%function
.align 5
add_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
adds x8,x8,x12
ldp x14,x15,[x2,#16]
adcs x9,x9,x13
ldp x4,x5,[x3]
adcs x10,x10,x14
ldp x6,x7,[x3,#16]
adcs x11,x11,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
stp x8,x9,[x0]
csel x11,x11,x2,lo
stp x10,x11,[x0,#16]
ret
.size add_mod_256,.-add_mod_256
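For orientation, a hypothetical C sketch of what add_mod_256 computes: (|a| + |b|) mod |p| on 4x64-bit little-endian limbs, with both inputs assumed already reduced. It mirrors the add / trial-subtract / masked-select structure above; the function name and the GCC/Clang unsigned __int128 usage are this sketch's own, and plain C carries no constant-time guarantee, which is what the assembly is for.

#include <stdint.h>

static void add_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                            const uint64_t b[4], const uint64_t p[4])
{
    uint64_t sum[4], red[4], carry = 0, borrow = 0;

    for (int i = 0; i < 4; i++) {                 /* sum = a + b, carry out    */
        unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
        sum[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
    for (int i = 0; i < 4; i++) {                 /* red = sum - p, borrow out */
        unsigned __int128 t = (unsigned __int128)sum[i] - p[i] - borrow;
        red[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    /* keep the raw sum only when carry:sum < p (no carry out, but a borrow);
       the masked select mirrors the csel idiom above */
    uint64_t mask = (uint64_t)0 - ((carry ^ 1) & borrow);
    for (int i = 0; i < 4; i++)
        ret[i] = (sum[i] & mask) | (red[i] & ~mask);
}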
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,%function
.align 5
mul_by_3_mod_256:
ldp x12,x13,[x1]
ldp x14,x15,[x1,#16]
adds x8,x12,x12
ldp x4,x5,[x2]
adcs x9,x13,x13
ldp x6,x7,[x2,#16]
adcs x10,x14,x14
adcs x11,x15,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
csel x11,x11,x2,lo
adds x8,x8,x12
adcs x9,x9,x13
adcs x10,x10,x14
adcs x11,x11,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
stp x8,x9,[x0]
csel x11,x11,x2,lo
stp x10,x11,[x0,#16]
ret
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,%function
.align 5
lshift_mod_256:
ldp x8,x9,[x1]
ldp x10,x11,[x1,#16]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
.Loop_lshift_mod_256:
adds x8,x8,x8
sub x2,x2,#1
adcs x9,x9,x9
adcs x10,x10,x10
adcs x11,x11,x11
adc x3,xzr,xzr
subs x12,x8,x4
sbcs x13,x9,x5
sbcs x14,x10,x6
sbcs x15,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x12,lo
csel x9,x9,x13,lo
csel x10,x10,x14,lo
csel x11,x11,x15,lo
cbnz x2,.Loop_lshift_mod_256
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
ret
.size lshift_mod_256,.-lshift_mod_256
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,%function
.align 5
rshift_mod_256:
ldp x8,x9,[x1]
ldp x10,x11,[x1,#16]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
.Loop_rshift:
adds x12,x8,x4
sub x2,x2,#1
adcs x13,x9,x5
adcs x14,x10,x6
adcs x15,x11,x7
adc x3,xzr,xzr
tst x8,#1
csel x12,x12,x8,ne
csel x13,x13,x9,ne
csel x14,x14,x10,ne
csel x15,x15,x11,ne
csel x3,x3,xzr,ne
extr x8,x13,x12,#1
extr x9,x14,x13,#1
extr x10,x15,x14,#1
extr x11,x3,x15,#1
cbnz x2,.Loop_rshift
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
ret
.size rshift_mod_256,.-rshift_mod_256
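A hypothetical sketch of the rshift_mod_256 loop above: divide by 2 modulo an odd |p|, |count| times. If the current value is odd, adding the odd modulus makes it even without changing it mod p, so the halving is exact; the carry out of that addition supplies the top bit of the shift. Same limb convention and caveats as the earlier sketch; names are the sketch's own.

#include <stdint.h>
#include <stddef.h>

static void rshift_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                               size_t count, const uint64_t p[4])
{
    uint64_t v[4] = { a[0], a[1], a[2], a[3] };

    while (count--) {
        uint64_t carry = 0;
        if (v[0] & 1) {                           /* the asm selects with csel */
            for (int i = 0; i < 4; i++) {
                unsigned __int128 t = (unsigned __int128)v[i] + p[i] + carry;
                v[i]  = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
        }
        for (int i = 0; i < 3; i++)               /* 256-bit right shift by 1  */
            v[i] = (v[i] >> 1) | (v[i + 1] << 63);
        v[3] = (v[3] >> 1) | (carry << 63);
    }
    for (int i = 0; i < 4; i++)
        ret[i] = v[i];
}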
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,%function
.align 5
cneg_mod_256:
ldp x8,x9,[x1]
ldp x4,x5,[x3]
ldp x10,x11,[x1,#16]
subs x12,x4,x8
ldp x6,x7,[x3,#16]
orr x4,x8,x9
sbcs x13,x5,x9
orr x5,x10,x11
sbcs x14,x6,x10
orr x3,x4,x5
sbc x15,x7,x11
cmp x3,#0
csetm x3,ne
ands x2,x2,x3
csel x8,x8,x12,eq
csel x9,x9,x13,eq
csel x10,x10,x14,eq
stp x8,x9,[x0]
csel x11,x11,x15,eq
stp x10,x11,[x0,#16]
ret
.size cneg_mod_256,.-cneg_mod_256
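cneg_mod_256 is a conditional negation: when |flag| is non-zero and |a| is non-zero it stores p - a, otherwise it copies |a| through, so zero never turns into p. A hypothetical sketch, same limb convention and constant-time caveat as above:

#include <stdint.h>

static void cneg_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                             uint64_t flag, const uint64_t p[4])
{
    uint64_t neg[4], borrow = 0, nonzero = 0;

    for (int i = 0; i < 4; i++) {                 /* neg = p - a               */
        unsigned __int128 t = (unsigned __int128)p[i] - a[i] - borrow;
        neg[i]   = (uint64_t)t;
        borrow   = (uint64_t)(t >> 64) & 1;
        nonzero |= a[i];
    }
    uint64_t mask = (uint64_t)0 - (uint64_t)(flag != 0 && nonzero != 0);
    for (int i = 0; i < 4; i++)                   /* select neg or the original */
        ret[i] = (neg[i] & mask) | (a[i] & ~mask);
}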
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,%function
.align 5
sub_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
subs x8,x8,x12
ldp x14,x15,[x2,#16]
sbcs x9,x9,x13
ldp x4,x5,[x3]
sbcs x10,x10,x14
ldp x6,x7,[x3,#16]
sbcs x11,x11,x15
sbc x3,xzr,xzr
and x4,x4,x3
and x5,x5,x3
adds x8,x8,x4
and x6,x6,x3
adcs x9,x9,x5
and x7,x7,x3
adcs x10,x10,x6
stp x8,x9,[x0]
adc x11,x11,x7
stp x10,x11,[x0,#16]
ret
.size sub_mod_256,.-sub_mod_256
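sub_mod_256 is the mirror image of add_mod_256: subtract with borrow, then add |p| back masked by the borrow, which the code above does with the and/adcs chain instead of a branch. A hypothetical sketch:

#include <stdint.h>

static void sub_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                            const uint64_t b[4], const uint64_t p[4])
{
    uint64_t borrow = 0, carry = 0;

    for (int i = 0; i < 4; i++) {                 /* ret = a - b, borrow out   */
        unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
        ret[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    uint64_t mask = (uint64_t)0 - borrow;         /* all-ones iff a < b        */
    for (int i = 0; i < 4; i++) {                 /* add p back conditionally  */
        unsigned __int128 t = (unsigned __int128)ret[i] + (p[i] & mask) + carry;
        ret[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
}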
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,%function
.align 5
check_mod_256:
ldp x8,x9,[x0]
ldp x10,x11,[x0,#16]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
#ifdef __AARCH64EB__
rev x8,x8
rev x9,x9
rev x10,x10
rev x11,x11
#endif
subs xzr,x8,x4
sbcs xzr,x9,x5
orr x8,x8,x9
sbcs xzr,x10,x6
orr x8,x8,x10
sbcs xzr,x11,x7
orr x8,x8,x11
sbc x1,xzr,xzr
cmp x8,#0
mov x0,#1
csel x0,x0,xzr,ne
and x0,x0,x1
ret
.size check_mod_256,.-check_mod_256
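check_mod_256(a, p) answers whether |a| is a valid non-zero residue, i.e. returns 1 iff 0 < a < p (the big-endian rev path is ignored here). A hypothetical sketch:

#include <stdint.h>

static int check_mod_256_ref(const uint64_t a[4], const uint64_t p[4])
{
    uint64_t borrow = 0, nonzero = 0;

    for (int i = 0; i < 4; i++) {                 /* does a - p borrow?        */
        unsigned __int128 t = (unsigned __int128)a[i] - p[i] - borrow;
        borrow   = (uint64_t)(t >> 64) & 1;
        nonzero |= a[i];
    }
    return (int)(borrow & (uint64_t)(nonzero != 0));
}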
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,%function
.align 5
add_n_check_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
ldp x14,x15,[x2,#16]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
rev x10,x10
rev x14,x14
rev x11,x11
rev x15,x15
#endif
adds x8,x8,x12
ldp x4,x5,[x3]
adcs x9,x9,x13
ldp x6,x7,[x3,#16]
adcs x10,x10,x14
adcs x11,x11,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
csel x11,x11,x2,lo
orr x16, x8, x9
orr x17, x10, x11
orr x16, x16, x17
#ifdef __AARCH64EB__
rev x8,x8
rev x9,x9
rev x10,x10
rev x11,x11
#endif
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
mov x17, #1
cmp x16, #0
csel x0, x17, xzr, ne
ret
.size add_n_check_mod_256,.-add_n_check_mod_256
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,%function
.align 5
sub_n_check_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
ldp x14,x15,[x2,#16]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
rev x10,x10
rev x14,x14
rev x11,x11
rev x15,x15
#endif
subs x8,x8,x12
sbcs x9,x9,x13
ldp x4,x5,[x3]
sbcs x10,x10,x14
ldp x6,x7,[x3,#16]
sbcs x11,x11,x15
sbc x3,xzr,xzr
and x4,x4,x3
and x5,x5,x3
adds x8,x8,x4
and x6,x6,x3
adcs x9,x9,x5
and x7,x7,x3
adcs x10,x10,x6
adc x11,x11,x7
orr x16, x8, x9
orr x17, x10, x11
orr x16, x16, x17
#ifdef __AARCH64EB__
rev x8,x8
rev x9,x9
rev x10,x10
rev x11,x11
#endif
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
mov x17, #1
cmp x16, #0
csel x0, x17, xzr, ne
ret
.size sub_n_check_mod_256,.-sub_n_check_mod_256


@@ -0,0 +1,572 @@
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,@function
.align 32
add_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
.Loaded_a_add_mod_256:
addq 0(%rdx),%r8
adcq 8(%rdx),%r9
movq %r8,%rax
adcq 16(%rdx),%r10
movq %r9,%rsi
adcq 24(%rdx),%r11
sbbq %rdx,%rdx
movq %r10,%rbx
subq 0(%rcx),%r8
sbbq 8(%rcx),%r9
sbbq 16(%rcx),%r10
movq %r11,%rbp
sbbq 24(%rcx),%r11
sbbq $0,%rdx
cmovcq %rax,%r8
cmovcq %rsi,%r9
movq %r8,0(%rdi)
cmovcq %rbx,%r10
movq %r9,8(%rdi)
cmovcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size add_mod_256,.-add_mod_256
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,@function
.align 32
mul_by_3_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
movq %rdx,%rcx
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq %rsi,%rdx
movq 24(%rsi),%r11
call __lshift_mod_256
movq 0(%rsp),%r12
.cfi_restore %r12
jmp .Loaded_a_add_mod_256
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.type __lshift_mod_256,@function
.align 32
__lshift_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
addq %r8,%r8
adcq %r9,%r9
movq %r8,%rax
adcq %r10,%r10
movq %r9,%rsi
adcq %r11,%r11
sbbq %r12,%r12
movq %r10,%rbx
subq 0(%rcx),%r8
sbbq 8(%rcx),%r9
sbbq 16(%rcx),%r10
movq %r11,%rbp
sbbq 24(%rcx),%r11
sbbq $0,%r12
cmovcq %rax,%r8
cmovcq %rsi,%r9
cmovcq %rbx,%r10
cmovcq %rbp,%r11
.byte 0xf3,0xc3
.cfi_endproc
.size __lshift_mod_256,.-__lshift_mod_256
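The x86_64 mul_by_3_mod_256 above doubles via this __lshift_mod_256 helper and then jumps into the add_mod_256 tail, since 3*a = 2*a + a (mod p). A hypothetical sketch of a single modular doubling, with the same limb convention and constant-time caveat as the earlier sketches:

#include <stdint.h>

static void lshift_mod_256_once_ref(uint64_t a[4], const uint64_t p[4])
{
    uint64_t dbl[4], red[4], carry = 0, borrow = 0;

    for (int i = 0; i < 4; i++) {                 /* dbl = 2*a, carry out      */
        unsigned __int128 t = ((unsigned __int128)a[i] << 1) + carry;
        dbl[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
    for (int i = 0; i < 4; i++) {                 /* red = dbl - p, borrow out */
        unsigned __int128 t = (unsigned __int128)dbl[i] - p[i] - borrow;
        red[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    uint64_t mask = (uint64_t)0 - ((carry ^ 1) & borrow);   /* keep dbl iff < p */
    for (int i = 0; i < 4; i++)
        a[i] = (dbl[i] & mask) | (red[i] & ~mask);
}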
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,@function
.align 32
lshift_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
.Loop_lshift_mod_256:
call __lshift_mod_256
decl %edx
jnz .Loop_lshift_mod_256
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 0(%rsp),%r12
.cfi_restore %r12
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size lshift_mod_256,.-lshift_mod_256
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,@function
.align 32
rshift_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%rbp
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
.Loop_rshift_mod_256:
movq %rbp,%r8
andq $1,%rbp
movq 0(%rcx),%rax
negq %rbp
movq 8(%rcx),%rsi
movq 16(%rcx),%rbx
andq %rbp,%rax
andq %rbp,%rsi
andq %rbp,%rbx
andq 24(%rcx),%rbp
addq %rax,%r8
adcq %rsi,%r9
adcq %rbx,%r10
adcq %rbp,%r11
sbbq %rax,%rax
shrq $1,%r8
movq %r9,%rbp
shrq $1,%r9
movq %r10,%rbx
shrq $1,%r10
movq %r11,%rsi
shrq $1,%r11
shlq $63,%rbp
shlq $63,%rbx
orq %r8,%rbp
shlq $63,%rsi
orq %rbx,%r9
shlq $63,%rax
orq %rsi,%r10
orq %rax,%r11
decl %edx
jnz .Loop_rshift_mod_256
movq %rbp,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size rshift_mod_256,.-rshift_mod_256
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,@function
.align 32
cneg_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
movq 0(%rsi),%r12
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq %r12,%r8
movq 24(%rsi),%r11
orq %r9,%r12
orq %r10,%r12
orq %r11,%r12
movq $-1,%rbp
movq 0(%rcx),%rax
cmovnzq %rbp,%r12
movq 8(%rcx),%rsi
movq 16(%rcx),%rbx
andq %r12,%rax
movq 24(%rcx),%rbp
andq %r12,%rsi
andq %r12,%rbx
andq %r12,%rbp
subq %r8,%rax
sbbq %r9,%rsi
sbbq %r10,%rbx
sbbq %r11,%rbp
orq %rdx,%rdx
cmovzq %r8,%rax
cmovzq %r9,%rsi
movq %rax,0(%rdi)
cmovzq %r10,%rbx
movq %rsi,8(%rdi)
cmovzq %r11,%rbp
movq %rbx,16(%rdi)
movq %rbp,24(%rdi)
movq 0(%rsp),%r12
.cfi_restore %r12
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size cneg_mod_256,.-cneg_mod_256
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,@function
.align 32
sub_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
subq 0(%rdx),%r8
movq 0(%rcx),%rax
sbbq 8(%rdx),%r9
movq 8(%rcx),%rsi
sbbq 16(%rdx),%r10
movq 16(%rcx),%rbx
sbbq 24(%rdx),%r11
movq 24(%rcx),%rbp
sbbq %rdx,%rdx
andq %rdx,%rax
andq %rdx,%rsi
andq %rdx,%rbx
andq %rdx,%rbp
addq %rax,%r8
adcq %rsi,%r9
movq %r8,0(%rdi)
adcq %rbx,%r10
movq %r9,8(%rdi)
adcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size sub_mod_256,.-sub_mod_256
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,@function
.align 32
check_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rdi),%rax
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq %rax,%r8
orq %r9,%rax
orq %r10,%rax
orq %r11,%rax
subq 0(%rsi),%r8
sbbq 8(%rsi),%r9
sbbq 16(%rsi),%r10
sbbq 24(%rsi),%r11
sbbq %rsi,%rsi
movq $1,%rdx
cmpq $0,%rax
cmovneq %rdx,%rax
andq %rsi,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size check_mod_256,.-check_mod_256
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,@function
.align 32
add_n_check_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
addq 0(%rdx),%r8
adcq 8(%rdx),%r9
movq %r8,%rax
adcq 16(%rdx),%r10
movq %r9,%rsi
adcq 24(%rdx),%r11
sbbq %rdx,%rdx
movq %r10,%rbx
subq 0(%rcx),%r8
sbbq 8(%rcx),%r9
sbbq 16(%rcx),%r10
movq %r11,%rbp
sbbq 24(%rcx),%r11
sbbq $0,%rdx
cmovcq %rax,%r8
cmovcq %rsi,%r9
movq %r8,0(%rdi)
cmovcq %rbx,%r10
movq %r9,8(%rdi)
cmovcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
orq %r9,%r8
orq %r11,%r10
orq %r10,%r8
movq $1,%rax
cmovzq %r8,%rax
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size add_n_check_mod_256,.-add_n_check_mod_256
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,@function
.align 32
sub_n_check_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
subq 0(%rdx),%r8
movq 0(%rcx),%rax
sbbq 8(%rdx),%r9
movq 8(%rcx),%rsi
sbbq 16(%rdx),%r10
movq 16(%rcx),%rbx
sbbq 24(%rdx),%r11
movq 24(%rcx),%rbp
sbbq %rdx,%rdx
andq %rdx,%rax
andq %rdx,%rsi
andq %rdx,%rbx
andq %rdx,%rbp
addq %rax,%r8
adcq %rsi,%r9
movq %r8,0(%rdi)
adcq %rbx,%r10
movq %r9,8(%rdi)
adcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
orq %r9,%r8
orq %r11,%r10
orq %r10,%r8
movq $1,%rax
cmovzq %r8,%rax
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size sub_n_check_mod_256,.-sub_n_check_mod_256
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:


@@ -0,0 +1,931 @@
.text
.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
add_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size add_mod_384,.-add_mod_384
.type __add_mod_384,%function
.align 5
__add_mod_384:
ldp x10,x11,[x1]
ldp x16,x17,[x2]
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
__add_mod_384_ab_are_loaded:
adds x10,x10,x16
adcs x11,x11,x17
adcs x12,x12,x19
adcs x13,x13,x20
adcs x14,x14,x21
adcs x15,x15,x22
adc x3,xzr,xzr
subs x16,x10,x4
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr
csel x10,x10,x16,lo
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.size __add_mod_384,.-__add_mod_384
.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
add_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __add_mod_384
stp x10,x11,[x0]
add x1,x1,#48
stp x12,x13,[x0,#16]
add x2,x2,#48
stp x14,x15,[x0,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size add_mod_384x,.-add_mod_384x
.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
rshift_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
.Loop_rshift_mod_384:
sub x2,x2,#1
bl __rshift_mod_384
cbnz x2,.Loop_rshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size rshift_mod_384,.-rshift_mod_384
.type __rshift_mod_384,%function
.align 5
__rshift_mod_384:
sbfx x22,x10,#0,#1
and x16,x22,x4
and x17,x22,x5
adds x10,x10,x16
and x19,x22,x6
adcs x11,x11,x17
and x20,x22,x7
adcs x12,x12,x19
and x21,x22,x8
adcs x13,x13,x20
and x22,x22,x9
adcs x14,x14,x21
extr x10,x11,x10,#1 // a[0:5] >>= 1
adcs x15,x15,x22
extr x11,x12,x11,#1
adc x22,xzr,xzr
extr x12,x13,x12,#1
extr x13,x14,x13,#1
extr x14,x15,x14,#1
extr x15,x22,x15,#1
ret
.size __rshift_mod_384,.-__rshift_mod_384
.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
div_by_2_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __rshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size div_by_2_mod_384,.-div_by_2_mod_384
.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
lshift_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
.Loop_lshift_mod_384:
sub x2,x2,#1
bl __lshift_mod_384
cbnz x2,.Loop_lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size lshift_mod_384,.-lshift_mod_384
.type __lshift_mod_384,%function
.align 5
__lshift_mod_384:
adds x10,x10,x10
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr
subs x16,x10,x4
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr
csel x10,x10,x16,lo
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.size __lshift_mod_384,.-__lshift_mod_384
.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
mul_by_3_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384
.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
mul_by_8_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384
.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
mul_by_3_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384
ldp x16,x17,[x1,#48]
ldp x19,x20,[x1,#64]
ldp x21,x22,[x1,#80]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x
.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
mul_by_8_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x
.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
cneg_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x4,x5,[x3]
ldp x12,x13,[x1,#16]
ldp x6,x7,[x3,#16]
subs x16,x4,x10
ldp x14,x15,[x1,#32]
ldp x8,x9,[x3,#32]
orr x3,x10,x11
sbcs x17,x5,x11
orr x3,x3,x12
sbcs x19,x6,x12
orr x3,x3,x13
sbcs x20,x7,x13
orr x3,x3,x14
sbcs x21,x8,x14
orr x3,x3,x15
sbc x22,x9,x15
cmp x3,#0
csetm x3,ne
ands x2,x2,x3
csel x10,x10,x16,eq
csel x11,x11,x17,eq
csel x12,x12,x19,eq
csel x13,x13,x20,eq
stp x10,x11,[x0]
csel x14,x14,x21,eq
stp x12,x13,[x0,#16]
csel x15,x15,x22,eq
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size cneg_mod_384,.-cneg_mod_384
.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
sub_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size sub_mod_384,.-sub_mod_384
.type __sub_mod_384,%function
.align 5
__sub_mod_384:
ldp x10,x11,[x1]
ldp x16,x17,[x2]
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
subs x10,x10,x16
sbcs x11,x11,x17
sbcs x12,x12,x19
sbcs x13,x13,x20
sbcs x14,x14,x21
sbcs x15,x15,x22
sbc x3,xzr,xzr
and x16,x4,x3
and x17,x5,x3
adds x10,x10,x16
and x19,x6,x3
adcs x11,x11,x17
and x20,x7,x3
adcs x12,x12,x19
and x21,x8,x3
adcs x13,x13,x20
and x22,x9,x3
adcs x14,x14,x21
adc x15,x15,x22
ret
.size __sub_mod_384,.-__sub_mod_384
.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
sub_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384
stp x10,x11,[x0]
add x1,x1,#48
stp x12,x13,[x0,#16]
add x2,x2,#48
stp x14,x15,[x0,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size sub_mod_384x,.-sub_mod_384x
.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
mul_by_1_plus_i_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
add x2,x1,#48
bl __sub_mod_384 // a->re - a->im
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __add_mod_384_ab_are_loaded // a->re + a->im
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
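For reference: with i^2 = -1, (re + im*i) * (1 + i) = (re - im) + (re + im)*i, which is why the routine above is just one __sub_mod_384 on (a->re, a->im) followed by one __add_mod_384 on the same pair, writing the two halves of the result.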
.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
sgn0_pty_mod_384:
ldp x10,x11,[x0]
ldp x12,x13,[x0,#16]
ldp x14,x15,[x0,#32]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
ldp x8,x9,[x1,#32]
and x0,x10,#1
adds x10,x10,x10
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr
subs x10,x10,x4
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x3,x3,xzr
mvn x3,x3
and x3,x3,#2
orr x0,x0,x3
ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384
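sgn0_pty_mod_384 packs two bits: bit 0 is the parity of |a|, bit 1 is set when 2*a >= p, the "sign" convention used by hash-to-curve style encodings. A hypothetical 6-limb sketch, with the usual caveat that the name and the C rendering are this sketch's own:

#include <stdint.h>

static uint64_t sgn0_pty_mod_384_ref(const uint64_t a[6], const uint64_t p[6])
{
    uint64_t dbl[6], carry = 0, borrow = 0;

    for (int i = 0; i < 6; i++) {                 /* dbl = 2*a, carry out      */
        unsigned __int128 t = ((unsigned __int128)a[i] << 1) + carry;
        dbl[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
    for (int i = 0; i < 6; i++) {                 /* trial subtraction of p    */
        unsigned __int128 t = (unsigned __int128)dbl[i] - p[i] - borrow;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    uint64_t sign = (carry | (borrow ^ 1)) & 1;   /* 1 iff 2*a >= p            */
    return (a[0] & 1) | (sign << 1);
}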
.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
sgn0_pty_mod_384x:
ldp x10,x11,[x0]
ldp x12,x13,[x0,#16]
ldp x14,x15,[x0,#32]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
ldp x8,x9,[x1,#32]
and x2,x10,#1
orr x3,x10,x11
adds x10,x10,x10
orr x3,x3,x12
adcs x11,x11,x11
orr x3,x3,x13
adcs x12,x12,x12
orr x3,x3,x14
adcs x13,x13,x13
orr x3,x3,x15
adcs x14,x14,x14
adcs x15,x15,x15
adc x16,xzr,xzr
subs x10,x10,x4
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x16,x16,xzr
ldp x10,x11,[x0,#48]
ldp x12,x13,[x0,#64]
ldp x14,x15,[x0,#80]
mvn x16,x16
and x16,x16,#2
orr x2,x2,x16
and x0,x10,#1
orr x1,x10,x11
adds x10,x10,x10
orr x1,x1,x12
adcs x11,x11,x11
orr x1,x1,x13
adcs x12,x12,x12
orr x1,x1,x14
adcs x13,x13,x13
orr x1,x1,x15
adcs x14,x14,x14
adcs x15,x15,x15
adc x16,xzr,xzr
subs x10,x10,x4
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x16,x16,xzr
mvn x16,x16
and x16,x16,#2
orr x0,x0,x16
cmp x3,#0
csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re)
cmp x1,#0
csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)
and x3,x3,#1
and x1,x1,#2
orr x0,x1,x3 // pack sign and parity
ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
.globl vec_select_48
.hidden vec_select_48
.type vec_select_48,%function
.align 5
vec_select_48:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
bit v1.16b, v4.16b, v6.16b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0]
ret
.size vec_select_48,.-vec_select_48
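vec_select_48 and the wider variants below are constant-time selects: the whole 48/96/144/192/288-byte operand |a| is copied when |sel_a| is non-zero, |b| otherwise, with no data-dependent branch (cmeq builds the mask, BIT merges). A hypothetical byte-wise sketch with the length made an explicit parameter:

#include <stddef.h>
#include <stdint.h>

static void vec_select_ref(void *ret, const void *a, const void *b,
                           size_t n, uint64_t sel_a)
{
    unsigned char mask = (unsigned char)(0U - (sel_a != 0));  /* 0xFF or 0x00 */
    unsigned char *r = ret;
    const unsigned char *pa = a, *pb = b;

    for (size_t i = 0; i < n; i++)
        r[i] = (unsigned char)((pa[i] & mask) | (pb[i] & ~mask));
}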
.globl vec_select_96
.hidden vec_select_96
.type vec_select_96,%function
.align 5
vec_select_96:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_96,.-vec_select_96
.globl vec_select_192
.hidden vec_select_192
.type vec_select_192,%function
.align 5
vec_select_192:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_192,.-vec_select_192
.globl vec_select_144
.hidden vec_select_144
.type vec_select_144,%function
.align 5
vec_select_144:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
bit v1.16b, v4.16b, v6.16b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0]
ret
.size vec_select_144,.-vec_select_144
.globl vec_select_288
.hidden vec_select_288
.type vec_select_288,%function
.align 5
vec_select_288:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_288,.-vec_select_288
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
vec_prefetch:
add x1, x1, x0
sub x1, x1, #1
mov x2, #64
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
prfm pldl1keep, [x0]
ret
.size vec_prefetch,.-vec_prefetch
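vec_prefetch(ptr, len) just warms the cache lines that cover [ptr, ptr+len), clamping at the last byte; the assembly unrolls exactly seven prfm pldl1keep's and stops advancing once it reaches the end. A rough sketch using the GCC/Clang prefetch builtin (this sketch's own choice, not part of the commit):

#include <stddef.h>

static void vec_prefetch_ref(const void *ptr, size_t len)
{
    const char *p   = (const char *)ptr;
    const char *end = p + len - 1;                /* last byte to cover        */

    for (int i = 0; i < 7; i++) {                 /* seven prefetches, as above */
        __builtin_prefetch(p);
        p = (p + 64 > end) ? end : p + 64;        /* clamp, as the csel's do   */
    }
}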

File diff suppressed because it is too large.


@@ -0,0 +1,252 @@
.text
.type __add_mod_384x384,@function
.align 32
__add_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq 48(%rsi),%r14
addq 0(%rdx),%r8
movq 56(%rsi),%r15
adcq 8(%rdx),%r9
movq 64(%rsi),%rax
adcq 16(%rdx),%r10
movq 72(%rsi),%rbx
adcq 24(%rdx),%r11
movq 80(%rsi),%rbp
adcq 32(%rdx),%r12
movq 88(%rsi),%rsi
adcq 40(%rdx),%r13
movq %r8,0(%rdi)
adcq 48(%rdx),%r14
movq %r9,8(%rdi)
adcq 56(%rdx),%r15
movq %r10,16(%rdi)
adcq 64(%rdx),%rax
movq %r12,32(%rdi)
movq %r14,%r8
adcq 72(%rdx),%rbx
movq %r11,24(%rdi)
movq %r15,%r9
adcq 80(%rdx),%rbp
movq %r13,40(%rdi)
movq %rax,%r10
adcq 88(%rdx),%rsi
movq %rbx,%r11
sbbq %rdx,%rdx
subq 0(%rcx),%r14
sbbq 8(%rcx),%r15
movq %rbp,%r12
sbbq 16(%rcx),%rax
sbbq 24(%rcx),%rbx
sbbq 32(%rcx),%rbp
movq %rsi,%r13
sbbq 40(%rcx),%rsi
sbbq $0,%rdx
cmovcq %r8,%r14
cmovcq %r9,%r15
cmovcq %r10,%rax
movq %r14,48(%rdi)
cmovcq %r11,%rbx
movq %r15,56(%rdi)
cmovcq %r12,%rbp
movq %rax,64(%rdi)
cmovcq %r13,%rsi
movq %rbx,72(%rdi)
movq %rbp,80(%rdi)
movq %rsi,88(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __add_mod_384x384,.-__add_mod_384x384
.type __sub_mod_384x384,@function
.align 32
__sub_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq 48(%rsi),%r14
subq 0(%rdx),%r8
movq 56(%rsi),%r15
sbbq 8(%rdx),%r9
movq 64(%rsi),%rax
sbbq 16(%rdx),%r10
movq 72(%rsi),%rbx
sbbq 24(%rdx),%r11
movq 80(%rsi),%rbp
sbbq 32(%rdx),%r12
movq 88(%rsi),%rsi
sbbq 40(%rdx),%r13
movq %r8,0(%rdi)
sbbq 48(%rdx),%r14
movq 0(%rcx),%r8
movq %r9,8(%rdi)
sbbq 56(%rdx),%r15
movq 8(%rcx),%r9
movq %r10,16(%rdi)
sbbq 64(%rdx),%rax
movq 16(%rcx),%r10
movq %r11,24(%rdi)
sbbq 72(%rdx),%rbx
movq 24(%rcx),%r11
movq %r12,32(%rdi)
sbbq 80(%rdx),%rbp
movq 32(%rcx),%r12
movq %r13,40(%rdi)
sbbq 88(%rdx),%rsi
movq 40(%rcx),%r13
sbbq %rdx,%rdx
andq %rdx,%r8
andq %rdx,%r9
andq %rdx,%r10
andq %rdx,%r11
andq %rdx,%r12
andq %rdx,%r13
addq %r8,%r14
adcq %r9,%r15
movq %r14,48(%rdi)
adcq %r10,%rax
movq %r15,56(%rdi)
adcq %r11,%rbx
movq %rax,64(%rdi)
adcq %r12,%rbp
movq %rbx,72(%rdi)
adcq %r13,%rsi
movq %rbp,80(%rdi)
movq %rsi,88(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __sub_mod_384x384,.-__sub_mod_384x384
.globl add_mod_384x384
.hidden add_mod_384x384
.type add_mod_384x384,@function
.align 32
add_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
call __add_mod_384x384
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size add_mod_384x384,.-add_mod_384x384
.globl sub_mod_384x384
.hidden sub_mod_384x384
.type sub_mod_384x384,@function
.align 32
sub_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
call __sub_mod_384x384
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size sub_mod_384x384,.-sub_mod_384x384
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:


@@ -0,0 +1,784 @@
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
.inst 0xd503233f
stp x29, x30, [sp,#-80]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
sub sp, sp, #1040
ldp x4, x5, [x1,#8*0]
ldp x6, x7, [x1,#8*2]
add x1, sp, #16+511 // find closest 512-byte-aligned spot
and x1, x1, #-512 // in the frame...
str x0, [sp]
ldp x8, x9, [x2,#8*0]
ldp x10, x11, [x2,#8*2]
stp x4, x5, [x1,#8*0] // copy input to |a|
stp x6, x7, [x1,#8*2]
stp x8, x9, [x1,#8*4] // copy modulus to |b|
stp x10, x11, [x1,#8*6]
////////////////////////////////////////// first iteration
bl .Lab_approximation_31_256_loaded
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
str x12,[x0,#8*8] // initialize |u| with |f0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to dst |b|
bl __smul_256_n_shift_by_31
str x12, [x0,#8*9] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
ldr x8, [x1,#8*8] // |u|
ldr x9, [x1,#8*13] // |v|
madd x4, x16, x8, xzr // |u|*|f0|
madd x4, x17, x9, x4 // |v|*|g0|
str x4, [x0,#8*4]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*5]
stp x5, x5, [x0,#8*7]
madd x4, x12, x8, xzr // |u|*|f1|
madd x4, x13, x9, x4 // |v|*|g1|
str x4, [x0,#8*9]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*10]
stp x5, x5, [x0,#8*12]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
////////////////////////////////////////// two[!] last iterations
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #47 // 31 + 512 % 31
//bl __ab_approximation_62_256 // |a| and |b| are exact,
ldr x7, [x1,#8*0] // just load
ldr x11, [x1,#8*4]
bl __inner_loop_62_256
mov x16, x14
mov x17, x15
ldr x0, [sp] // original out_ptr
bl __smul_256x63
bl __smul_512x63_tail
ldr x30, [x29,#8]
smulh x20, x7, x17 // figure out top-most limb
ldp x8, x9, [x3,#8*0]
adc x23, x23, x25
ldp x10, x11, [x3,#8*2]
add x20, x20, x23 // x20 is 1, 0 or -1
asr x19, x20, #63 // sign as mask
and x23, x8, x19 // add mod<<256 conditionally
and x24, x9, x19
adds x4, x4, x23
and x25, x10, x19
adcs x5, x5, x24
and x26, x11, x19
adcs x6, x6, x25
adcs x7, x22, x26
adc x20, x20, xzr // x20 is 1, 0 or -1
neg x19, x20
orr x20, x20, x19 // excess bit or sign as mask
asr x19, x19, #63 // excess bit as mask
and x8, x8, x20 // mask |mod|
and x9, x9, x20
and x10, x10, x20
and x11, x11, x20
eor x8, x8, x19 // conditionally negate |mod|
eor x9, x9, x19
adds x8, x8, x19, lsr#63
eor x10, x10, x19
adcs x9, x9, xzr
eor x11, x11, x19
adcs x10, x10, xzr
adc x11, x11, xzr
adds x4, x4, x8 // final adjustment for |mod|<<256
adcs x5, x5, x9
adcs x6, x6, x10
stp x4, x5, [x0,#8*4]
adc x7, x7, x11
stp x6, x7, [x0,#8*6]
add sp, sp, #1040
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldr x29, [sp],#80
.inst 0xd50323bf
ret
.size ct_inverse_mod_256,.-ct_inverse_mod_256
////////////////////////////////////////////////////////////////////////
.type __smul_256x63, %function
.align 5
__smul_256x63:
ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|)
asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x6, x7, [x1,#8*2+64]
eor x16, x16, x14 // conditionally negate |f_| (or |g_|)
ldr x22, [x1,#8*4+64]
eor x4, x4, x14 // conditionally negate |u| (or |v|)
sub x16, x16, x14
eor x5, x5, x14
adds x4, x4, x14, lsr#63
eor x6, x6, x14
adcs x5, x5, xzr
eor x7, x7, x14
adcs x6, x6, xzr
eor x22, x22, x14
umulh x19, x4, x16
adcs x7, x7, xzr
umulh x20, x5, x16
adcs x22, x22, xzr
umulh x21, x6, x16
mul x4, x4, x16
cmp x16, #0
mul x5, x5, x16
csel x22, x22, xzr, ne
mul x6, x6, x16
adds x5, x5, x19
mul x24, x7, x16
adcs x6, x6, x20
adcs x24, x24, x21
adc x26, xzr, xzr
ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|)
asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x10, x11, [x1,#8*2+104]
eor x17, x17, x14 // conditionally negate |f_| (or |g_|)
ldr x23, [x1,#8*4+104]
eor x8, x8, x14 // conditionally negate |u| (or |v|)
sub x17, x17, x14
eor x9, x9, x14
adds x8, x8, x14, lsr#63
eor x10, x10, x14
adcs x9, x9, xzr
eor x11, x11, x14
adcs x10, x10, xzr
eor x23, x23, x14
umulh x19, x8, x17
adcs x11, x11, xzr
umulh x20, x9, x17
adcs x23, x23, xzr
umulh x21, x10, x17
adc x15, xzr, xzr // used in __smul_512x63_tail
mul x8, x8, x17
cmp x17, #0
mul x9, x9, x17
csel x23, x23, xzr, ne
mul x10, x10, x17
adds x9, x9, x19
mul x25, x11, x17
adcs x10, x10, x20
adcs x25, x25, x21
adc x26, x26, xzr
adds x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
stp x4, x5, [x0,#8*0]
adcs x24, x24, x25
stp x6, x24, [x0,#8*2]
ret
.size __smul_256x63,.-__smul_256x63
.type __smul_512x63_tail, %function
.align 5
__smul_512x63_tail:
umulh x24, x7, x16
ldp x5, x6, [x1,#8*18] // load rest of |v|
adc x26, x26, xzr
ldr x7, [x1,#8*20]
and x22, x22, x16
umulh x11, x11, x17 // resume |v|*|g1| chain
sub x24, x24, x22 // tie up |u|*|f1| chain
asr x25, x24, #63
eor x5, x5, x14 // conditionally negate rest of |v|
eor x6, x6, x14
adds x5, x5, x15
eor x7, x7, x14
adcs x6, x6, xzr
umulh x19, x23, x17
adc x7, x7, xzr
umulh x20, x5, x17
add x11, x11, x26
umulh x21, x6, x17
mul x4, x23, x17
mul x5, x5, x17
adds x4, x4, x11
mul x6, x6, x17
adcs x5, x5, x19
mul x22, x7, x17
adcs x6, x6, x20
adcs x22, x22, x21
adc x23, xzr, xzr // used in the final step
adds x4, x4, x24
adcs x5, x5, x25
adcs x6, x6, x25
stp x4, x5, [x0,#8*4]
adcs x22, x22, x25 // carry is used in the final step
stp x6, x22, [x0,#8*6]
ret
.size __smul_512x63_tail,.-__smul_512x63_tail
.type __smul_256_n_shift_by_31, %function
.align 5
__smul_256_n_shift_by_31:
ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|)
asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x6, x7, [x1,#8*2+0]
eor x25, x12, x24 // conditionally negate |f0| (or |g0|)
eor x4, x4, x24 // conditionally negate |a| (or |b|)
sub x25, x25, x24
eor x5, x5, x24
adds x4, x4, x24, lsr#63
eor x6, x6, x24
adcs x5, x5, xzr
eor x7, x7, x24
umulh x19, x4, x25
adcs x6, x6, xzr
umulh x20, x5, x25
adc x7, x7, xzr
umulh x21, x6, x25
and x24, x24, x25
umulh x22, x7, x25
neg x24, x24
mul x4, x4, x25
mul x5, x5, x25
mul x6, x6, x25
adds x5, x5, x19
mul x7, x7, x25
adcs x6, x6, x20
adcs x7, x7, x21
adc x22, x22, x24
ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|)
asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x10, x11, [x1,#8*2+32]
eor x25, x13, x24 // conditionally negate |f0| (or |g0|)
eor x8, x8, x24 // conditionally negate |a| (or |b|)
sub x25, x25, x24
eor x9, x9, x24
adds x8, x8, x24, lsr#63
eor x10, x10, x24
adcs x9, x9, xzr
eor x11, x11, x24
umulh x19, x8, x25
adcs x10, x10, xzr
umulh x20, x9, x25
adc x11, x11, xzr
umulh x21, x10, x25
and x24, x24, x25
umulh x23, x11, x25
neg x24, x24
mul x8, x8, x25
mul x9, x9, x25
mul x10, x10, x25
adds x9, x9, x19
mul x11, x11, x25
adcs x10, x10, x20
adcs x11, x11, x21
adc x23, x23, x24
adds x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
adcs x7, x7, x11
adc x8, x22, x23
extr x4, x5, x4, #31
extr x5, x6, x5, #31
extr x6, x7, x6, #31
asr x23, x8, #63 // result's sign as mask
extr x7, x8, x7, #31
eor x4, x4, x23 // ensure the result is positive
eor x5, x5, x23
adds x4, x4, x23, lsr#63
eor x6, x6, x23
adcs x5, x5, xzr
eor x7, x7, x23
adcs x6, x6, xzr
stp x4, x5, [x0,#8*0]
adc x7, x7, xzr
stp x6, x7, [x0,#8*2]
eor x12, x12, x23 // adjust |f/g| accordingly
eor x13, x13, x23
sub x12, x12, x23
sub x13, x13, x23
ret
.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
.type __ab_approximation_31_256, %function
.align 4
__ab_approximation_31_256:
ldp x6, x7, [x1,#8*2]
ldp x10, x11, [x1,#8*6]
ldp x4, x5, [x1,#8*0]
ldp x8, x9, [x1,#8*4]
.Lab_approximation_31_256_loaded:
orr x19, x7, x11 // check top-most limbs, ...
cmp x19, #0
csel x7, x7, x6, ne
csel x11, x11, x10, ne
csel x6, x6, x5, ne
orr x19, x7, x11 // and ones before top-most, ...
csel x10, x10, x9, ne
cmp x19, #0
csel x7, x7, x6, ne
csel x11, x11, x10, ne
csel x6, x6, x4, ne
orr x19, x7, x11 // and one more, ...
csel x10, x10, x8, ne
clz x19, x19
cmp x19, #64
csel x19, x19, xzr, ne
csel x7, x7, x6, ne
csel x11, x11, x10, ne
neg x20, x19
lslv x7, x7, x19 // align high limbs to the left
lslv x11, x11, x19
lsrv x6, x6, x20
lsrv x10, x10, x20
and x6, x6, x20, asr#6
and x10, x10, x20, asr#6
orr x7, x7, x6
orr x11, x11, x10
bfxil x7, x4, #0, #31
bfxil x11, x8, #0, #31
b __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
mov x2, #31
mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov x23,#0x7FFFFFFF7FFFFFFF
.Loop_31_256:
sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
and x19, x11, x22
sub x20, x11, x7 // |b_|-|a_|
subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x19, x15
csel x11, x11, x7, hs // |b_| = |a_|
csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x15, x15, x13, hs // exchange |fg0| and |fg1|
csel x13, x13, x19, hs
lsr x7, x7, #1
and x19, x15, x22
and x20, x23, x22
sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add x15, x15, x15 // |f1|<<=1
add x13, x13, x20
sub x15, x15, x23
cbnz x2, .Loop_31_256
mov x23, #0x7FFFFFFF
ubfx x12, x13, #0, #32
ubfx x13, x13, #32, #32
ubfx x14, x15, #0, #32
ubfx x15, x15, #32, #32
sub x12, x12, x23 // remove bias
sub x13, x13, x23
sub x14, x14, x23
sub x15, x15, x23
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
mov x12, #1 // |f0|=1
mov x13, #0 // |g0|=0
mov x14, #0 // |f1|=0
mov x15, #1 // |g1|=1
.Loop_62_256:
sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
and x19, x11, x22
sub x20, x11, x7 // |b_|-|a_|
subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x19, x12
csel x11, x11, x7, hs // |b_| = |a_|
csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
mov x20, x13
csel x12, x12, x14, hs // exchange |f0| and |f1|
csel x14, x14, x19, hs
csel x13, x13, x15, hs // exchange |g0| and |g1|
csel x15, x15, x20, hs
lsr x7, x7, #1
and x19, x14, x22
and x20, x15, x22
add x14, x14, x14 // |f1|<<=1
add x15, x15, x15 // |g1|<<=1
sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
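A hypothetical C rendering of one .Loop_62_256 round above, the branchless binary-GCD-style step at the heart of ct_inverse_mod_256. |a_| and |b_| hold the low limbs (|b_| odd), while f0, g0, f1, g1 accumulate the transformation the caller later applies to the full-width |u| and |v|; the coefficients live in plain 64-bit registers and are interpreted as two's-complement integers. The assembly does every select with csel; the if() below is for readability only, so this is not a constant-time substitute.

#include <stdint.h>

static void inner_loop_62_ref(uint64_t *a_, uint64_t *b_, unsigned n,
                              uint64_t *f0, uint64_t *g0,
                              uint64_t *f1, uint64_t *g1)
{
    uint64_t a = *a_, b = *b_;

    *f0 = 1; *g0 = 0;                             /* start from the identity   */
    *f1 = 0; *g1 = 1;

    while (n--) {
        uint64_t odd = (uint64_t)0 - (a & 1);     /* sbfx: all-ones iff a odd  */
        uint64_t t   = b & odd;                   /* subtrahend: b or 0        */
        uint64_t bma = b - a;
        uint64_t amt = a - t;
        int borrow   = a < t;                     /* only possible when a odd  */

        if (borrow) {                             /* a odd and a < b: swap roles */
            uint64_t tf = *f0, tg = *g0;
            b = a;  a = bma;                      /* (a, b) = (b - a, a)       */
            *f0 = *f1;  *g0 = *g1;
            *f1 = tf;   *g1 = tg;
        } else {
            a = amt;                              /* a -= b if odd, else a -= 0 */
        }
        a >>= 1;                                  /* a is even at this point   */
        *f0 -= *f1 & odd;                         /* track the subtraction ... */
        *g0 -= *g1 & odd;
        *f1 += *f1;                               /* ... and the halving of a  */
        *g1 += *g1;
    }
    *a_ = a; *b_ = b;
}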

File diff suppressed because it is too large.


@@ -0,0 +1,717 @@
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
.inst 0xd503233f
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #1040
ldp x22, x4, [x1,#8*0]
ldp x5, x6, [x1,#8*2]
ldp x7, x8, [x1,#8*4]
add x1, sp, #16+511 // find closest 512-byte-aligned spot
and x1, x1, #-512 // in the frame...
stp x0, x3, [sp]
ldp x9, x10, [x2,#8*0]
ldp x11, x12, [x2,#8*2]
ldp x13, x14, [x2,#8*4]
stp x22, x4, [x1,#8*0] // copy input to |a|
stp x5, x6, [x1,#8*2]
stp x7, x8, [x1,#8*4]
stp x9, x10, [x1,#8*6] // copy modulus to |b|
stp x11, x12, [x1,#8*8]
stp x13, x14, [x1,#8*10]
////////////////////////////////////////// first iteration
mov x2, #62
bl .Lab_approximation_62_loaded
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str x15,[x0,#8*12] // initialize |u| with |f0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str x15, [x0,#8*12] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
ldr x7, [x1,#8*12] // |u|
ldr x8, [x1,#8*18] // |v|
mul x3, x20, x7 // |u|*|f0|
smulh x4, x20, x7
mul x5, x21, x8 // |v|*|g0|
smulh x6, x21, x8
adds x3, x3, x5
adc x4, x4, x6
stp x3, x4, [x0,#8*6]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*8]
stp x5, x5, [x0,#8*10]
mul x3, x15, x7 // |u|*|f1|
smulh x4, x15, x7
mul x5, x16, x8 // |v|*|g1|
smulh x6, x16, x8
adds x3, x3, x5
adc x4, x4, x6
stp x3, x4, [x0,#8*12]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*14]
stp x5, x5, [x0,#8*16]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
asr x27, x27, #63 // sign extension
stp x27, x27, [x0,#8*6]
stp x27, x27, [x0,#8*8]
stp x27, x27, [x0,#8*10]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// iteration before last
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp x3, x8, [x1,#8*0] // just load
ldp x9, x14, [x1,#8*6]
bl __inner_loop_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
str x3, [x0,#8*0]
str x9, [x0,#8*6]
mov x20, x15 // exact |f0|
mov x21, x16 // exact |g0|
mov x15, x17
mov x16, x19
add x0, x0, #8*12 // pointer to dst |u|
bl __smul_383x63
mov x20, x15 // exact |f1|
mov x21, x16 // exact |g1|
add x0, x0, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr x3, [x1,#8*0] // just load
eor x8, x8, x8
ldr x9, [x1,#8*6]
eor x14, x14, x14
bl __inner_loop_62
mov x20, x17
mov x21, x19
ldp x0, x15, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
asr x22, x8, #63 // sign as mask
ldp x9, x10, [x15,#8*0]
ldp x11, x12, [x15,#8*2]
ldp x13, x14, [x15,#8*4]
and x9, x9, x22 // add mod<<384 conditionally
and x10, x10, x22
adds x3, x3, x9
and x11, x11, x22
adcs x4, x4, x10
and x12, x12, x22
adcs x5, x5, x11
and x13, x13, x22
adcs x6, x6, x12
and x14, x14, x22
stp x3, x4, [x0,#8*6]
adcs x7, x7, x13
stp x5, x6, [x0,#8*8]
adc x8, x8, x14
stp x7, x8, [x0,#8*10]
add sp, sp, #1040
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
.inst 0xd50323bf
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
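For orientation, the following variable-time Python sketch shows the relation a routine of this shape computes: a modular inverse via the binary extended GCD, maintaining u*x = a (mod p) and v*x = b (mod p) until b is the gcd. It is only a reference model; the assembly above additionally works in fixed 383-bit limbs, batches 62 bits at a time through __ab_approximation_62/__smul_383_n_shift_by_62, and is constant-time, none of which this sketch reproduces. The function name is illustrative, not part of the library API.

def modinv_binary(x, p):
    # Variable-time binary extended GCD (reference only, p odd):
    # invariants u*x = a (mod p) and v*x = b (mod p).
    a, b, u, v = x % p, p, 1, 0
    while a != 0:
        if a & 1 == 0:
            a >>= 1
            u = u >> 1 if u & 1 == 0 else (u + p) >> 1   # halve u mod p
        elif a >= b:
            a = (a - b) >> 1
            u -= v
            u = u >> 1 if u & 1 == 0 else (u + p) >> 1
        else:
            a, b, u, v = b, a, v, u      # swap so the odd a is the larger one
    assert b == 1, "input not invertible"
    return v % p                          # v*x = 1 (mod p)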
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
.type __smul_383x63, %function
.align 5
__smul_383x63:
ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|)
asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x5, x6, [x1,#8*2+96]
eor x20, x20, x17 // conditionally negate |f_| (or |g_|)
ldp x7, x8, [x1,#8*4+96]
eor x3, x3, x17 // conditionally negate |u| (or |v|)
sub x20, x20, x17
eor x4, x4, x17
adds x3, x3, x17, lsr#63
eor x5, x5, x17
adcs x4, x4, xzr
eor x6, x6, x17
adcs x5, x5, xzr
eor x7, x7, x17
adcs x6, x6, xzr
umulh x22, x3, x20
eor x8, x8, x17
umulh x23, x4, x20
adcs x7, x7, xzr
umulh x24, x5, x20
adcs x8, x8, xzr
umulh x25, x6, x20
umulh x26, x7, x20
mul x3, x3, x20
mul x4, x4, x20
mul x5, x5, x20
adds x4, x4, x22
mul x6, x6, x20
adcs x5, x5, x23
mul x7, x7, x20
adcs x6, x6, x24
mul x27,x8, x20
adcs x7, x7, x25
adcs x27,x27,x26
adc x2, xzr, xzr
ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|)
asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x11, x12, [x1,#8*2+144]
eor x21, x21, x17 // conditionally negate |f_| (or |g_|)
ldp x13, x14, [x1,#8*4+144]
eor x9, x9, x17 // conditionally negate |u| (or |v|)
sub x21, x21, x17
eor x10, x10, x17
adds x9, x9, x17, lsr#63
eor x11, x11, x17
adcs x10, x10, xzr
eor x12, x12, x17
adcs x11, x11, xzr
eor x13, x13, x17
adcs x12, x12, xzr
umulh x22, x9, x21
eor x14, x14, x17
umulh x23, x10, x21
adcs x13, x13, xzr
umulh x24, x11, x21
adcs x14, x14, xzr
umulh x25, x12, x21
adc x19, xzr, xzr // used in __smul_767x63_tail
umulh x26, x13, x21
mul x9, x9, x21
mul x10, x10, x21
mul x11, x11, x21
adds x10, x10, x22
mul x12, x12, x21
adcs x11, x11, x23
mul x13, x13, x21
adcs x12, x12, x24
mul x28,x14, x21
adcs x13, x13, x25
adcs x28,x28,x26
adc x2, x2, xzr
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
stp x3, x4, [x0,#8*0]
adcs x7, x7, x13
stp x5, x6, [x0,#8*2]
adcs x27, x27, x28
stp x7, x27, [x0,#8*4]
adc x28, x2, xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh x27, x8, x20
ldp x3, x4, [x1,#8*24] // load rest of |v|
umulh x14,x14, x21
ldp x5, x6, [x1,#8*26]
ldp x7, x8, [x1,#8*28]
eor x3, x3, x17 // conditionally negate rest of |v|
eor x4, x4, x17
eor x5, x5, x17
adds x3, x3, x19
eor x6, x6, x17
adcs x4, x4, xzr
eor x7, x7, x17
adcs x5, x5, xzr
eor x8, x8, x17
adcs x6, x6, xzr
umulh x22, x3, x21
adcs x7, x7, xzr
umulh x23, x4, x21
adc x8, x8, xzr
umulh x24, x5, x21
add x14, x14, x28
umulh x25, x6, x21
asr x28, x27, #63
umulh x26, x7, x21
mul x3, x3, x21
mul x4, x4, x21
mul x5, x5, x21
adds x3, x3, x14
mul x6, x6, x21
adcs x4, x4, x22
mul x7, x7, x21
adcs x5, x5, x23
mul x8, x8, x21
adcs x6, x6, x24
adcs x7, x7, x25
adc x8, x8, x26
adds x3, x3, x27
adcs x4, x4, x28
adcs x5, x5, x28
adcs x6, x6, x28
stp x3, x4, [x0,#8*6]
adcs x7, x7, x28
stp x5, x6, [x0,#8*8]
adc x8, x8, x28
stp x7, x8, [x0,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|)
asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x5, x6, [x1,#8*2+0]
eor x2, x15, x28 // conditionally negate |f0| (or |g0|)
ldp x7, x8, [x1,#8*4+0]
eor x3, x3, x28 // conditionally negate |a| (or |b|)
sub x2, x2, x28
eor x4, x4, x28
adds x3, x3, x28, lsr#63
eor x5, x5, x28
adcs x4, x4, xzr
eor x6, x6, x28
adcs x5, x5, xzr
eor x7, x7, x28
umulh x22, x3, x2
adcs x6, x6, xzr
umulh x23, x4, x2
eor x8, x8, x28
umulh x24, x5, x2
adcs x7, x7, xzr
umulh x25, x6, x2
adc x8, x8, xzr
umulh x26, x7, x2
smulh x27, x8, x2
mul x3, x3, x2
mul x4, x4, x2
mul x5, x5, x2
adds x4, x4, x22
mul x6, x6, x2
adcs x5, x5, x23
mul x7, x7, x2
adcs x6, x6, x24
mul x8, x8, x2
adcs x7, x7, x25
adcs x8, x8 ,x26
adc x27, x27, xzr
ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|)
asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x11, x12, [x1,#8*2+48]
eor x2, x16, x28 // conditionally negate |f0| (or |g0|)
ldp x13, x14, [x1,#8*4+48]
eor x9, x9, x28 // conditionally negate |a| (or |b|)
sub x2, x2, x28
eor x10, x10, x28
adds x9, x9, x28, lsr#63
eor x11, x11, x28
adcs x10, x10, xzr
eor x12, x12, x28
adcs x11, x11, xzr
eor x13, x13, x28
umulh x22, x9, x2
adcs x12, x12, xzr
umulh x23, x10, x2
eor x14, x14, x28
umulh x24, x11, x2
adcs x13, x13, xzr
umulh x25, x12, x2
adc x14, x14, xzr
umulh x26, x13, x2
smulh x28, x14, x2
mul x9, x9, x2
mul x10, x10, x2
mul x11, x11, x2
adds x10, x10, x22
mul x12, x12, x2
adcs x11, x11, x23
mul x13, x13, x2
adcs x12, x12, x24
mul x14, x14, x2
adcs x13, x13, x25
adcs x14, x14 ,x26
adc x28, x28, xzr
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
adcs x7, x7, x13
adcs x8, x8, x14
adc x9, x27, x28
extr x3, x4, x3, #62
extr x4, x5, x4, #62
extr x5, x6, x5, #62
asr x28, x9, #63
extr x6, x7, x6, #62
extr x7, x8, x7, #62
extr x8, x9, x8, #62
eor x3, x3, x28
eor x4, x4, x28
adds x3, x3, x28, lsr#63
eor x5, x5, x28
adcs x4, x4, xzr
eor x6, x6, x28
adcs x5, x5, xzr
eor x7, x7, x28
adcs x6, x6, xzr
eor x8, x8, x28
stp x3, x4, [x0,#8*0]
adcs x7, x7, xzr
stp x5, x6, [x0,#8*2]
adc x8, x8, xzr
stp x7, x8, [x0,#8*4]
eor x15, x15, x28
eor x16, x16, x28
sub x15, x15, x28
sub x16, x16, x28
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
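At the integer level, __smul_383_n_shift_by_62 computes (f0*a + g0*b) >> 62 and hands back factors whose signs match a non-negative result; the closing eor/sub on x15/x16 flips them when the shifted value had to be negated. A hedged sketch with arbitrary-precision integers (names illustrative):

def smul_n_shift(a, b, f0, g0, k=62):
    # The inner loop arranges for f0*a + g0*b to be divisible by 2^k,
    # so the shift below is exact.
    t = (f0 * a + g0 * b) >> k
    if t < 0:                     # normalize sign, flip the factors to match
        t, f0, g0 = -t, -f0, -g0
    return t, f0, g0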
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp x7, x8, [x1,#8*4]
ldp x13, x14, [x1,#8*10]
ldp x5, x6, [x1,#8*2]
ldp x11, x12, [x1,#8*8]
.Lab_approximation_62_loaded:
orr x22, x8, x14 // check top-most limbs, ...
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x6, ne
orr x22, x8, x14 // ... ones before top-most, ...
csel x13, x13, x12, ne
ldp x3, x4, [x1,#8*0]
ldp x9, x10, [x1,#8*6]
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x5, ne
orr x22, x8, x14 // ... and ones before that ...
csel x13, x13, x11, ne
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x4, ne
orr x22, x8, x14
csel x13, x13, x10, ne
clz x22, x22
cmp x22, #64
csel x22, x22, xzr, ne
csel x8, x8, x7, ne
csel x14, x14, x13, ne
neg x23, x22
lslv x8, x8, x22 // align high limbs to the left
lslv x14, x14, x22
lsrv x7, x7, x23
lsrv x13, x13, x23
and x7, x7, x23, asr#6
and x13, x13, x23, asr#6
orr x8, x8, x7
orr x14, x14, x13
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
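__ab_approximation_62 builds 128-bit stand-ins for |a| and |b|: the exact low limb plus one shared top word taken at the bit position of the most significant set bit of the larger operand (that is what the clz/lslv/lsrv alignment does). A sketch on arbitrary-precision integers, under that reading:

def ab_approximation(a, b, w=64):
    # Exact low word, plus a shared top word aligned to max(a, b).
    n = max(a.bit_length(), b.bit_length())
    if n <= 2 * w:
        return a, b                    # small enough to use exactly
    s = n - w                          # bit offset of the shared top word
    lo = (1 << w) - 1
    return ((a >> s) << w) | (a & lo), ((b >> s) << w) | (b & lo)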
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov x15, #1 // |f0|=1
mov x16, #0 // |g0|=0
mov x17, #0 // |f1|=0
mov x19, #1 // |g1|=1
.Loop_62:
sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
subs x24, x9, x3 // |b_|-|a_|
and x22, x9, x28
sbc x25, x14, x8
and x23, x14, x28
subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x22, x15
sbcs x27, x8, x23
mov x23, x16
csel x9, x9, x3, hs // |b_| = |a_|
csel x14, x14, x8, hs
csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x8, x27, x25, hs
csel x15, x15, x17, hs // exchange |f0| and |f1|
csel x17, x17, x22, hs
csel x16, x16, x19, hs // exchange |g0| and |g1|
csel x19, x19, x23, hs
extr x3, x8, x3, #1
lsr x8, x8, #1
and x22, x17, x28
and x23, x19, x28
add x17, x17, x17 // |f1|<<=1
add x19, x19, x19 // |g1|<<=1
sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
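The register comments in __inner_loop_62 describe the classic binary-GCD transition step; the csel chains just make it branchless. A variable-time Python model of one n-step batch: when run on the exact |a| and |b|, the factors satisfy f0*a0 + g0*b0 = a*2^n and f1*a0 + g1*b0 = b*2^n afterwards, which is what lets the caller shift the updated values right by 62.

def inner_loop(a, b, n=62):
    f0, g0, f1, g1 = 1, 0, 0, 1
    for _ in range(n):
        if a & 1:                      # if |a_| is odd, we'll be subtracting
            if a < b:                  # borrow means |a_| < |b_|
                a, b = b, a
                f0, f1 = f1, f0        # exchange |f0| and |f1|
                g0, g1 = g1, g0        # exchange |g0| and |g1|
            a -= b
            f0 -= f1                   # |f0| -= |f1|
            g0 -= g1                   # |g0| -= |g1|
        a >>= 1
        f1 += f1                       # |f1| <<= 1
        g1 += g1                       # |g1| <<= 1
    return f0, g0, f1, g1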

View file

@@ -0,0 +1,324 @@
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384, %function
.align 5
ct_is_square_mod_384:
.inst 0xd503233f
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #512
ldp x3, x4, [x0,#8*0] // load input
ldp x5, x6, [x0,#8*2]
ldp x7, x8, [x0,#8*4]
add x0, sp, #255 // find closest 256-byte-aligned spot
and x0, x0, #-256 // in the frame...
ldp x9, x10, [x1,#8*0] // load modulus
ldp x11, x12, [x1,#8*2]
ldp x13, x14, [x1,#8*4]
stp x3, x4, [x0,#8*6] // copy input to |a|
stp x5, x6, [x0,#8*8]
stp x7, x8, [x0,#8*10]
stp x9, x10, [x0,#8*0] // copy modulus to |b|
stp x11, x12, [x0,#8*2]
stp x13, x14, [x0,#8*4]
eor x2, x2, x2 // init the Legendre symbol
mov x15, #24 // 24 is 768/30-1
b .Loop_is_square
.align 4
.Loop_is_square:
bl __ab_approximation_30
sub x15, x15, #1
eor x1, x0, #128 // pointer to dst |b|
bl __smul_384_n_shift_by_30
mov x19, x16 // |f0|
mov x20, x17 // |g0|
add x1, x1, #8*6 // pointer to dst |a|
bl __smul_384_n_shift_by_30
ldp x9, x10, [x1,#-8*6]
eor x0, x0, #128 // flip-flop src |a|b|
and x27, x27, x9 // if |a| was negative,
add x2, x2, x27, lsr#1 // adjust |L|
cbnz x15, .Loop_is_square
////////////////////////////////////////// last iteration
//bl __ab_approximation_30 // |a| and |b| are exact,
//ldr x8, [x0,#8*6] // just load
mov x14, x9 // ldr x14, [x0,#8*0]
mov x15, #48 // 48 is 768%30 + 30
bl __inner_loop_48
ldr x30, [x29,#8]
and x0, x2, #1
eor x0, x0, #1
add sp, sp, #512
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
.inst 0xd50323bf
ret
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smul_384_n_shift_by_30, %function
.align 5
__smul_384_n_shift_by_30:
ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|)
asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s)
ldp x5, x6, [x0,#8*2+0]
eor x20, x20, x27 // conditionally negate |g1| (or |f1|)
ldp x7, x8, [x0,#8*4+0]
eor x3, x3, x27 // conditionally negate |b| (or |a|)
sub x20, x20, x27
eor x4, x4, x27
adds x3, x3, x27, lsr#63
eor x5, x5, x27
adcs x4, x4, xzr
eor x6, x6, x27
adcs x5, x5, xzr
eor x7, x7, x27
umulh x21, x3, x20
adcs x6, x6, xzr
umulh x22, x4, x20
eor x8, x8, x27
umulh x23, x5, x20
adcs x7, x7, xzr
umulh x24, x6, x20
adc x8, x8, xzr
umulh x25, x7, x20
and x28, x20, x27
umulh x26, x8, x20
neg x28, x28
mul x3, x3, x20
mul x4, x4, x20
mul x5, x5, x20
adds x4, x4, x21
mul x6, x6, x20
adcs x5, x5, x22
mul x7, x7, x20
adcs x6, x6, x23
mul x8, x8, x20
adcs x7, x7, x24
adcs x8, x8 ,x25
adc x26, x26, x28
ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|)
asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s)
ldp x11, x12, [x0,#8*2+48]
eor x19, x19, x27 // conditionally negate |g1| (or |f1|)
ldp x13, x14, [x0,#8*4+48]
eor x9, x9, x27 // conditionally negate |b| (or |a|)
sub x19, x19, x27
eor x10, x10, x27
adds x9, x9, x27, lsr#63
eor x11, x11, x27
adcs x10, x10, xzr
eor x12, x12, x27
adcs x11, x11, xzr
eor x13, x13, x27
umulh x21, x9, x19
adcs x12, x12, xzr
umulh x22, x10, x19
eor x14, x14, x27
umulh x23, x11, x19
adcs x13, x13, xzr
umulh x24, x12, x19
adc x14, x14, xzr
umulh x25, x13, x19
and x28, x19, x27
umulh x27, x14, x19
neg x28, x28
mul x9, x9, x19
mul x10, x10, x19
mul x11, x11, x19
adds x10, x10, x21
mul x12, x12, x19
adcs x11, x11, x22
mul x13, x13, x19
adcs x12, x12, x23
mul x14, x14, x19
adcs x13, x13, x24
adcs x14, x14 ,x25
adc x27, x27, x28
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
adcs x7, x7, x13
adcs x8, x8, x14
adc x9, x26, x27
extr x3, x4, x3, #30
extr x4, x5, x4, #30
extr x5, x6, x5, #30
asr x27, x9, #63
extr x6, x7, x6, #30
extr x7, x8, x7, #30
extr x8, x9, x8, #30
eor x3, x3, x27
eor x4, x4, x27
adds x3, x3, x27, lsr#63
eor x5, x5, x27
adcs x4, x4, xzr
eor x6, x6, x27
adcs x5, x5, xzr
eor x7, x7, x27
adcs x6, x6, xzr
eor x8, x8, x27
stp x3, x4, [x1,#8*0]
adcs x7, x7, xzr
stp x5, x6, [x1,#8*2]
adc x8, x8, xzr
stp x7, x8, [x1,#8*4]
ret
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
.type __ab_approximation_30, %function
.align 4
__ab_approximation_30:
ldp x13, x14, [x0,#8*4] // |a| is still in registers
ldp x11, x12, [x0,#8*2]
orr x21, x8, x14 // check top-most limbs, ...
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x6, ne
orr x21, x8, x14 // ... ones before top-most, ...
csel x13, x13, x12, ne
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x5, ne
orr x21, x8, x14 // ... and ones before that ...
csel x13, x13, x11, ne
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x4, ne
orr x21, x8, x14 // and one more, ...
csel x13, x13, x10, ne
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x3, ne
orr x21, x8, x14
csel x13, x13, x9, ne
clz x21, x21
cmp x21, #64
csel x21, x21, xzr, ne
csel x8, x8, x7, ne
csel x14, x14, x13, ne
neg x22, x21
lslv x8, x8, x21 // align high limbs to the left
lslv x14, x14, x21
lsrv x7, x7, x22
lsrv x13, x13, x22
and x7, x7, x22, asr#6
and x13, x13, x22, asr#6
orr x8, x8, x7
orr x14, x14, x13
bfxil x8, x3, #0, #32
bfxil x14, x9, #0, #32
b __inner_loop_30
ret
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30, %function
.align 4
__inner_loop_30:
mov x28, #30
mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov x27,#0x7FFFFFFF7FFFFFFF
.Loop_30:
sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting
and x25, x8, x14
sub x28, x28, #1
and x21, x14, x24
sub x22, x14, x8 // |b_|-|a_|
subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even)
add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1
mov x21, x20
csel x14, x14, x8, hs // |b_| = |a_|
csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x20, x20, x17, hs // exchange |fg0| and |fg1|
csel x17, x17, x21, hs
csel x2, x2, x25, hs
lsr x8, x8, #1
and x21, x20, x24
and x22, x27, x24
add x23, x14, #2
sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add x20, x20, x20 // |f1|<<=1
add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5
add x17, x17, x22
sub x20, x20, x27
cbnz x28, .Loop_30
mov x27, #0x7FFFFFFF
ubfx x16, x17, #0, #32
ubfx x17, x17, #32, #32
ubfx x19, x20, #0, #32
ubfx x20, x20, #32, #32
sub x16, x16, x27 // remove the bias
sub x17, x17, x27
sub x19, x19, x27
sub x20, x20, x27
ret
.size __inner_loop_30,.-__inner_loop_30
.type __inner_loop_48, %function
.align 4
__inner_loop_48:
.Loop_48:
sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting
and x25, x8, x14
sub x15, x15, #1
and x21, x14, x24
sub x22, x14, x8 // |b_|-|a_|
subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even)
add x25, x2, x25, lsr#1
csel x14, x14, x8, hs // |b_| = |a_|
csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x2, x2, x25, hs
add x23, x14, #2
lsr x8, x8, #1
add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5
cbnz x15, .Loop_48
ret
.size __inner_loop_48,.-__inner_loop_48
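The |L| updates above — "L + (a_ & b_) >> 1" and '"negate" |L| if |b|%8 is 3 or 5' — are the standard quadratic-reciprocity bookkeeping, so the routine is effectively evaluating a Jacobi symbol. A variable-time Python reference for the same decision (for the odd prime moduli this is used with, a symbol of +1 means the input is a square; this is not the library's constant-time packed-counter formulation):

def is_square_mod(a, m):
    # Jacobi symbol (a/m) for odd m > 0; True iff it evaluates to +1.
    a %= m
    t = 1
    while a:
        while a & 1 == 0:
            a >>= 1
            if m % 8 in (3, 5):        # (2/m) = -1 when m = +-3 mod 8
                t = -t
        a, m = m, a                    # reciprocity: swap the arguments
        if a % 4 == 3 and m % 4 == 3:  # both = 3 mod 4 flips the sign
            t = -t
        a %= m
    return m == 1 and t == 1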

View file

@@ -0,0 +1,479 @@
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384,@function
.align 32
ct_is_square_mod_384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $536,%rsp
.cfi_adjust_cfa_offset 536
leaq 24+255(%rsp),%rax
andq $-256,%rax
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq 40(%rdi),%r13
movq 0(%rsi),%r14
movq 8(%rsi),%r15
movq 16(%rsi),%rbx
movq 24(%rsi),%rcx
movq 32(%rsi),%rdx
movq 40(%rsi),%rdi
movq %rax,%rsi
movq %r8,0(%rax)
movq %r9,8(%rax)
movq %r10,16(%rax)
movq %r11,24(%rax)
movq %r12,32(%rax)
movq %r13,40(%rax)
movq %r14,48(%rax)
movq %r15,56(%rax)
movq %rbx,64(%rax)
movq %rcx,72(%rax)
movq %rdx,80(%rax)
movq %rdi,88(%rax)
xorq %rbp,%rbp
movl $24,%ecx
jmp .Loop_is_square
.align 32
.Loop_is_square:
movl %ecx,16(%rsp)
call __ab_approximation_30
movq %rax,0(%rsp)
movq %rbx,8(%rsp)
movq $128+48,%rdi
xorq %rsi,%rdi
call __smulq_384_n_shift_by_30
movq 0(%rsp),%rdx
movq 8(%rsp),%rcx
leaq -48(%rdi),%rdi
call __smulq_384_n_shift_by_30
movl 16(%rsp),%ecx
xorq $128,%rsi
andq 48(%rdi),%r14
shrq $1,%r14
addq %r14,%rbp
subl $1,%ecx
jnz .Loop_is_square
movq 48(%rsi),%r9
call __inner_loop_48
movq $1,%rax
andq %rbp,%rax
xorq $1,%rax
leaq 536(%rsp),%r8
movq 0(%r8),%r15
.cfi_restore %r15
movq 8(%r8),%r14
.cfi_restore %r14
movq 16(%r8),%r13
.cfi_restore %r13
movq 24(%r8),%r12
.cfi_restore %r12
movq 32(%r8),%rbx
.cfi_restore %rbx
movq 40(%r8),%rbp
.cfi_restore %rbp
leaq 48(%r8),%rsp
.cfi_adjust_cfa_offset -536-8*6
.byte 0xf3,0xc3
.cfi_endproc
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smulq_384_n_shift_by_30,@function
.align 32
__smulq_384_n_shift_by_30:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq %rdx,%rbx
sarq $63,%rdx
xorq %rax,%rax
subq %rdx,%rax
xorq %rdx,%rbx
addq %rax,%rbx
xorq %rdx,%r8
xorq %rdx,%r9
xorq %rdx,%r10
xorq %rdx,%r11
xorq %rdx,%r12
xorq %rdx,%r13
addq %r8,%rax
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
movq %rdx,%r14
andq %rbx,%r14
mulq %rbx
movq %rax,%r8
movq %r9,%rax
movq %rdx,%r9
mulq %rbx
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbx
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbx
addq %rax,%r11
movq %r12,%rax
adcq $0,%rdx
movq %rdx,%r12
mulq %rbx
addq %rax,%r12
movq %r13,%rax
adcq $0,%rdx
movq %rdx,%r13
negq %r14
mulq %rbx
addq %rax,%r13
adcq %rdx,%r14
leaq 48(%rsi),%rsi
movq %rcx,%rdx
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq %rdx,%rbx
sarq $63,%rdx
xorq %rax,%rax
subq %rdx,%rax
xorq %rdx,%rbx
addq %rax,%rbx
xorq %rdx,%r8
xorq %rdx,%r9
xorq %rdx,%r10
xorq %rdx,%r11
xorq %rdx,%r12
xorq %rdx,%r13
addq %r8,%rax
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
movq %rdx,%r15
andq %rbx,%r15
mulq %rbx
movq %rax,%r8
movq %r9,%rax
movq %rdx,%r9
mulq %rbx
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbx
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbx
addq %rax,%r11
movq %r12,%rax
adcq $0,%rdx
movq %rdx,%r12
mulq %rbx
addq %rax,%r12
movq %r13,%rax
adcq $0,%rdx
movq %rdx,%r13
negq %r15
mulq %rbx
addq %rax,%r13
adcq %rdx,%r15
leaq -48(%rsi),%rsi
addq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq %r15,%r14
shrdq $30,%r9,%r8
shrdq $30,%r10,%r9
shrdq $30,%r11,%r10
shrdq $30,%r12,%r11
shrdq $30,%r13,%r12
shrdq $30,%r14,%r13
sarq $63,%r14
xorq %rbx,%rbx
subq %r14,%rbx
xorq %r14,%r8
xorq %r14,%r9
xorq %r14,%r10
xorq %r14,%r11
xorq %r14,%r12
xorq %r14,%r13
addq %rbx,%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
.type __ab_approximation_30,@function
.align 32
__ab_approximation_30:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 88(%rsi),%rbx
movq 80(%rsi),%r15
movq 72(%rsi),%r14
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r11,%r12
movq 64(%rsi),%r11
cmovzq %r14,%r15
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r10,%r12
movq 56(%rsi),%r10
cmovzq %r11,%r15
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r9,%r12
movq 48(%rsi),%r9
cmovzq %r10,%r15
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r8,%r12
cmovzq %r9,%r15
movq %r13,%rax
orq %rbx,%rax
bsrq %rax,%rcx
leaq 1(%rcx),%rcx
cmovzq %r8,%r13
cmovzq %r9,%rbx
cmovzq %rax,%rcx
negq %rcx
shldq %cl,%r12,%r13
shldq %cl,%r15,%rbx
movq $0xFFFFFFFF00000000,%rax
movl %r8d,%r8d
movl %r9d,%r9d
andq %rax,%r13
andq %rax,%rbx
orq %r13,%r8
orq %rbx,%r9
jmp __inner_loop_30
.byte 0xf3,0xc3
.cfi_endproc
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30,@function
.align 32
__inner_loop_30:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq $0x7FFFFFFF80000000,%rbx
movq $0x800000007FFFFFFF,%rcx
leaq -1(%rbx),%r15
movl $30,%edi
.Loop_30:
movq %r8,%rax
andq %r9,%rax
shrq $1,%rax
cmpq %r9,%r8
movq %r8,%r10
movq %r9,%r11
leaq (%rax,%rbp,1),%rax
movq %rbx,%r12
movq %rcx,%r13
movq %rbp,%r14
cmovbq %r9,%r8
cmovbq %r10,%r9
cmovbq %rcx,%rbx
cmovbq %r12,%rcx
cmovbq %rax,%rbp
subq %r9,%r8
subq %rcx,%rbx
addq %r15,%rbx
testq $1,%r10
cmovzq %r10,%r8
cmovzq %r11,%r9
cmovzq %r12,%rbx
cmovzq %r13,%rcx
cmovzq %r14,%rbp
leaq 2(%r9),%rax
shrq $1,%r8
shrq $2,%rax
addq %rcx,%rcx
leaq (%rax,%rbp,1),%rbp
subq %r15,%rcx
subl $1,%edi
jnz .Loop_30
shrq $32,%r15
movl %ebx,%eax
shrq $32,%rbx
movl %ecx,%edx
shrq $32,%rcx
subq %r15,%rax
subq %r15,%rbx
subq %r15,%rdx
subq %r15,%rcx
.byte 0xf3,0xc3
.cfi_endproc
.size __inner_loop_30,.-__inner_loop_30
.type __inner_loop_48,@function
.align 32
__inner_loop_48:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movl $48,%edi
.Loop_48:
movq %r8,%rax
andq %r9,%rax
shrq $1,%rax
cmpq %r9,%r8
movq %r8,%r10
movq %r9,%r11
leaq (%rax,%rbp,1),%rax
movq %rbp,%r12
cmovbq %r9,%r8
cmovbq %r10,%r9
cmovbq %rax,%rbp
subq %r9,%r8
testq $1,%r10
cmovzq %r10,%r8
cmovzq %r11,%r9
cmovzq %r12,%rbp
leaq 2(%r9),%rax
shrq $1,%r8
shrq $2,%rax
addq %rax,%rbp
subl $1,%edi
jnz .Loop_48
.byte 0xf3,0xc3
.cfi_endproc
.size __inner_loop_48,.-__inner_loop_48
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

File diff suppressed because it is too large

File diff suppressed because it is too large

88
blst/elf/div3w-armv8.S Normal file
View file

@@ -0,0 +1,88 @@
.text
.globl div_3_limbs
.type div_3_limbs,%function
.align 5
div_3_limbs:
ldp x4,x5,[x0] // load R
eor x0,x0,x0 // Q = 0
mov x3,#64 // loop counter
nop
.Loop:
subs x6,x4,x1 // R - D
add x0,x0,x0 // Q <<= 1
sbcs x7,x5,x2
add x0,x0,#1 // Q + speculative bit
csel x4,x4,x6,lo // select between R and R - D
extr x1,x2,x1,#1 // D >>= 1
csel x5,x5,x7,lo
lsr x2,x2,#1
sbc x0,x0,xzr // subtract speculative bit
sub x3,x3,#1
cbnz x3,.Loop
asr x3,x0,#63 // top bit -> mask
add x0,x0,x0 // Q <<= 1
subs x6,x4,x1 // R - D
add x0,x0,#1 // Q + speculative bit
sbcs x7,x5,x2
sbc x0,x0,xzr // subtract speculative bit
orr x0,x0,x3 // all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
.globl quot_rem_128
.type quot_rem_128,%function
.align 5
quot_rem_128:
ldp x3,x4,[x1]
mul x5,x3,x2 // divisor[0:1] * quotient
umulh x6,x3,x2
mul x11, x4,x2
umulh x7,x4,x2
ldp x8,x9,[x0] // load 3 limbs of the dividend
ldr x10,[x0,#16]
adds x6,x6,x11
adc x7,x7,xzr
subs x8,x8,x5 // dividend - divisor * quotient
sbcs x9,x9,x6
sbcs x10,x10,x7
sbc x5,xzr,xzr // borrow -> mask
add x2,x2,x5 // if borrowed, adjust the quotient ...
and x3,x3,x5
and x4,x4,x5
adds x8,x8,x3 // ... and add divisor
adc x9,x9,x4
stp x8,x9,[x0] // save 2 limbs of the remainder
str x2,[x0,#16] // and one limb of the quotient
mov x0,x2 // return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
.globl quot_rem_64
.type quot_rem_64,%function
.align 5
quot_rem_64:
ldr x3,[x1]
ldr x8,[x0] // load 1 limb of the dividend
mul x5,x3,x2 // divisor * quotient
sub x8,x8,x5 // dividend - divisor * quotient
stp x8,x2,[x0] // save remainder and quotient
mov x0,x2 // return quotient
ret
.size quot_rem_64,.-quot_rem_64
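div_3_limbs produces a one-limb quotient estimate by restoring division against a divisor that is shifted right each step, and quot_rem_128 corrects it: multiply back, subtract from the dividend, and on borrow add the divisor back once while decrementing the quotient. A Python sketch of that correction, assuming (as the single add-back implies) the estimate is at most one too large:

def quot_rem_128_model(dividend, divisor, q_est):
    # dividend: 3 limbs, divisor: 2 limbs, q_est: 1-limb estimate.
    r = dividend - divisor * q_est
    if r < 0:                # borrow: estimate was one too large
        q_est -= 1
        r += divisor         # ... and add divisor (cf. the masked adds above)
    return r, q_est          # remainder and adjusted quotient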

123
blst/elf/div3w-x86_64.s Normal file
View file

@@ -0,0 +1,123 @@
.text
.globl div_3_limbs
.hidden div_3_limbs
.type div_3_limbs,@function
.align 32
div_3_limbs:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq (%rdi),%r8
movq 8(%rdi),%r9
xorq %rax,%rax
movl $64,%ecx
.Loop:
movq %r8,%r10
subq %rsi,%r8
movq %r9,%r11
sbbq %rdx,%r9
leaq 1(%rax,%rax,1),%rax
movq %rdx,%rdi
cmovcq %r10,%r8
cmovcq %r11,%r9
sbbq $0,%rax
shlq $63,%rdi
shrq $1,%rsi
shrq $1,%rdx
orq %rdi,%rsi
subl $1,%ecx
jnz .Loop
leaq 1(%rax,%rax,1),%rcx
sarq $63,%rax
subq %rsi,%r8
sbbq %rdx,%r9
sbbq $0,%rcx
orq %rcx,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size div_3_limbs,.-div_3_limbs
.globl quot_rem_128
.hidden quot_rem_128
.type quot_rem_128,@function
.align 32
quot_rem_128:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq %rdx,%rax
movq %rdx,%rcx
mulq 0(%rsi)
movq %rax,%r8
movq %rcx,%rax
movq %rdx,%r9
mulq 8(%rsi)
addq %rax,%r9
adcq $0,%rdx
movq 0(%rdi),%r10
movq 8(%rdi),%r11
movq 16(%rdi),%rax
subq %r8,%r10
sbbq %r9,%r11
sbbq %rdx,%rax
sbbq %r8,%r8
addq %r8,%rcx
movq %r8,%r9
andq 0(%rsi),%r8
andq 8(%rsi),%r9
addq %r8,%r10
adcq %r9,%r11
movq %r10,0(%rdi)
movq %r11,8(%rdi)
movq %rcx,16(%rdi)
movq %rcx,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size quot_rem_128,.-quot_rem_128
.globl quot_rem_64
.hidden quot_rem_64
.type quot_rem_64,@function
.align 32
quot_rem_64:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq %rdx,%rax
imulq 0(%rsi),%rdx
movq 0(%rdi),%r10
subq %rdx,%r10
movq %r10,0(%rdi)
movq %rax,8(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size quot_rem_64,.-quot_rem_64
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

View file

@@ -0,0 +1,464 @@
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,%function
.align 5
mul_mont_sparse_256:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldp x10,x11,[x1]
ldr x9, [x2]
ldp x12,x13,[x1,#16]
mul x19,x10,x9
ldp x5,x6,[x3]
mul x20,x11,x9
ldp x7,x8,[x3,#16]
mul x21,x12,x9
mul x22,x13,x9
umulh x14,x10,x9
umulh x15,x11,x9
mul x3,x4,x19
umulh x16,x12,x9
umulh x17,x13,x9
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,xzr, x17
mul x17,x8,x3
ldr x9,[x2,8*1]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
ldr x9,[x2,8*2]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
ldr x9,[x2,8*3]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
adcs x20,x21,x15
adcs x21,x22,x16
adcs x22,x23,x17
adc x23,xzr,xzr
subs x14,x19,x5
sbcs x15,x20,x6
sbcs x16,x21,x7
sbcs x17,x22,x8
sbcs xzr, x23,xzr
csel x19,x19,x14,lo
csel x20,x20,x15,lo
csel x21,x21,x16,lo
csel x22,x22,x17,lo
stp x19,x20,[x0]
stp x21,x22,[x0,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.size mul_mont_sparse_256,.-mul_mont_sparse_256
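mul_mont_sparse_256 follows the operand-scanning Montgomery multiplication pattern: for each word of b it accumulates a*b[i], derives m = n0*acc0 (the "mul x3,x4,x19" lines), and folds in m*p so the low word cancels — which is why the low-limb product is commented out and replaced by the "subs xzr,x19,#1" carry trick. A word-level Python sketch using the textbook convention n0 = -p^-1 mod 2^64 (the library's own constant may use a different sign convention; this is a model, not its API):

def mont_mul(a, b, p, n0, w=64, limbs=4):
    # Returns a * b * 2^(-w*limbs) mod p, assuming a, b < p < 2^(w*limbs)
    # and n0 = -p^(-1) mod 2^w.
    mask = (1 << w) - 1
    acc = 0
    for i in range(limbs):
        acc += a * ((b >> (w * i)) & mask)   # acc += a * b[i]
        m = (acc * n0) & mask                # chosen so the low word cancels
        acc = (acc + m * p) >> w             # exact shift: low word is zero
    return acc - p if acc >= p else acc      # final conditional subtraction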
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,%function
.align 5
sqr_mont_sparse_256:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x5,x6,[x1]
ldp x7,x8,[x1,#16]
mov x4,x3
////////////////////////////////////////////////////////////////
// | | | | | |a1*a0| |
// | | | | |a2*a0| | |
// | |a3*a2|a3*a0| | | |
// | | | |a2*a1| | | |
// | | |a3*a1| | | | |
// *| | | | | | | | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
// |--+--+--+--+--+--+--+--|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10
//
// "can't overflow" below mark carrying into high part of
// multiplication result, which can't overflow, because it
// can never be all ones.
mul x11,x6,x5 // a[1]*a[0]
umulh x15,x6,x5
mul x12,x7,x5 // a[2]*a[0]
umulh x16,x7,x5
mul x13,x8,x5 // a[3]*a[0]
umulh x19,x8,x5
adds x12,x12,x15 // accumulate high parts of multiplication
mul x14,x7,x6 // a[2]*a[1]
umulh x15,x7,x6
adcs x13,x13,x16
mul x16,x8,x6 // a[3]*a[1]
umulh x17,x8,x6
adc x19,x19,xzr // can't overflow
mul x20,x8,x7 // a[3]*a[2]
umulh x21,x8,x7
adds x15,x15,x16 // accumulate high parts of multiplication
mul x10,x5,x5 // a[0]*a[0]
adc x16,x17,xzr // can't overflow
adds x13,x13,x14 // accumulate low parts of multiplication
umulh x5,x5,x5
adcs x19,x19,x15
mul x15,x6,x6 // a[1]*a[1]
adcs x20,x20,x16
umulh x6,x6,x6
adc x21,x21,xzr // can't overflow
adds x11,x11,x11 // acc[1-6]*=2
mul x16,x7,x7 // a[2]*a[2]
adcs x12,x12,x12
umulh x7,x7,x7
adcs x13,x13,x13
mul x17,x8,x8 // a[3]*a[3]
adcs x19,x19,x19
umulh x8,x8,x8
adcs x20,x20,x20
adcs x21,x21,x21
adc x22,xzr,xzr
adds x11,x11,x5 // +a[i]*a[i]
adcs x12,x12,x15
adcs x13,x13,x6
adcs x19,x19,x16
adcs x20,x20,x7
adcs x21,x21,x17
adc x22,x22,x8
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
adds x10,x10,x19 // accumulate upper half
adcs x11,x11,x20
adcs x12,x12,x21
adcs x13,x13,x22
adc x19,xzr,xzr
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
sbcs xzr, x19,xzr
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
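The diagram in the comments of sqr_mont_sparse_256 is the usual squaring shortcut: form each off-diagonal product a[i]*a[j] once, double the partial sum, then add the squares a[i]^2 on the diagonal. A small Python check of that identity (illustrative only):

def square_via_diagram(a, w=64, limbs=4):
    mask = (1 << w) - 1
    aw = [(a >> (w * i)) & mask for i in range(limbs)]
    acc = 0
    for i in range(limbs):
        for j in range(i + 1, limbs):
            acc += (aw[i] * aw[j]) << (w * (i + j))   # each cross product once
    acc <<= 1                                         # * 2, as in the diagram
    for i in range(limbs):
        acc += (aw[i] * aw[i]) << (2 * w * i)         # + a[i]^2 terms
    return acc                                        # equals a*a

assert square_via_diagram(0x1234_5678_9ABC_DEF0_0FED_CBA9) == 0x1234_5678_9ABC_DEF0_0FED_CBA9 ** 2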
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,%function
.align 5
from_mont_256:
.inst 0xd503233f
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x4,x3
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldr x29,[sp],#16
.inst 0xd50323bf
ret
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,%function
.align 5
redc_mont_256:
.inst 0xd503233f
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x4,x3
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
ldp x14,x15,[x1,#32]
ldp x16,x17,[x1,#48]
adds x10,x10,x14
adcs x11,x11,x15
adcs x12,x12,x16
adcs x13,x13,x17
adc x9,xzr,xzr
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
sbcs xzr, x9,xzr
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldr x29,[sp],#16
.inst 0xd50323bf
ret
.size redc_mont_256,.-redc_mont_256
.type __mul_by_1_mont_256,%function
.align 5
__mul_by_1_mont_256:
mul x3,x4,x10
ldp x5,x6,[x2]
ldp x7,x8,[x2,#16]
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
adc x13,x9,x17
ret
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256
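__mul_by_1_mont_256 is the reduction-only half of the same scheme: four rounds of "multiply the low word by n0, add that multiple of the modulus, shift one word out". In arbitrary-precision Python, with the same textbook n0 = -p^-1 mod 2^64 convention as the multiplication sketch above (the final conditional subtraction is left to the callers, as in from_mont_256/redc_mont_256):

def mont_reduce(t, p, n0, w=64, limbs=4):
    # Word-by-word Montgomery reduction: t * 2^(-w*limbs) mod p, up to one
    # conditional subtraction, valid for t < p * 2^(w*limbs).
    mask = (1 << w) - 1
    for _ in range(limbs):
        m = ((t & mask) * n0) & mask   # m = n0 * t[0] mod 2^w
        t = (t + m * p) >> w           # low word cancels; shift it out
    return t                           # caller subtracts p if t >= p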

File diff suppressed because it is too large

View file

@@ -0,0 +1,714 @@
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,@function
.align 32
mul_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
pushq %rdi
.cfi_adjust_cfa_offset 8
movq 0(%rdx),%rax
movq 0(%rsi),%r13
movq 8(%rsi),%r14
movq 16(%rsi),%r12
movq 24(%rsi),%rbp
movq %rdx,%rbx
movq %rax,%r15
mulq %r13
movq %rax,%r9
movq %r15,%rax
movq %rdx,%r10
call __mulq_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size mul_mont_sparse_256,.-mul_mont_sparse_256
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,@function
.align 32
sqr_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
pushq %rdi
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%rax
movq %rcx,%r8
movq 8(%rsi),%r14
movq %rdx,%rcx
movq 16(%rsi),%r12
leaq (%rsi),%rbx
movq 24(%rsi),%rbp
movq %rax,%r15
mulq %rax
movq %rax,%r9
movq %r15,%rax
movq %rdx,%r10
call __mulq_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
.type __mulq_mont_sparse_256,@function
.align 32
__mulq_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
mulq %r14
addq %rax,%r10
movq %r15,%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %r12
addq %rax,%r11
movq %r15,%rax
adcq $0,%rdx
movq %rdx,%r12
mulq %rbp
addq %rax,%r12
movq 8(%rbx),%rax
adcq $0,%rdx
xorq %r14,%r14
movq %rdx,%r13
movq %r9,%rdi
imulq %r8,%r9
movq %rax,%r15
mulq 0(%rsi)
addq %rax,%r10
movq %r15,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq 8(%rsi)
addq %rax,%r11
movq %r15,%rax
adcq $0,%rdx
addq %rbp,%r11
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rsi)
addq %rax,%r12
movq %r15,%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rsi)
addq %rax,%r13
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq %rdx,%r14
xorq %r15,%r15
mulq 0(%rcx)
addq %rax,%rdi
movq %r9,%rax
adcq %rdx,%rdi
mulq 8(%rcx)
addq %rax,%r10
movq %r9,%rax
adcq $0,%rdx
addq %rdi,%r10
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r11
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r11
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
addq %rax,%r12
movq 16(%rbx),%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
addq %rdx,%r13
adcq $0,%r14
adcq $0,%r15
movq %r10,%rdi
imulq %r8,%r10
movq %rax,%r9
mulq 0(%rsi)
addq %rax,%r11
movq %r9,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq 8(%rsi)
addq %rax,%r12
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rsi)
addq %rax,%r13
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rsi)
addq %rax,%r14
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq %rdx,%r15
xorq %r9,%r9
mulq 0(%rcx)
addq %rax,%rdi
movq %r10,%rax
adcq %rdx,%rdi
mulq 8(%rcx)
addq %rax,%r11
movq %r10,%rax
adcq $0,%rdx
addq %rdi,%r11
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
addq %rax,%r13
movq 24(%rbx),%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
addq %rdx,%r14
adcq $0,%r15
adcq $0,%r9
movq %r11,%rdi
imulq %r8,%r11
movq %rax,%r10
mulq 0(%rsi)
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq 8(%rsi)
addq %rax,%r13
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rsi)
addq %rax,%r14
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rsi)
addq %rax,%r15
movq %r11,%rax
adcq $0,%rdx
addq %rbp,%r15
adcq %rdx,%r9
xorq %r10,%r10
mulq 0(%rcx)
addq %rax,%rdi
movq %r11,%rax
adcq %rdx,%rdi
mulq 8(%rcx)
addq %rax,%r12
movq %r11,%rax
adcq $0,%rdx
addq %rdi,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq $0,%rdx
addq %rdx,%r15
adcq $0,%r9
adcq $0,%r10
imulq %r8,%rax
movq 8(%rsp),%rsi
movq %rax,%r11
mulq 0(%rcx)
addq %rax,%r12
movq %r11,%rax
adcq %rdx,%r12
mulq 8(%rcx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r12,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r14
movq %r11,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
movq %r14,%rbx
addq %rbp,%r15
adcq $0,%rdx
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %rdx,%r9
adcq $0,%r10
movq %r15,%r12
subq 0(%rcx),%r13
sbbq 8(%rcx),%r14
sbbq 16(%rcx),%r15
movq %r9,%rbp
sbbq 24(%rcx),%r9
sbbq $0,%r10
cmovcq %rax,%r13
cmovcq %rbx,%r14
cmovcq %r12,%r15
movq %r13,0(%rsi)
cmovcq %rbp,%r9
movq %r14,8(%rsi)
movq %r15,16(%rsi)
movq %r9,24(%rsi)
.byte 0xf3,0xc3
.cfi_endproc
.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,@function
.align 32
from_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulq_by_1_mont_256
movq %r14,%r10
movq %r15,%r11
movq %r9,%r12
subq 0(%rbx),%r13
sbbq 8(%rbx),%r14
sbbq 16(%rbx),%r15
sbbq 24(%rbx),%r9
cmovncq %r13,%rax
cmovncq %r14,%r10
cmovncq %r15,%r11
movq %rax,0(%rdi)
cmovncq %r9,%r12
movq %r10,8(%rdi)
movq %r11,16(%rdi)
movq %r12,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,@function
.align 32
redc_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulq_by_1_mont_256
addq 32(%rsi),%r13
adcq 40(%rsi),%r14
movq %r13,%rax
adcq 48(%rsi),%r15
movq %r14,%r10
adcq 56(%rsi),%r9
sbbq %rsi,%rsi
movq %r15,%r11
subq 0(%rbx),%r13
sbbq 8(%rbx),%r14
sbbq 16(%rbx),%r15
movq %r9,%r12
sbbq 24(%rbx),%r9
sbbq $0,%rsi
cmovncq %r13,%rax
cmovncq %r14,%r10
cmovncq %r15,%r11
movq %rax,0(%rdi)
cmovncq %r9,%r12
movq %r10,8(%rdi)
movq %r11,16(%rdi)
movq %r12,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size redc_mont_256,.-redc_mont_256
.type __mulq_by_1_mont_256,@function
.align 32
__mulq_by_1_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%rax
movq 8(%rsi),%r10
movq 16(%rsi),%r11
movq 24(%rsi),%r12
movq %rax,%r13
imulq %rcx,%rax
movq %rax,%r9
mulq 0(%rbx)
addq %rax,%r13
movq %r9,%rax
adcq %rdx,%r13
mulq 8(%rbx)
addq %rax,%r10
movq %r9,%rax
adcq $0,%rdx
addq %r13,%r10
adcq $0,%rdx
movq %rdx,%r13
mulq 16(%rbx)
movq %r10,%r14
imulq %rcx,%r10
addq %rax,%r11
movq %r9,%rax
adcq $0,%rdx
addq %r13,%r11
adcq $0,%rdx
movq %rdx,%r13
mulq 24(%rbx)
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %r13,%r12
adcq $0,%rdx
movq %rdx,%r13
mulq 0(%rbx)
addq %rax,%r14
movq %r10,%rax
adcq %rdx,%r14
mulq 8(%rbx)
addq %rax,%r11
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r11
adcq $0,%rdx
movq %rdx,%r14
mulq 16(%rbx)
movq %r11,%r15
imulq %rcx,%r11
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r12
adcq $0,%rdx
movq %rdx,%r14
mulq 24(%rbx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r14,%r13
adcq $0,%rdx
movq %rdx,%r14
mulq 0(%rbx)
addq %rax,%r15
movq %r11,%rax
adcq %rdx,%r15
mulq 8(%rbx)
addq %rax,%r12
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r12
adcq $0,%rdx
movq %rdx,%r15
mulq 16(%rbx)
movq %r12,%r9
imulq %rcx,%r12
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r13
adcq $0,%rdx
movq %rdx,%r15
mulq 24(%rbx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r15,%r14
adcq $0,%rdx
movq %rdx,%r15
mulq 0(%rbx)
addq %rax,%r9
movq %r12,%rax
adcq %rdx,%r9
mulq 8(%rbx)
addq %rax,%r13
movq %r12,%rax
adcq $0,%rdx
addq %r9,%r13
adcq $0,%rdx
movq %rdx,%r9
mulq 16(%rbx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r9,%r14
adcq $0,%rdx
movq %rdx,%r9
mulq 24(%rbx)
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %r9,%r15
adcq $0,%rdx
movq %rdx,%r9
.byte 0xf3,0xc3
.cfi_endproc
.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

File diff suppressed because it is too large

View file

@@ -0,0 +1,627 @@
.text
.globl mulx_mont_sparse_256
.hidden mulx_mont_sparse_256
.type mulx_mont_sparse_256,@function
.align 32
mulx_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
movq 0(%rdx),%rdx
movq 0(%rsi),%r14
movq 8(%rsi),%r15
movq 16(%rsi),%rbp
movq 24(%rsi),%r9
leaq -128(%rsi),%rsi
leaq -128(%rcx),%rcx
mulxq %r14,%rax,%r11
call __mulx_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size mulx_mont_sparse_256,.-mulx_mont_sparse_256
.globl sqrx_mont_sparse_256
.hidden sqrx_mont_sparse_256
.type sqrx_mont_sparse_256,@function
.align 32
sqrx_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rsi,%rbx
movq %rcx,%r8
movq %rdx,%rcx
movq 0(%rsi),%rdx
movq 8(%rsi),%r15
movq 16(%rsi),%rbp
movq 24(%rsi),%r9
leaq -128(%rbx),%rsi
leaq -128(%rcx),%rcx
mulxq %rdx,%rax,%r11
call __mulx_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
.type __mulx_mont_sparse_256,@function
.align 32
__mulx_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
mulxq %r15,%r15,%r12
mulxq %rbp,%rbp,%r13
addq %r15,%r11
mulxq %r9,%r9,%r14
movq 8(%rbx),%rdx
adcq %rbp,%r12
adcq %r9,%r13
adcq $0,%r14
movq %rax,%r10
imulq %r8,%rax
xorq %r15,%r15
mulxq 0+128(%rsi),%rbp,%r9
adoxq %rbp,%r11
adcxq %r9,%r12
mulxq 8+128(%rsi),%rbp,%r9
adoxq %rbp,%r12
adcxq %r9,%r13
mulxq 16+128(%rsi),%rbp,%r9
adoxq %rbp,%r13
adcxq %r9,%r14
mulxq 24+128(%rsi),%rbp,%r9
movq %rax,%rdx
adoxq %rbp,%r14
adcxq %r15,%r9
adoxq %r9,%r15
mulxq 0+128(%rcx),%rbp,%rax
adcxq %rbp,%r10
adoxq %r11,%rax
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%rax
adoxq %r9,%r12
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r12
adoxq %r9,%r13
mulxq 24+128(%rcx),%rbp,%r9
movq 16(%rbx),%rdx
adcxq %rbp,%r13
adoxq %r9,%r14
adcxq %r10,%r14
adoxq %r10,%r15
adcxq %r10,%r15
adoxq %r10,%r10
adcq $0,%r10
movq %rax,%r11
imulq %r8,%rax
xorq %rbp,%rbp
mulxq 0+128(%rsi),%rbp,%r9
adoxq %rbp,%r12
adcxq %r9,%r13
mulxq 8+128(%rsi),%rbp,%r9
adoxq %rbp,%r13
adcxq %r9,%r14
mulxq 16+128(%rsi),%rbp,%r9
adoxq %rbp,%r14
adcxq %r9,%r15
mulxq 24+128(%rsi),%rbp,%r9
movq %rax,%rdx
adoxq %rbp,%r15
adcxq %r10,%r9
adoxq %r9,%r10
mulxq 0+128(%rcx),%rbp,%rax
adcxq %rbp,%r11
adoxq %r12,%rax
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%rax
adoxq %r9,%r13
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r13
adoxq %r9,%r14
mulxq 24+128(%rcx),%rbp,%r9
movq 24(%rbx),%rdx
adcxq %rbp,%r14
adoxq %r9,%r15
adcxq %r11,%r15
adoxq %r11,%r10
adcxq %r11,%r10
adoxq %r11,%r11
adcq $0,%r11
movq %rax,%r12
imulq %r8,%rax
xorq %rbp,%rbp
mulxq 0+128(%rsi),%rbp,%r9
adoxq %rbp,%r13
adcxq %r9,%r14
mulxq 8+128(%rsi),%rbp,%r9
adoxq %rbp,%r14
adcxq %r9,%r15
mulxq 16+128(%rsi),%rbp,%r9
adoxq %rbp,%r15
adcxq %r9,%r10
mulxq 24+128(%rsi),%rbp,%r9
movq %rax,%rdx
adoxq %rbp,%r10
adcxq %r11,%r9
adoxq %r9,%r11
mulxq 0+128(%rcx),%rbp,%rax
adcxq %rbp,%r12
adoxq %r13,%rax
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%rax
adoxq %r9,%r14
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r14
adoxq %r9,%r15
mulxq 24+128(%rcx),%rbp,%r9
movq %rax,%rdx
adcxq %rbp,%r15
adoxq %r9,%r10
adcxq %r12,%r10
adoxq %r12,%r11
adcxq %r12,%r11
adoxq %r12,%r12
adcq $0,%r12
imulq %r8,%rdx
xorq %rbp,%rbp
mulxq 0+128(%rcx),%r13,%r9
adcxq %rax,%r13
adoxq %r9,%r14
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%r14
adoxq %r9,%r15
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r15
adoxq %r9,%r10
mulxq 24+128(%rcx),%rbp,%r9
movq %r14,%rdx
leaq 128(%rcx),%rcx
adcxq %rbp,%r10
adoxq %r9,%r11
movq %r15,%rax
adcxq %r13,%r11
adoxq %r13,%r12
adcq $0,%r12
movq %r10,%rbp
subq 0(%rcx),%r14
sbbq 8(%rcx),%r15
sbbq 16(%rcx),%r10
movq %r11,%r9
sbbq 24(%rcx),%r11
sbbq $0,%r12
cmovcq %rdx,%r14
cmovcq %rax,%r15
cmovcq %rbp,%r10
movq %r14,0(%rdi)
cmovcq %r9,%r11
movq %r15,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256
.globl fromx_mont_256
.hidden fromx_mont_256
.type fromx_mont_256,@function
.align 32
fromx_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulx_by_1_mont_256
movq %r15,%rdx
movq %r10,%r12
movq %r11,%r13
subq 0(%rbx),%r14
sbbq 8(%rbx),%r15
sbbq 16(%rbx),%r10
sbbq 24(%rbx),%r11
cmovncq %r14,%rax
cmovncq %r15,%rdx
cmovncq %r10,%r12
movq %rax,0(%rdi)
cmovncq %r11,%r13
movq %rdx,8(%rdi)
movq %r12,16(%rdi)
movq %r13,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size fromx_mont_256,.-fromx_mont_256
.globl redcx_mont_256
.hidden redcx_mont_256
.type redcx_mont_256,@function
.align 32
redcx_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulx_by_1_mont_256
addq 32(%rsi),%r14
adcq 40(%rsi),%r15
movq %r14,%rax
adcq 48(%rsi),%r10
movq %r15,%rdx
adcq 56(%rsi),%r11
sbbq %rsi,%rsi
movq %r10,%r12
subq 0(%rbx),%r14
sbbq 8(%rbx),%r15
sbbq 16(%rbx),%r10
movq %r11,%r13
sbbq 24(%rbx),%r11
sbbq $0,%rsi
cmovncq %r14,%rax
cmovncq %r15,%rdx
cmovncq %r10,%r12
movq %rax,0(%rdi)
cmovncq %r11,%r13
movq %rdx,8(%rdi)
movq %r12,16(%rdi)
movq %r13,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size redcx_mont_256,.-redcx_mont_256
.type __mulx_by_1_mont_256,@function
.align 32
__mulx_by_1_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%rax
movq 8(%rsi),%r11
movq 16(%rsi),%r12
movq 24(%rsi),%r13
movq %rax,%r14
imulq %rcx,%rax
movq %rax,%r10
mulq 0(%rbx)
addq %rax,%r14
movq %r10,%rax
adcq %rdx,%r14
mulq 8(%rbx)
addq %rax,%r11
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r11
adcq $0,%rdx
movq %rdx,%r14
mulq 16(%rbx)
movq %r11,%r15
imulq %rcx,%r11
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r12
adcq $0,%rdx
movq %rdx,%r14
mulq 24(%rbx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r14,%r13
adcq $0,%rdx
movq %rdx,%r14
mulq 0(%rbx)
addq %rax,%r15
movq %r11,%rax
adcq %rdx,%r15
mulq 8(%rbx)
addq %rax,%r12
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r12
adcq $0,%rdx
movq %rdx,%r15
mulq 16(%rbx)
movq %r12,%r10
imulq %rcx,%r12
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r13
adcq $0,%rdx
movq %rdx,%r15
mulq 24(%rbx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r15,%r14
adcq $0,%rdx
movq %rdx,%r15
mulq 0(%rbx)
addq %rax,%r10
movq %r12,%rax
adcq %rdx,%r10
mulq 8(%rbx)
addq %rax,%r13
movq %r12,%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdx,%r10
mulq 16(%rbx)
movq %r13,%r11
imulq %rcx,%r13
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r10,%r14
adcq $0,%rdx
movq %rdx,%r10
mulq 24(%rbx)
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %r10,%r15
adcq $0,%rdx
movq %rdx,%r10
mulq 0(%rbx)
addq %rax,%r11
movq %r13,%rax
adcq %rdx,%r11
mulq 8(%rbx)
addq %rax,%r14
movq %r13,%rax
adcq $0,%rdx
addq %r11,%r14
adcq $0,%rdx
movq %rdx,%r11
mulq 16(%rbx)
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %r11,%r15
adcq $0,%rdx
movq %rdx,%r11
mulq 24(%rbx)
addq %rax,%r10
movq %r14,%rax
adcq $0,%rdx
addq %r11,%r10
adcq $0,%rdx
movq %rdx,%r11
.byte 0xf3,0xc3
.cfi_endproc
.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

File diff suppressed because it is too large

1077
blst/elf/sha256-armv8.S Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

1446
blst/elf/sha256-x86_64.s Normal file

File diff suppressed because it is too large