#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
# Reference implementation kept as (runnable) Python, per the approach of
# https://eprint.iacr.org/2020/972; the generated assembly below is an
# unrolled constant-time transcription of this routine.  NOTE: restored
# the indentation lost in transit and the missing ':' after the
# "elif v == 1<<512" clause, which made the reference unparsable.
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 31
    mask = (1 << k) - 1

    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)

        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smulq_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smulq_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    mod <<= 512 - mod.bit_length()  # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1<<512:
        v -= mod

    return v & (2**512 - 1) # to be reduced % mod
___
# Command-line plumbing shared by the blst perlasm scripts: first argument
# is either the assembler "flavour" (elf, mingw64, masm, nasm, ...) or,
# if it contains a dot, the output file name.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 ABI handling is selected by flavour or by an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script, or in the
# sibling ../../perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Everything printed to STDOUT below is piped through the translator,
# which converts the perlasm dialect into the requested assembler syntax.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# Register assignments.  Arguments arrive per the SysV AMD64 ABI; %r8-%r15
# form the accumulator bank.  NOTE(review): $f0/$g0 alias %rdx/%rcx (and
# $cnt aliases %edx), i.e. they overlap $n_ptr/$nx_ptr - the code spills
# and reloads around this deliberately, the aliasing is intentional.
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15));
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edx";
# Frame: 6 eight-byte spill slots plus 2*512 bytes of scratch for the two
# flip-flopping |a|b|u|v| working sets (512-byte alignment of the scratch
# area is established at run time in the prologue).
$frame = 8*6+2*512;
# ct_inverse_mod_256(ret, inp, mod, modx): unwind-annotated prologue,
# run-time 512-byte alignment of the scratch area, and the first two of
# the fifteen 31-bit approximation iterations.  During these first two
# iterations |u| and |v| still fit in one limb, so they are updated with
# bare imulq instead of the __smulq_*x63 helpers used from iteration 3 on.
$code.=<<___;
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256,\@function,4,"unwind"
.align 32
ct_inverse_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*0($n_ptr), @acc[4] # load modulus
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
mov 8*3($n_ptr), @acc[7]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax) # copy modulus to |b|
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax)
mov @acc[7], 8*7(%rax)
mov %rax, $in_ptr
################################# first iteration
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
#mov $f0, 8*0(%rsp) # corrected |f0|
#mov $g0, 8*1(%rsp) # corrected |g0|
mov $f0, 8*8($out_ptr) # initialize |u| with |f0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov $f0, 8*9($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov 8*8($in_ptr), @acc[0] # |u|
mov 8*13($in_ptr), @acc[4] # |v|
mov @acc[0], @acc[1]
imulq 8*0(%rsp), @acc[0] # |u|*|f0|
mov @acc[4], @acc[5]
imulq 8*1(%rsp), @acc[4] # |v|*|g0|
add @acc[4], @acc[0]
mov @acc[0], 8*4($out_ptr) # destination |u|
sar \$63, @acc[0] # sign extension
mov @acc[0], 8*5($out_ptr)
mov @acc[0], 8*6($out_ptr)
mov @acc[0], 8*7($out_ptr)
mov @acc[0], 8*8($out_ptr)
lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor
imulq $f0, @acc[1] # |u|*|f1|
imulq $g0, @acc[5] # |v|*|g1|
add @acc[5], @acc[1]
mov @acc[1], 8*9($out_ptr) # destination |v|
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
mov @acc[1], 8*12($out_ptr)
mov @acc[1], 8*13($out_ptr)
___
# Iterations 3..14: each round takes a 31-bit approximation of |a| and
# |b|, derives |f0|g0|f1|g1| (via __ab_approximation_31_256 ->
# __inner_loop_31_256), applies them to the full-width |a| and |b| with a
# right shift by 31, then to |u| and |v|.  |v| outgrows 256 bits after
# iteration 8, hence the switch of its update from __smulq_256x63 to
# __smulq_512x63.
for($i=2; $i<15; $i++) {
my $smul_512x63 = $i>8 ? "__smulq_512x63"
: "__smulq_256x63";
$code.=<<___;
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
mov $f0, 8*2(%rsp) # corrected |f1|
mov $g0, 8*3(%rsp) # corrected |g1|
mov 8*0(%rsp), $f0 # |f0|
mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*4($out_ptr), $out_ptr # pointer to destination |u|
call __smulq_256x63
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*5($out_ptr),$out_ptr # pointer to destination |v|
call $smul_512x63
___
# After iteration 8 widen |v| to full width by sign-propagating the
# excess limb __smulq_256x63 left in %rbp.
$code.=<<___ if ($i==8);
sar \$63, %rbp # sign extension
mov %rbp, 8*5($out_ptr)
mov %rbp, 8*6($out_ptr)
mov %rbp, 8*7($out_ptr)
___
}
# Last two iterations fused into one 47-step (31 + 512%31) inner loop: by
# now |a| and |b| fit in a single limb each, so no approximation is
# needed.  Only |v| is propagated (|f1|,|g1| moved into |f0|,|g0| for
# __smulq_512x63), then the top 256 bits are conditionally adjusted by
# +/- |modulus| so that a non-negative value below 2^512 is stored, and
# the epilogue restores callee-saved registers.
$code.=<<___;
################################# two[!] last iterations in one go
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$47, $cnt # 31 + 512 % 31
#call __ab_approximation_31 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
#xor @acc[1], @acc[1] # |a_hi|
mov 8*4($in_ptr), @acc[2] # |b_lo|
#xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
#mov $f1, 8*2(%rsp)
#mov $g1, 8*3(%rsp)
#mov 8*0(%rsp), $f0 # |f0|
#mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulq_256x63
#mov 8*2(%rsp), $f0 # |f1|
#mov 8*3(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original |out_ptr|
call __smulq_512x63
adc %rbp, %rdx # the excess limb of the result
mov 8*5(%rsp), $in_ptr # original |nx_ptr|
mov %rdx, %rax
sar \$63, %rdx # result's sign as mask
mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx
add @acc[0], @acc[4] # conditionally add |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
adc \$0, %rax
mov %rax, %rdx
neg %rax
or %rax, %rdx # excess bit or sign as mask
sar \$63, %rax # excess bit as mask
mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx
xor %rax, @acc[0] # conditionally negate |modulus|
xor %rcx, %rcx
xor %rax, @acc[1]
sub %rax, %rcx
xor %rax, @acc[2]
xor %rax, %rdx
add %rcx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, %rdx
add @acc[0], @acc[4] # final adjustment for |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
mov @acc[4], 8*4($out_ptr) # store absolute value
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_inverse_mod_256,.-ct_inverse_mod_256
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 255 bits, while |u| remains 255 bits wide. Yet, we
# always call __smulq_256x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 255 bits, __smulq_512x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
# __smulq_512x63: computes |u|*|f0| + |v|*|g0| where |u| is 4 limbs plus
# a sign limb and |v| is 8 limbs wide.  Each multiplicand pair is first
# conditionally negated (sign of |f0|/|g0| as mask) so that the limb-wise
# mulq chain is unsigned; the |u|*|f0| partial product is parked at
# $out_ptr and folded back in at the end.  The excess limb is left in
# %rbp for the caller.  (The final heredoc also emits the .type/.align/
# label header of __smulq_256x63, whose body is generated just below.)
$code.=<<___;
.type __smulq_512x63,\@abi-omnipotent
.align 32
__smulq_512x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), %rbp # sign limb
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit
xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |u|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, %rbp
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, %rbp
mulq %rbx # |u|*|f0|
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], %rax
mov %rdx, @acc[1]
___
# Middle limbs 1..2 of |u|*|f0|, offloaded as they are produced.
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov @acc[$i], 8*$i($out_ptr)
mov %rdx, @acc[$i+1]
___
}
# Top limb of |u|*|f0|, then the 8-limb |v| is loaded, conditionally
# negated by |g0|'s sign, and multiplied limb by limb.
$code.=<<___;
and %rbx, %rbp
neg %rbp
mulq %rbx
add %rax, @acc[3]
adc %rdx, %rbp
mov @acc[3], 8*3($out_ptr)
mov 8*5($in_ptr), @acc[0] # load |v|
mov 8*6($in_ptr), @acc[1]
mov 8*7($in_ptr), @acc[2]
mov 8*8($in_ptr), @acc[3]
mov 8*9($in_ptr), @acc[4]
mov 8*10($in_ptr), @acc[5]
mov 8*11($in_ptr), @acc[6]
mov 8*12($in_ptr), @acc[7]
mov $g0, $f0
sar \$63, $f0 # |g0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |g0|'s sign as bit
xor $f0, $g0 # conditionally negate |g0|
add %rax, $g0
xor $f0, @acc[0] # conditionally negate |v|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
xor $f0, @acc[6]
xor $f0, @acc[7]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
mulq $g0
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
# Middle limbs 1..6 of |v|*|g0|.
for($i=1; $i<7; $i++) {
$code.=<<___;
mulq $g0
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
# Signed top limb, then fold in the parked |u|*|f0| with |u|'s excess
# limb sign-extended across the upper half.
$code.=<<___;
imulq $g0
add %rax, @acc[7]
adc \$0, %rdx # used in the final step
mov %rbp, %rbx
sar \$63, %rbp # sign extension
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc %rbx, @acc[4]
adc %rbp, @acc[5]
adc %rbp, @acc[6]
adc %rbp, @acc[7]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)
ret
.size __smulq_512x63,.-__smulq_512x63
.type __smulq_256x63,\@abi-omnipotent
.align 32
__smulq_256x63:
___
# Body of __smulq_256x63 (its header is emitted by the previous heredoc):
# |u|*|f0| + |v|*|g0| for 4-limb-plus-sign operands.  The $j loop stamps
# out two near-identical halves - first |u|*|f0| into @acc[0..3]/%rbp,
# then |v|*|g0| into @acc[4..7]/$g0 (note the @acc and $top re-aliasing) -
# which are summed at the end; the excess limb ends up in %rbp.
for($j=0; $j<2; $j++) {
my $k = 8*5*$j;
my @acc=@acc; @acc=@acc[4..7] if($j);
my $top="%rbp"; $top=$g0 if($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), $top # sign/excess limb
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)
xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |u| (or |v|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, $top
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, $top
mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
# Middle limbs 1..2 of the product.
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, $top
neg $top
mulq %rbx
add %rax, @acc[3]
adc %rdx, $top
___
# After the first half, move |g0| into the $f0 slot for the second pass.
$code.=<<___ if ($j==0);
mov $g0, $f0
___
}
# Sum the two partial products; %rbp keeps the excess limb.
$code.=<<___;
add @acc[4], @acc[0] # accumulate |u|*|f0|
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc %rcx, %rbp
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov %rbp, 8*4($out_ptr)
ret
.size __smulq_256x63,.-__smulq_256x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
# __smulq_256_n_shift_by_31: computes abs(|a|*|f0| + |b|*|g0|) >> 31 and
# returns |f0|/|g0| negated to match the result's sign.  The $j loop
# stamps out the two 4-limb products (|a|*|f0| into @acc[0..3], |b|*|g0|
# into @acc[4..7] via re-aliasing); they are then summed, shifted right
# by 31 with shrd, and conditionally negated to an absolute value.
{
$code.=<<___;
.type __smulq_256_n_shift_by_31,\@abi-omnipotent
.align 32
__smulq_256_n_shift_by_31:
mov $f0, 8*0($out_ptr) # offload |f0|
mov $g0, 8*1($out_ptr) # offload |g0|
mov $f0, %rbp
___
for($j=0; $j<2; $j++) {
my $k = 8*4*$j;
my @acc=@acc; @acc=@acc[4..7] if ($j);
my $f0="%rbp"; $f0=$g0 if ($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)
xor $f0, %rbx # conditionally negate |f0| (or |g0|)
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |a| (or |b|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
and %rbx, $f0
neg $f0
mov %rdx, @acc[1]
___
# Middle limbs 1..2 of the product.
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
mulq %rbx
add %rax, @acc[3]
adc %rdx, $f0
___
}
# Sum the partial products, shift right by 31, take the absolute value
# and propagate the applied negation into |f0| and |g0|.
$code.=<<___;
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc $g0, %rbp
mov 8*0($out_ptr), $f0 # restore original |f0|
mov 8*1($out_ptr), $g0 # restore original |g0|
shrd \$31, @acc[1], @acc[0]
shrd \$31, @acc[2], @acc[1]
shrd \$31, @acc[3], @acc[2]
shrd \$31, %rbp, @acc[3]
sar \$63, %rbp # sign as mask
xor %rax, %rax
sub %rbp, %rax # sign as bit
xor %rbp, @acc[0] # conditionally negate the result
xor %rbp, @acc[1]
xor %rbp, @acc[2]
xor %rbp, @acc[3]
add %rax, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
xor %rbp, $f0 # conditionally negate |f0|
xor %rbp, $g0 # conditionally negate |g0|
add %rax, $f0
add %rax, $g0
ret
.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31
___
}
# Approximation and inner-loop subroutines.  __ab_approximation_31_256
# condenses |a| and |b| into single-limb approximations (low 31 bits kept
# exactly, top bits aligned with shld); __inner_loop_31_256 runs $cnt
# branch-free steps of the binary-GCD-style iteration keeping two
# (f,g) pairs packed in one register each with a bias (Pornin's method);
# __inner_loop_62_256 is the plain single-limb variant used for the fused
# final 47 iterations.
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
# NOTE(review): the ret after the jmp below is unreachable; it appears to
# be kept only for uniformity/sizing - confirm before removing.
$code.=<<___;
.type __ab_approximation_31_256,\@abi-omnipotent
.align 32
__ab_approximation_31_256:
mov 8*3($in_ptr), @a[2] # load |a| in reverse order
mov 8*7($in_ptr), @b[2] # load |b| in reverse order
mov 8*2($in_ptr), @a[1]
mov 8*6($in_ptr), @b[1]
mov 8*1($in_ptr), @a[0]
mov 8*5($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*0($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*4($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[2]
cmovz @b[0], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
mov \$0x7FFFFFFF, %eax
and %rax, @a[0]
and %rax, @b[0]
not %rax
and %rax, @a[2]
and %rax, @b[2]
or @a[2], @a[0]
or @b[2], @b[0]
jmp __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
___
}
# Packed-register inner loops; $fg0/$fg1 each hold an (f,g) pair.
$code.=<<___;
.type __inner_loop_31_256,\@abi-omnipotent
.align 32 # comment and punish Coffee Lake by up to 40%
__inner_loop_31_256: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
mov \$0x7FFFFFFF7FFFFFFF, $bias
.Loop_31_256:
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
mov $fg0, $t2
mov $fg1, $t3
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
shr \$1, $a_ # |a_|>>=1
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_31_256
shr \$32, $bias
mov %ecx, %edx # $fg0, $f0
mov ${fg1}d, ${f1}d
shr \$32, $g0
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256,\@abi-omnipotent
.align 32
__inner_loop_62_256:
mov $cnt, %r15d
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov $f0, $g1 # |g1|=1
mov $f0, %r14
.Loop_62_256:
xor $t0, $t0
test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t1
cmovnz $b_lo, $t0
sub $a_lo, $t1 # |b_|-|a_|
mov $a_lo, $t2
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t2, $b_lo # |b_| = |a_|
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shr \$1, $a_lo
test %r14, $t2 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, %r15d
jnz .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}
# Emit the accumulated assembly through the x86_64-xlate.pl pipe opened
# above.  STDOUT is a piped write handle, so translator failures and
# buffered-write errors only surface at close - check it instead of
# dropping them silently.
print $code;
close STDOUT or die "error closing STDOUT: $!";