#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 62
    w = 64
    mask = (1 << w) - 1

    for i in range(0, 766 // k):
        # __ab_approximation_62
        n = max(a.bit_length(), b.bit_length())
        if n < 128:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_62
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smul_383_n_shift_by_62
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smul_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1

    if v < 0:
        v += mod << (768 - mod.bit_length())    # left aligned

    return v & (2**768 - 1) # to be reduced % mod
___
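#
# A minimal sanity check for the model above (a sketch, not part of the
# generator; assumes Python 3.8+ for pow(inp, -1, mod), and the 2^766
# scaling is derived from the divstep invariants u*inp == a*2^t and
# v*inp == b*2^t (mod mod) -- an observation, not a claim from the paper):
#
#   mod = (1 << 383) - 1        # an arbitrary odd 383-bit modulus
#   v = ct_inverse_mod_383(5, mod)
#   assert v % mod == pow(5, -1, mod) * pow(2, 766, mod) % mod
#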
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(3..14));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21));
my $cnt = $n_ptr;
my @t = map("x$_",(22..28,2));
my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11];
$frame = 16+2*512;
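# 16 bytes at the bottom of the frame hold the spilled pointer pair, and the
# 2*512 bytes guarantee that a 512-byte-aligned, 512-byte region can be
# carved out for two 256-byte |a|b|u|v| snapshots, which the code below
# ping-pongs between with "eor #256".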
$code.=<<___;
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #$frame
ldp @t[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
ldp @acc[4], @acc[5], [$in_ptr,#8*4]
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
stp $out_ptr, $nx_ptr, [sp]
ldp @acc[6], @acc[7], [$n_ptr,#8*0]
ldp @acc[8], @acc[9], [$n_ptr,#8*2]
ldp @acc[10], @acc[11], [$n_ptr,#8*4]
stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4]
stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b|
stp @acc[8], @acc[9], [$in_ptr,#8*8]
stp @acc[10], @acc[11], [$in_ptr,#8*10]
////////////////////////////////////////// first iteration
mov $cnt, #62
bl .Lab_approximation_62_loaded
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str $f0,[$out_ptr,#8*12] // initialize |u| with |f0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str $f0, [$out_ptr,#8*12] // initialize |v| with |f1|
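// (|u| and |v| start out as 1 and 0, so the first iteration's |u| and |v|
// are simply the single-limb |f0| and |f1|; no multiplication is needed yet)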
////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
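// after the first iteration |u| and |v| are single-limb, so the updated
// |u| = |u|*|f0| + |v|*|g0| (and likewise |v|) fits in two limbs; compute
// it inline and sign-extend it over the remaining limbs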
ldr @acc[4], [$in_ptr,#8*12] // |u|
ldr @acc[5], [$in_ptr,#8*18] // |v|
mul @acc[0], $f_, @acc[4] // |u|*|f0|
smulh @acc[1], $f_, @acc[4]
mul @acc[2], $g_, @acc[5] // |v|*|g0|
smulh @acc[3], $g_, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*8]
stp @acc[2], @acc[2], [$out_ptr,#8*10]
mul @acc[0], $f0, @acc[4] // |u|*|f1|
smulh @acc[1], $f0, @acc[4]
mul @acc[2], $g0, @acc[5] // |v|*|g1|
smulh @acc[3], $g0, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*12]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*14]
stp @acc[2], @acc[2], [$out_ptr,#8*16]
___
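# Iterations 3 through 11 ($i = 2..10) follow the same pattern while |u| and
# |v| grow by up to 62 bits per round: at $i == 5 the upper half of the
# 767-bit |v| slot is seeded with the sign word, and past that point
# __smul_767x63_tail completes the upper half of the |v| product.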
for($i=2; $i<11; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add $out_ptr, $out_ptr, #8*6 // pointer to destination |u|
bl __smul_383x63
mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |v|
bl __smul_383x63
___
$code.=<<___ if ($i>5);
bl __smul_767x63_tail
___
$code.=<<___ if ($i==5);
asr @t[5], @t[5], #63 // sign extension
stp @t[5], @t[5], [$out_ptr,#8*6]
stp @t[5], @t[5], [$out_ptr,#8*8]
stp @t[5], @t[5], [$out_ptr,#8*10]
___
}
$code.=<<___;
////////////////////////////////////////// iteration before last
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load
ldp $b_lo, $b_hi, [$in_ptr,#8*6]
bl __inner_loop_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
str $a_lo, [$out_ptr,#8*0]
str $b_lo, [$out_ptr,#8*6]
mov $f_, $f0 // exact |f0|
mov $g_, $g0 // exact |g0|
mov $f0, $f1
mov $g0, $g1
add $out_ptr, $out_ptr, #8*12 // pointer to dst |u|
bl __smul_383x63
mov $f_, $f0 // exact |f1|
mov $g_, $g0 // exact |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
eor $a_hi, $a_hi, $a_hi
ldr $b_lo, [$in_ptr,#8*6]
eor $b_hi, $b_hi, $b_hi
bl __inner_loop_62
mov $f_, $f1
mov $g_, $g1
ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
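// mirror the reference model's "if v < 0: v += mod << ..." step by adding
// mod<<384 to the upper limbs under |v|'s sign mask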
asr @t[0], @acc[5], #63 // sign as mask
ldp @acc[6], @acc[7], [$f0,#8*0]
ldp @acc[8], @acc[9], [$f0,#8*2]
ldp @acc[10], @acc[11], [$f0,#8*4]
and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally
and @acc[7], @acc[7], @t[0]
adds @acc[0], @acc[0], @acc[6]
and @acc[8], @acc[8], @t[0]
adcs @acc[1], @acc[1], @acc[7]
and @acc[9], @acc[9], @t[0]
adcs @acc[2], @acc[2], @acc[8]
and @acc[10], @acc[10], @t[0]
adcs @acc[3], @acc[3], @acc[9]
and @acc[11], @acc[11], @t[0]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @acc[11]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
autiasp
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
.type __smul_383x63, %function
.align 5
__smul_383x63:
___
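# Two unrolled passes: $j == 0 multiplies |u| by |f_|, $j == 1 multiplies
# |v| by |g_| (note the aliasing of $f_ and @acc below); the 383x63-bit
# products are summed after the loop, with the carry words parked for
# __smul_767x63_tail.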
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*12+8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $f_
eor @acc[5], @acc[5], $f1
umulh @t[1], @acc[1], $f_
adcs @acc[4], @acc[4], xzr
umulh @t[2], @acc[2], $f_
adcs @acc[5], @acc[5], xzr
umulh @t[3], @acc[3], $f_
___
$code.=<<___ if ($j);
adc $g1, xzr, xzr // used in __smul_767x63_tail
___
$code.=<<___;
umulh @t[4], @acc[4], $f_
mul @acc[0], @acc[0], $f_
mul @acc[1], @acc[1], $f_
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], $f_
adcs @acc[3], @acc[3], @t[2]
mul @t[5+$j],@acc[5], $f_
adcs @acc[4], @acc[4], @t[3]
adcs @t[5+$j],@t[5+$j],@t[4]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adcs @t[5], @t[5], @t[6]
stp @acc[4], @t[5], [$out_ptr,#8*4]
adc @t[6], @t[7], xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh @t[5], @acc[5], $f_
ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v|
umulh @acc[11],@acc[11], $g_
ldp @acc[2], @acc[3], [$in_ptr,#8*26]
ldp @acc[4], @acc[5], [$in_ptr,#8*28]
eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v|
eor @acc[1], @acc[1], $f1
eor @acc[2], @acc[2], $f1
adds @acc[0], @acc[0], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[5], @acc[5], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $g_
adcs @acc[4], @acc[4], xzr
umulh @t[1], @acc[1], $g_
adc @acc[5], @acc[5], xzr
umulh @t[2], @acc[2], $g_
add @acc[11], @acc[11], @t[6]
umulh @t[3], @acc[3], $g_
asr @t[6], @t[5], #63
umulh @t[4], @acc[4], $g_
mul @acc[0], @acc[0], $g_
mul @acc[1], @acc[1], $g_
mul @acc[2], @acc[2], $g_
adds @acc[0], @acc[0], @acc[11]
mul @acc[3], @acc[3], $g_
adcs @acc[1], @acc[1], @t[0]
mul @acc[4], @acc[4], $g_
adcs @acc[2], @acc[2], @t[1]
mul @acc[5], @acc[5], $g_
adcs @acc[3], @acc[3], @t[2]
adcs @acc[4], @acc[4], @t[3]
adc @acc[5], @acc[5], @t[4]
adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @acc[3], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @t[6]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @t[6]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
___
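# Same two-pass structure: |a|*|f0| and |b|*|g0| are computed with
# conditionally negated multiplicands, summed, shifted right by 62 bits,
# and negated again if the sum came out negative; $f0 and $g0 are
# sign-adjusted to match on the way out.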
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|)
sub @t[7], @t[7], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
umulh @t[0], @acc[0], @t[7]
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], @t[7]
eor @acc[5], @acc[5], @t[6]
umulh @t[2], @acc[2], @t[7]
adcs @acc[4], @acc[4], xzr
umulh @t[3], @acc[3], @t[7]
adc @acc[5], @acc[5], xzr
umulh @t[4], @acc[4], @t[7]
smulh @t[5+$j], @acc[5], @t[7]
mul @acc[0], @acc[0], @t[7]
mul @acc[1], @acc[1], @t[7]
mul @acc[2], @acc[2], @t[7]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[7]
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], @t[7]
adcs @acc[3], @acc[3], @t[2]
mul @acc[5], @acc[5], @t[7]
adcs @acc[4], @acc[4], @t[3]
adcs @acc[5], @acc[5], @t[4]
adc @t[5+$j], @t[5+$j], xzr
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
adcs @acc[4], @acc[4], @acc[10]
adcs @acc[5], @acc[5], @acc[11]
adc @acc[6], @t[5], @t[6]
extr @acc[0], @acc[1], @acc[0], #62
extr @acc[1], @acc[2], @acc[1], #62
extr @acc[2], @acc[3], @acc[2], #62
asr @t[6], @acc[6], #63
extr @acc[3], @acc[4], @acc[3], #62
extr @acc[4], @acc[5], @acc[4], #62
extr @acc[5], @acc[6], @acc[5], #62
eor @acc[0], @acc[0], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
adcs @acc[3], @acc[3], xzr
eor @acc[5], @acc[5], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adc @acc[5], @acc[5], xzr
stp @acc[4], @acc[5], [$out_ptr,#8*4]
eor $f0, $f0, @t[6]
eor $g0, $g0, @t[6]
sub $f0, $f0, @t[6]
sub $g0, $g0, @t[6]
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
___
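# Compose 128-bit approximations of |a| and |b|: the bottom 64 bits as-is
# plus the top-most 64 bits starting at the highest bit set in either
# operand, mirroring __ab_approximation_62 in the reference model above.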
{
my @a = @acc[0..5];
my @b = @acc[6..11];
$code.=<<___;
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp @a[4], @a[5], [$in_ptr,#8*4]
ldp @b[4], @b[5], [$in_ptr,#8*10]
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*8]
.Lab_approximation_62_loaded:
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[3], ne
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
csel @b[4], @b[4], @b[3], ne
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*6]
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[2], ne
orr @t[0], @a[5], @b[5] // ... and ones before that ...
csel @b[4], @b[4], @b[2], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[1], ne
orr @t[0], @a[5], @b[5]
csel @b[4], @b[4], @b[1], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
neg @t[1], @t[0]
lslv @a[5], @a[5], @t[0] // align high limbs to the left
lslv @b[5], @b[5], @t[0]
lsrv @a[4], @a[4], @t[1]
lsrv @b[4], @b[4], @t[1]
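// @t[1] is -@t[0]; "@t[1],asr#6" is all-ones for @t[0] in 1..63 and zero
// for @t[0]==0, discarding the lsrv result when no shift was performed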
and @a[4], @a[4], @t[1], asr#6
and @b[4], @b[4], @t[1], asr#6
orr @a[5], @a[5], @a[4]
orr @b[5], @b[5], @b[4]
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
___
}
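# Branchless divstep: each round conditionally swaps (|a_|,f0,g0) with
# (|b_|,f1,g1) when |a_| is odd and smaller than |b_|, subtracts, halves
# |a_| and doubles f1 and g1, all with csel and masking so that there are
# no data-dependent branches.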
$code.=<<___;
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1
.Loop_62:
sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
subs @t[2], $b_lo, $a_lo // |b_|-|a_|
and @t[0], $b_lo, @t[6]
sbc @t[3], $b_hi, $a_hi
and @t[1], $b_hi, @t[6]
subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
sbcs @t[5], $a_hi, @t[1]
mov @t[1], $g0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $b_hi, $b_hi, $a_hi, hs
csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $a_hi, @t[5], @t[3], hs
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
extr $a_lo, $a_hi, $a_lo, #1
lsr $a_hi, $a_hi, #1
and @t[0], $f1, @t[6]
and @t[1], $g1, @t[6]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
___
print $code;
close STDOUT;