#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 -
# on Cortex-A57.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
#                                                       const vec256 modx);
#
# Python model of the algorithm that the generated assembly implements.
# The text is only accumulated into $python_ref, which nothing else in this
# script reads, so it serves purely as executable-style documentation. The
# "# __name" markers inside it point at the assembly subroutine that carries
# out the corresponding step.
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 31
    mask = (1 << k) - 1

    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)

        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smul_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smul_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    mod <<= 512 - mod.bit_length()  # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1<<512:
        v -= mod

    return v & (2**512 - 1) # to be reduced % mod
___

# Command line: <flavour> [<output>].
#
# With a flavour other than "void" the generated text is piped through
# arm-xlate.pl (run with the current perl, $^X), which receives the flavour
# and the output name. Otherwise the text is written to <output> directly,
# or to the existing STDOUT when no output name was given.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's own path, including the trailing
    # separator; empty when $0 carries no directory component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for the translator next to this script first, then in the
    # shared perlasm directory.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failure to start the pipe used to go unnoticed; report it.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # The two-argument form is kept so that every output name accepted
    # before is still interpreted the same way; only the missing error
    # check is added.
    open STDOUT,">$output"
        or die "can't open $output: $!";
}

# AArch64 register assignment shared by every code template below.
#
#   x0-x3    the four pointer arguments (ret, inp, mod, modx)
#   x4-x11   @acc, eight accumulator limbs
#   x12-x17  the f/g factors and the two scratch multipliers $f_/$g_
#   x19-x26  @t, temporaries (x18 is left unused)
#
# $cnt shares x2 with $n_ptr, and $a_lo/$b_lo are the last registers of the
# lower and upper halves of @acc.
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = qw(x0 x1 x2 x3);
my @acc = map { "x$_" } 4 .. 11;
my ($f0, $g0, $f1, $g1, $f_, $g_) = qw(x12 x13 x14 x15 x16 x17);
my @t = map { "x$_" } 19 .. 26;
my $cnt = $n_ptr;
my ($a_lo, $b_lo) = ($acc[3], $acc[7]);

# Scratch area reserved below the register save area: two 512-byte units
# plus 16 bytes.
$frame = 2*512 + 16;

# ct_inverse_mod_256 entry point: prologue, scratch-area setup and the first
# two iterations.
#
# Prologue: sign the return address, save x29/x30 and x19-x26 in an 80-byte
# area, then reserve $frame more bytes. The caller's out pointer is parked at
# [sp]. $in_ptr is re-pointed at a 512-byte-aligned address inside the frame,
# so that "eor ..., #256" flips between two 256-byte halves that alternate as
# source and destination of |a|b|u|v|. Within a half (offsets in 8-byte
# limbs): |a| at 0, |b| at 4, |u| at 8, |v| at 13.
#
# Iteration 1 enters the approximation code at its "_loaded" label because
# the inputs are already in @acc; |u| and |v| are simply initialized with
# |f0| and |f1|, one limb each.
# Iteration 2 still has single-limb |u| and |v|, so their update is done with
# madd and the results are stored sign-extended to five limbs.
$code.=<<___;
.text

.globl	ct_inverse_mod_256
.type	ct_inverse_mod_256, %function
.align	5
ct_inverse_mod_256:
	paciasp
	stp	x29, x30, [sp,#-80]!
	add	x29, sp, #0
	stp	x19, x20, [sp,#16]
	stp	x21, x22, [sp,#32]
	stp	x23, x24, [sp,#48]
	stp	x25, x26, [sp,#64]
	sub	sp, sp, #$frame

	ldp	@acc[0], @acc[1], [$in_ptr,#8*0]
	ldp	@acc[2], @acc[3], [$in_ptr,#8*2]

	add	$in_ptr, sp, #16+511	// find closest 512-byte-aligned spot
	and	$in_ptr, $in_ptr, #-512	// in the frame...
	str	$out_ptr, [sp]

	ldp	@acc[4], @acc[5], [$n_ptr,#8*0]
	ldp	@acc[6], @acc[7], [$n_ptr,#8*2]

	stp	@acc[0], @acc[1], [$in_ptr,#8*0]	// copy input to |a|
	stp	@acc[2], @acc[3], [$in_ptr,#8*2]
	stp	@acc[4], @acc[5], [$in_ptr,#8*4]	// copy modulus to |b|
	stp	@acc[6], @acc[7], [$in_ptr,#8*6]

	////////////////////////////////////////// first iteration
	bl	.Lab_approximation_31_256_loaded

	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
	bl	__smul_256_n_shift_by_31
	str	$f0,[$out_ptr,#8*8]		// initialize |u| with |f0|

	mov	$f0, $f1			// |f1|
	mov	$g0, $g1			// |g1|
	add	$out_ptr, $out_ptr, #8*4	// pointer to dst |b|
	bl	__smul_256_n_shift_by_31
	str	$f0, [$out_ptr,#8*9]		// initialize |v| with |f1|

	////////////////////////////////////////// second iteration
	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
	bl	__ab_approximation_31_256

	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
	bl	__smul_256_n_shift_by_31
	mov	$f_, $f0			// corrected |f0|
	mov	$g_, $g0			// corrected |g0|

	mov	$f0, $f1			// |f1|
	mov	$g0, $g1			// |g1|
	add	$out_ptr, $out_ptr, #8*4	// pointer to destination |b|
	bl	__smul_256_n_shift_by_31

	ldr	@acc[4], [$in_ptr,#8*8]		// |u|
	ldr	@acc[5], [$in_ptr,#8*13]	// |v|
	madd	@acc[0], $f_, @acc[4], xzr	// |u|*|f0|
	madd	@acc[0], $g_, @acc[5], @acc[0]	// |v|*|g0|
	str	@acc[0], [$out_ptr,#8*4]
	asr	@acc[1], @acc[0], #63		// sign extenstion
	stp	@acc[1], @acc[1], [$out_ptr,#8*5]
	stp	@acc[1], @acc[1], [$out_ptr,#8*7]

	madd	@acc[0], $f0, @acc[4], xzr	// |u|*|f1|
	madd	@acc[0], $g0, @acc[5], @acc[0]	// |v|*|g1|
	str	@acc[0], [$out_ptr,#8*9]
	asr	@acc[1], @acc[0], #63		// sign extenstion
	stp	@acc[1], @acc[1], [$out_ptr,#8*10]
	stp	@acc[1], @acc[1], [$out_ptr,#8*12]
___
# Iterations 3..15 are unrolled at generation time: the loop below appends
# the same instruction sequence 13 times ($i = 2..14), varying only in how
# the |v| update is finished.
for($i=2; $i<15; $i++) {
# Common part: flip the source/destination halves, derive the f/g factors
# from the approximations, update |a| and |b|, then update |u| (five limbs)
# and start the update of |v| with a second __smul_256x63 call.
$code.=<<___;
	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
	bl	__ab_approximation_31_256

	eor	$out_ptr, $in_ptr, #256		// pointer to dst |a|b|u|v|
	bl	__smul_256_n_shift_by_31
	mov	$f_, $f0			// corrected |f0|
	mov	$g_, $g0			// corrected |g0|

	mov	$f0, $f1			// |f1|
	mov	$g0, $g1			// |g1|
	add	$out_ptr, $out_ptr, #8*4	// pointer to destination |b|
	bl	__smul_256_n_shift_by_31

	add	$out_ptr, $out_ptr, #8*4	// pointer to destination |u|
	bl	__smul_256x63
	adc	@t[3], @t[3], @t[4]
	str	@t[3], [$out_ptr,#8*4]

	mov	$f_, $f0			// corrected |f1|
	mov	$g_, $g0			// corrected |g1|
	add	$out_ptr, $out_ptr, #8*5	// pointer to destination |v|
	bl	__smul_256x63
___
# Later iterations ($i>7): finish |v| with __smul_512x63_tail, which also
# processes the limbs of |v| past the fifth.
$code.=<<___	if ($i>7);
	bl	__smul_512x63_tail
___
# Earlier iterations ($i<=7): combine the two top-limb terms with the pending
# carry and store that value into limbs 4..7 of the destination |v|.
$code.=<<___	if ($i<=7);
	adc	@t[3], @t[3], @t[4]
	stp	@t[3], @t[3], [$out_ptr,#8*4]
	stp	@t[3], @t[3], [$out_ptr,#8*6]
___
}
# Tail of ct_inverse_mod_256. The remaining 47 steps (31 + 512 % 31) run in
# a single __inner_loop_62_256 call on the low limbs of |a| and |b|, which
# the inline comments note are exact at this point. The last
# |v| = |u|*|f1| + |v|*|g1| goes straight to the caller's buffer (pointer
# reloaded from [sp]); the top-most limb is then worked out and the four
# limbs loaded from $nx_ptr are conditionally added to, or subtracted from,
# the upper four limbs of the result. The epilogue releases the frame,
# restores x19-x26 and x29, and authenticates the return address (x30 is
# reloaded right after the last bl).
#
# The closing lines of this heredoc open __smul_256x63; its body is appended
# by the loop that follows.
$code.=<<___;
	////////////////////////////////////////// two[!] last iterations
	eor	$in_ptr, $in_ptr, #256		// flip-flop src |a|b|u|v|
	mov	$cnt, #47			// 31 + 512 % 31
	//bl	__ab_approximation_62_256	// |a| and |b| are exact,
	ldr	$a_lo, [$in_ptr,#8*0]		// just load
	ldr	$b_lo, [$in_ptr,#8*4]
	bl	__inner_loop_62_256

	mov	$f_, $f1
	mov	$g_, $g1
	ldr	$out_ptr, [sp]			// original out_ptr
	bl	__smul_256x63
	bl	__smul_512x63_tail
	ldr	x30, [x29,#8]

	smulh	@t[1], @acc[3], $g_		// figure out top-most limb
	ldp	@acc[4], @acc[5], [$nx_ptr,#8*0]
	adc	@t[4], @t[4], @t[6]
	ldp	@acc[6], @acc[7], [$nx_ptr,#8*2]

	add	@t[1], @t[1], @t[4]		// @t[1] is 1, 0 or -1
	asr	@t[0], @t[1], #63		// sign as mask

	and	@t[4],   @acc[4], @t[0]		// add mod<<256 conditionally
	and	@t[5],   @acc[5], @t[0]
	adds	@acc[0], @acc[0], @t[4]
	and	@t[6],   @acc[6], @t[0]
	adcs	@acc[1], @acc[1], @t[5]
	and	@t[7],   @acc[7], @t[0]
	adcs	@acc[2], @acc[2], @t[6]
	adcs	@acc[3], @t[3],   @t[7]
	adc	@t[1], @t[1], xzr		// @t[1] is 1, 0 or -1

	neg	@t[0], @t[1]
	orr	@t[1], @t[1], @t[0]		// excess bit or sign as mask
	asr	@t[0], @t[0], #63		// excess bit as mask

	and	@acc[4], @acc[4], @t[1]		// mask |mod|
	and	@acc[5], @acc[5], @t[1]
	and	@acc[6], @acc[6], @t[1]
	and	@acc[7], @acc[7], @t[1]

	eor	@acc[4], @acc[4], @t[0]		// conditionally negate |mod|
	eor	@acc[5], @acc[5], @t[0]
	adds	@acc[4], @acc[4], @t[0], lsr#63
	eor	@acc[6], @acc[6], @t[0]
	adcs	@acc[5], @acc[5], xzr
	eor	@acc[7], @acc[7], @t[0]
	adcs	@acc[6], @acc[6], xzr
	adc	@acc[7], @acc[7], xzr

	adds	@acc[0], @acc[0], @acc[4]	// final adjustment for |mod|<<256
	adcs	@acc[1], @acc[1], @acc[5]
	adcs	@acc[2], @acc[2], @acc[6]
	stp	@acc[0], @acc[1], [$out_ptr,#8*4]
	adc	@acc[3], @acc[3], @acc[7]
	stp	@acc[2], @acc[3], [$out_ptr,#8*6]

	add	sp, sp, #$frame
	ldp	x19, x20, [x29,#16]
	ldp	x21, x22, [x29,#32]
	ldp	x23, x24, [x29,#48]
	ldp	x25, x26, [x29,#64]
	ldr	x29, [sp],#80
	autiasp
	ret
.size	ct_inverse_mod_256,.-ct_inverse_mod_256

////////////////////////////////////////////////////////////////////////
.type	__smul_256x63, %function
.align	5
__smul_256x63:
___
# Body of __smul_256x63, generated in two passes over the same template:
#   $j == 0: |u| (byte offset 8*8 from $in_ptr) times $f_, in @acc[0..3];
#   $j == 1: |v| (byte offset 8*13 from $in_ptr) times $g_, in @acc[4..7].
# Each pass conditionally negates the five-limb operand according to the
# multiplier's sign ($f1 is used as the sign mask), replaces the multiplier
# with its absolute value and forms the product of the four low limbs. The
# fifth operand limb is kept in @t[3+$j], zeroed when the multiplier is zero.
for($j=0; $j<2; $j++) {
# Lexical shadows select this pass's multiplier register, accumulator bank
# and operand offset.
my $f_ = $f_;   $f_ = $g_          if ($j);
my @acc = @acc; @acc = @acc[4..7]  if ($j);
my $k = 8*8+8*5*$j;
$code.=<<___;
	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |u| (or |v|)
	asr	$f1, $f_, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
	eor	$f_, $f_, $f1		// conditionally negate |f_| (or |g_|)
	ldr	@t[3+$j], [$in_ptr,#8*4+$k]

	eor	@acc[0], @acc[0], $f1	// conditionally negate |u| (or |v|)
	sub	$f_, $f_, $f1
	eor	@acc[1], @acc[1], $f1
	adds	@acc[0], @acc[0], $f1, lsr#63
	eor	@acc[2], @acc[2], $f1
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], $f1
	adcs	@acc[2], @acc[2], xzr
	eor	@t[3+$j], @t[3+$j], $f1
	 umulh	@t[0], @acc[0], $f_
	adcs	@acc[3], @acc[3], xzr
	 umulh	@t[1], @acc[1], $f_
	adcs	@t[3+$j], @t[3+$j], xzr
	 umulh	@t[2], @acc[2], $f_
___
# Second pass only: capture the carry out of the negation in $g1, where
# __smul_512x63_tail picks it up.
$code.=<<___	if ($j!=0);
	adc	$g1, xzr, xzr		// used in __smul_512x63_tail
___
$code.=<<___;
	mul	@acc[0], @acc[0], $f_
	 cmp	$f_, #0
	mul	@acc[1], @acc[1], $f_
	 csel	@t[3+$j], @t[3+$j], xzr, ne
	mul	@acc[2], @acc[2], $f_
	adds	@acc[1], @acc[1], @t[0]
	mul	@t[5+$j], @acc[3], $f_
	adcs	@acc[2], @acc[2], @t[1]
	adcs	@t[5+$j], @t[5+$j], @t[2]
___
# First pass only: start the carry accumulator @t[7]; the second pass's carry
# is added to it by the code that follows the loop.
$code.=<<___	if ($j==0);
	adc	@t[7], xzr, xzr
___
}
# Three pieces share this heredoc:
#  - the end of __smul_256x63: fold the second pass's carry into @t[7], add
#    the |u| and |v| partial products and store the four low limbs at
#    $out_ptr;
#  - __smul_512x63_tail: continues from the register state left behind by
#    __smul_256x63, loads three more limbs of |v| (limb offsets 18..20 from
#    $in_ptr) and stores limbs 4..7 of the result at $out_ptr; @t[4] and the
#    final carry flag are left for the caller's last step;
#  - the label of __smul_256_n_shift_by_31, whose body is appended by the
#    loop that follows.
$code.=<<___;
	adc	@t[7], @t[7], xzr

	adds	@acc[0], @acc[0], @acc[4]
	adcs	@acc[1], @acc[1], @acc[5]
	adcs	@acc[2], @acc[2], @acc[6]
	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
	adcs	@t[5],   @t[5],   @t[6]
	stp	@acc[2], @t[5], [$out_ptr,#8*2]

	ret
.size	__smul_256x63,.-__smul_256x63

.type	__smul_512x63_tail, %function
.align	5
__smul_512x63_tail:
	umulh	@t[5], @acc[3], $f_
	ldp	@acc[1], @acc[2], [$in_ptr,#8*18]	// load rest of |v|
	adc	@t[7], @t[7], xzr
	ldr	@acc[3], [$in_ptr,#8*20]
	and	@t[3], @t[3], $f_

	umulh	@acc[7], @acc[7], $g_	// resume |v|*|g1| chain

	sub	@t[5], @t[5], @t[3]	// tie up |u|*|f1| chain
	asr	@t[6], @t[5], #63

	eor	@acc[1], @acc[1], $f1	// conditionally negate rest of |v|
	eor	@acc[2], @acc[2], $f1
	adds	@acc[1], @acc[1], $g1
	eor	@acc[3], @acc[3], $f1
	adcs	@acc[2], @acc[2], xzr
	 umulh	@t[0], @t[4],   $g_
	adc	@acc[3], @acc[3], xzr
	 umulh	@t[1], @acc[1], $g_
	add	@acc[7], @acc[7], @t[7]
	 umulh	@t[2], @acc[2], $g_

	mul	@acc[0], @t[4],   $g_
	mul	@acc[1], @acc[1], $g_
	adds	@acc[0], @acc[0], @acc[7]
	mul	@acc[2], @acc[2], $g_
	adcs	@acc[1], @acc[1], @t[0]
	mul	@t[3],   @acc[3], $g_
	adcs	@acc[2], @acc[2], @t[1]
	adcs	@t[3],   @t[3],   @t[2]
	adc	@t[4], xzr, xzr		// used in the final step

	adds	@acc[0], @acc[0], @t[5]
	adcs	@acc[1], @acc[1], @t[6]
	adcs	@acc[2], @acc[2], @t[6]
	stp	@acc[0], @acc[1], [$out_ptr,#8*4]
	adcs	@t[3],   @t[3],   @t[6]	// carry is used in the final step
	stp	@acc[2], @t[3],   [$out_ptr,#8*6]

	ret
.size	__smul_512x63_tail,.-__smul_512x63_tail

.type	__smul_256_n_shift_by_31, %function
.align	5
__smul_256_n_shift_by_31:
___
# Body of __smul_256_n_shift_by_31, generated in two passes over the same
# template:
#   $j == 0: |a| (byte offset 0 from $in_ptr) times $f0, in @acc[0..3];
#   $j == 1: |b| (byte offset 8*4 from $in_ptr) times $g0, in @acc[4..7].
# The multiplier's absolute value is formed in @t[6], so $f0/$g0 themselves
# are not modified by this part. The operand is conditionally negated with
# the multiplier's sign mask (@t[5]) and the top limb of each product is
# collected in @t[3+$j].
for($j=0; $j<2; $j++) {
# Lexical shadows select this pass's multiplier register, accumulator bank
# and operand offset.
my $f0 = $f0;   $f0 = $g0           if ($j);
my @acc = @acc; @acc = @acc[4..7]   if ($j);
my $k = 8*4*$j;
$code.=<<___;
	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |a| (or |b|)
	asr	@t[5], $f0, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
	eor	@t[6], $f0, @t[5]	// conditionally negate |f0| (or |g0|)

	eor	@acc[0], @acc[0], @t[5]	// conditionally negate |a| (or |b|)
	sub	@t[6], @t[6], @t[5]
	eor	@acc[1], @acc[1], @t[5]
	adds	@acc[0], @acc[0], @t[5], lsr#63
	eor	@acc[2], @acc[2], @t[5]
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], @t[5]
	 umulh	@t[0], @acc[0], @t[6]
	adcs	@acc[2], @acc[2], xzr
	 umulh	@t[1], @acc[1], @t[6]
	adc	@acc[3], @acc[3], xzr
	 umulh	@t[2], @acc[2], @t[6]
	and	@t[5], @t[5], @t[6]
	 umulh	@t[3+$j], @acc[3], @t[6]
	neg	@t[5], @t[5]

	mul	@acc[0], @acc[0], @t[6]
	mul	@acc[1], @acc[1], @t[6]
	mul	@acc[2], @acc[2], @t[6]
	adds	@acc[1], @acc[1], @t[0]
	mul	@acc[3], @acc[3], @t[6]
	adcs	@acc[2], @acc[2], @t[1]
	adcs	@acc[3], @acc[3], @t[2]
	adc	@t[3+$j], @t[3+$j], @t[5]
___
}
# End of __smul_256_n_shift_by_31: add the two products (four limbs plus a
# top limb collected in @acc[4]), shift the sum right by 31 bits with an extr
# chain, make the result non-negative and store its four limbs at $out_ptr.
# When the result had to be negated, the signs of $f0 and $g0 are flipped as
# well so the caller sees factors consistent with the stored value.
$code.=<<___;
	adds	@acc[0], @acc[0], @acc[4]
	adcs	@acc[1], @acc[1], @acc[5]
	adcs	@acc[2], @acc[2], @acc[6]
	adcs	@acc[3], @acc[3], @acc[7]
	adc	@acc[4], @t[3],   @t[4]

	extr	@acc[0], @acc[1], @acc[0], #31
	extr	@acc[1], @acc[2], @acc[1], #31
	extr	@acc[2], @acc[3], @acc[2], #31
	asr	@t[4], @acc[4], #63	// result's sign as mask
	extr	@acc[3], @acc[4], @acc[3], #31

	eor	@acc[0], @acc[0], @t[4]	// ensure the result is positive
	eor	@acc[1], @acc[1], @t[4]
	adds	@acc[0], @acc[0], @t[4], lsr#63
	eor	@acc[2], @acc[2], @t[4]
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], @t[4]
	adcs	@acc[2], @acc[2], xzr
	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
	adc	@acc[3], @acc[3], xzr
	stp	@acc[2], @acc[3], [$out_ptr,#8*2]

	eor	$f0, $f0, @t[4]		// adjust |f/g| accordingly
	eor	$g0, $g0, @t[4]
	sub	$f0, $f0, @t[4]
	sub	$g0, $g0, @t[4]

	ret
.size	__smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
___

# Approximation and inner-loop subroutines, emitted from a single heredoc.
# The enclosing bare block scopes the register aliases declared below.
{
# @a and @b name the halves of @acc that hold the limbs of |a| and |b|.
my @a = @acc[0..3];
my @b = @acc[4..7];
# During __inner_loop_31_256 the registers of $g0/$g1 double as $fg0/$fg1,
# each carrying an f/g pair packed as two biased 32-bit halves (f in the low
# half, g in the high half); $bias reuses @t[4].
my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]);

# __ab_approximation_31_256: loads |a| and |b| from $in_ptr (or is entered
#   at .Lab_approximation_31_256_loaded with them already in registers),
#   selects the top-most limbs that are not both zero, aligns them to the
#   left and inserts the low 31 bits of the least significant limbs, leaving
#   the approximations in $a_lo/$b_lo. It then branches to
#   __inner_loop_31_256, so the "ret" after the branch is never reached.
# __inner_loop_31_256: runs 31 steps on the approximations with the factors
#   kept packed in $fg0/$fg1, then unpacks them into $f0, $g0, $f1, $g1 and
#   removes the bias.
# __inner_loop_62_256: the same step with the four factors in separate
#   registers; the step count is supplied by the caller in $cnt.
$code.=<<___;
.type	__ab_approximation_31_256, %function
.align	4
__ab_approximation_31_256:
	ldp	@a[2], @a[3], [$in_ptr,#8*2]
	ldp	@b[2], @b[3], [$in_ptr,#8*6]
	ldp	@a[0], @a[1], [$in_ptr,#8*0]
	ldp	@b[0], @b[1], [$in_ptr,#8*4]

.Lab_approximation_31_256_loaded:
	orr	@t[0], @a[3], @b[3]	// check top-most limbs, ...
	cmp	@t[0], #0
	csel	@a[3], @a[3], @a[2], ne
	csel	@b[3], @b[3], @b[2], ne
	csel	@a[2], @a[2], @a[1], ne
	orr	@t[0], @a[3], @b[3]	// and ones before top-most, ...
	csel	@b[2], @b[2], @b[1], ne

	cmp	@t[0], #0
	csel	@a[3], @a[3], @a[2], ne
	csel	@b[3], @b[3], @b[2], ne
	csel	@a[2], @a[2], @a[0], ne
	orr	@t[0], @a[3], @b[3]	// and one more, ...
	csel	@b[2], @b[2], @b[0], ne

	clz	@t[0], @t[0]
	cmp	@t[0], #64
	csel	@t[0], @t[0], xzr, ne
	csel	@a[3], @a[3], @a[2], ne
	csel	@b[3], @b[3], @b[2], ne
	neg	@t[1], @t[0]

	lslv	@a[3], @a[3], @t[0]	// align high limbs to the left
	lslv	@b[3], @b[3], @t[0]
	lsrv	@a[2], @a[2], @t[1]
	lsrv	@b[2], @b[2], @t[1]
	and	@a[2], @a[2], @t[1], asr#6
	and	@b[2], @b[2], @t[1], asr#6
	orr	$a_lo, @a[3], @a[2]
	orr	$b_lo, @b[3], @b[2]

	bfxil	$a_lo, @a[0], #0, #31
	bfxil	$b_lo, @b[0], #0, #31

	b	__inner_loop_31_256
	ret
.size	__ab_approximation_31_256,.-__ab_approximation_31_256

.type	__inner_loop_31_256, %function
.align	4
__inner_loop_31_256:
	mov	$cnt, #31
	mov	$fg0, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
	mov	$fg1, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
	mov	$bias,#0x7FFFFFFF7FFFFFFF

.Loop_31_256:
	sbfx	@t[3], $a_lo, #0, #1	// if |a_| is odd, then we'll be subtracting
	sub	$cnt, $cnt, #1
	and	@t[0], $b_lo, @t[3]
	sub	@t[1], $b_lo, $a_lo	// |b_|-|a_|
	subs	@t[2], $a_lo, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	mov	@t[0], $fg1
	csel	$b_lo, $b_lo, $a_lo, hs	// |b_| = |a_|
	csel	$a_lo, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	$fg1, $fg1, $fg0,    hs	// exchange |fg0| and |fg1|
	csel	$fg0, $fg0, @t[0],   hs
	lsr	$a_lo, $a_lo, #1
	and	@t[0], $fg1, @t[3]
	and	@t[1], $bias, @t[3]
	sub	$fg0, $fg0, @t[0]	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	add	$fg1, $fg1, $fg1	// |f1|<<=1
	add	$fg0, $fg0, @t[1]
	sub	$fg1, $fg1, $bias
	cbnz	$cnt, .Loop_31_256

	mov	$bias, #0x7FFFFFFF
	ubfx	$f0, $fg0, #0, #32
	ubfx	$g0, $fg0, #32, #32
	ubfx	$f1, $fg1, #0, #32
	ubfx	$g1, $fg1, #32, #32
	sub	$f0, $f0, $bias		// remove bias
	sub	$g0, $g0, $bias
	sub	$f1, $f1, $bias
	sub	$g1, $g1, $bias

	ret
.size	__inner_loop_31_256,.-__inner_loop_31_256

.type	__inner_loop_62_256, %function
.align	4
__inner_loop_62_256:
	mov	$f0, #1		// |f0|=1
	mov	$g0, #0		// |g0|=0
	mov	$f1, #0		// |f1|=0
	mov	$g1, #1		// |g1|=1

.Loop_62_256:
	sbfx	@t[3], $a_lo, #0, #1	// if |a_| is odd, then we'll be subtracting
	sub	$cnt, $cnt, #1
	and	@t[0], $b_lo, @t[3]
	sub	@t[1], $b_lo, $a_lo	// |b_|-|a_|
	subs	@t[2], $a_lo, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	mov	@t[0], $f0
	csel	$b_lo, $b_lo, $a_lo, hs	// |b_| = |a_|
	csel	$a_lo, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	mov	@t[1], $g0
	csel	$f0, $f0, $f1,       hs	// exchange |f0| and |f1|
	csel	$f1, $f1, @t[0],     hs
	csel	$g0, $g0, $g1,       hs	// exchange |g0| and |g1|
	csel	$g1, $g1, @t[1],     hs
	lsr	$a_lo, $a_lo, #1
	and	@t[0], $f1, @t[3]
	and	@t[1], $g1, @t[3]
	add	$f1, $f1, $f1		// |f1|<<=1
	add	$g1, $g1, $g1		// |g1|<<=1
	sub	$f0, $f0, @t[0]		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	sub	$g0, $g0, @t[1]		// |g0|-=|g1| (or |g0-=0| ...)
	cbnz	$cnt, .Loop_62_256

	ret
.size	__inner_loop_62_256,.-__inner_loop_62_256
___
}

# Write the accumulated assembly text to STDOUT one line at a time.
# The substitution rewrites the second and third operands of any "smaddl"
# from x-registers to w-registers; none of the templates in this script emit
# that instruction, so it leaves every line unchanged here.
foreach(split("\n",$code)) {
    s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
    print $_,"\n";
}
# Buffered write errors, and a non-zero exit of the process at the other end
# of a piped STDOUT, only surface at close time; fail loudly instead of
# leaving a truncated or missing output behind. For a pipe whose only problem
# is the child's exit status, $! is 0 and $? holds the status.
close STDOUT
    or die $! ? "error closing STDOUT: $!"
              : "output pipe exited with status $?";