#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 62
    w = 64
    mask = (1 << w) - 1

    for i in range(0, 766 // k):
        # __ab_approximation_62
        n = max(a.bit_length(), b.bit_length())
        if n < 128:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_62
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smul_383_n_shift_by_62
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smul_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1

    if v < 0:
        v += mod << (768 - mod.bit_length())    # left aligned

    return v & (2**768 - 1) # to be reduced % mod
___
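#
# A minimal sanity check for the model above (a sketch, not part of the
# generator; assumes Python 3.8+ for pow(inp, -1, mod), and the 2^766
# scaling is derived from the divstep invariants u*inp == a*2^t and
# v*inp == b*2^t (mod mod) -- an observation, not a claim from the paper):
#
#   mod = (1 << 383) - 1        # an arbitrary odd 383-bit modulus
#   v = ct_inverse_mod_383(5, mod)
#   assert v % mod == pow(5, -1, mod) * pow(2, 766, mod) % mod
#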
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(3..14));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21));
my $cnt = $n_ptr;
my @t = map("x$_",(22..28,2));
my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11];
$frame = 16+2*512;
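# 16 bytes at the bottom of the frame hold the spilled pointer pair, and the
# 2*512 bytes guarantee that a 512-byte-aligned, 512-byte region can be
# carved out for two 256-byte |a|b|u|v| snapshots, which the code below
# ping-pongs between with "eor #256".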
$code.=<<___;
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #$frame
ldp @t[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
ldp @acc[4], @acc[5], [$in_ptr,#8*4]
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
stp $out_ptr, $nx_ptr, [sp]
ldp @acc[6], @acc[7], [$n_ptr,#8*0]
ldp @acc[8], @acc[9], [$n_ptr,#8*2]
ldp @acc[10], @acc[11], [$n_ptr,#8*4]
stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4]
stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b|
stp @acc[8], @acc[9], [$in_ptr,#8*8]
stp @acc[10], @acc[11], [$in_ptr,#8*10]
////////////////////////////////////////// first iteration
mov $cnt, #62
bl .Lab_approximation_62_loaded
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str $f0,[$out_ptr,#8*12] // initialize |u| with |f0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str $f0, [$out_ptr,#8*12] // initialize |v| with |f1|
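// (|u| and |v| start out as 1 and 0, so the first iteration's |u| and |v|
// are simply the single-limb |f0| and |f1|; no multiplication is needed yet)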
////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
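// after the first iteration |u| and |v| are single-limb, so the updated
// |u| = |u|*|f0| + |v|*|g0| (and likewise |v|) fits in two limbs; compute
// it inline and sign-extend it over the remaining limbs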
ldr @acc[4], [$in_ptr,#8*12] // |u|
ldr @acc[5], [$in_ptr,#8*18] // |v|
mul @acc[0], $f_, @acc[4] // |u|*|f0|
smulh @acc[1], $f_, @acc[4]
mul @acc[2], $g_, @acc[5] // |v|*|g0|
smulh @acc[3], $g_, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*8]
stp @acc[2], @acc[2], [$out_ptr,#8*10]
mul @acc[0], $f0, @acc[4] // |u|*|f1|
smulh @acc[1], $f0, @acc[4]
mul @acc[2], $g0, @acc[5] // |v|*|g1|
smulh @acc[3], $g0, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*12]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*14]
stp @acc[2], @acc[2], [$out_ptr,#8*16]
___
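# Iterations 3 through 11 ($i = 2..10) follow the same pattern while |u| and
# |v| grow by up to 62 bits per round: at $i == 5 the upper half of the
# 767-bit |v| slot is seeded with the sign word, and past that point
# __smul_767x63_tail completes the upper half of the |v| product.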
for($i=2; $i<11; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add $out_ptr, $out_ptr, #8*6 // pointer to destination |u|
bl __smul_383x63
mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |v|
bl __smul_383x63
___
$code.=<<___ if ($i>5);
bl __smul_767x63_tail
___
$code.=<<___ if ($i==5);
asr @t[5], @t[5], #63 // sign extension
stp @t[5], @t[5], [$out_ptr,#8*6]
stp @t[5], @t[5], [$out_ptr,#8*8]
stp @t[5], @t[5], [$out_ptr,#8*10]
___
}
$code.=<<___;
////////////////////////////////////////// iteration before last
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load
ldp $b_lo, $b_hi, [$in_ptr,#8*6]
bl __inner_loop_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
str $a_lo, [$out_ptr,#8*0]
str $b_lo, [$out_ptr,#8*6]
mov $f_, $f0 // exact |f0|
mov $g_, $g0 // exact |g0|
mov $f0, $f1
mov $g0, $g1
add $out_ptr, $out_ptr, #8*12 // pointer to dst |u|
bl __smul_383x63
mov $f_, $f0 // exact |f1|
mov $g_, $g0 // exact |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
eor $a_hi, $a_hi, $a_hi
ldr $b_lo, [$in_ptr,#8*6]
eor $b_hi, $b_hi, $b_hi
bl __inner_loop_62
mov $f_, $f1
mov $g_, $g1
ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
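// mirror the reference model's "if v < 0: v += mod << ..." step by adding
// mod<<384 to the upper limbs under |v|'s sign mask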
asr @t[0], @acc[5], #63 // sign as mask
ldp @acc[6], @acc[7], [$f0,#8*0]
ldp @acc[8], @acc[9], [$f0,#8*2]
ldp @acc[10], @acc[11], [$f0,#8*4]
and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally
and @acc[7], @acc[7], @t[0]
adds @acc[0], @acc[0], @acc[6]
and @acc[8], @acc[8], @t[0]
adcs @acc[1], @acc[1], @acc[7]
and @acc[9], @acc[9], @t[0]
adcs @acc[2], @acc[2], @acc[8]
and @acc[10], @acc[10], @t[0]
adcs @acc[3], @acc[3], @acc[9]
and @acc[11], @acc[11], @t[0]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @acc[11]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
autiasp
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
.type __smul_383x63, %function
.align 5
__smul_383x63:
___
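# Two unrolled passes: $j == 0 multiplies |u| by |f_|, $j == 1 multiplies
# |v| by |g_| (note the aliasing of $f_ and @acc below); the 383x63-bit
# products are summed after the loop, with the carry words parked for
# __smul_767x63_tail.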
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*12+8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $f_
eor @acc[5], @acc[5], $f1
umulh @t[1], @acc[1], $f_
adcs @acc[4], @acc[4], xzr
umulh @t[2], @acc[2], $f_
adcs @acc[5], @acc[5], xzr
umulh @t[3], @acc[3], $f_
___
$code.=<<___ if ($j);
adc $g1, xzr, xzr // used in __smul_767x63_tail
___
$code.=<<___;
umulh @t[4], @acc[4], $f_
mul @acc[0], @acc[0], $f_
mul @acc[1], @acc[1], $f_
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], $f_
adcs @acc[3], @acc[3], @t[2]
mul @t[5+$j],@acc[5], $f_
adcs @acc[4], @acc[4], @t[3]
adcs @t[5+$j],@t[5+$j],@t[4]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adcs @t[5], @t[5], @t[6]
stp @acc[4], @t[5], [$out_ptr,#8*4]
adc @t[6], @t[7], xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh @t[5], @acc[5], $f_
ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v|
umulh @acc[11],@acc[11], $g_
ldp @acc[2], @acc[3], [$in_ptr,#8*26]
ldp @acc[4], @acc[5], [$in_ptr,#8*28]
eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v|
eor @acc[1], @acc[1], $f1
eor @acc[2], @acc[2], $f1
adds @acc[0], @acc[0], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[5], @acc[5], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $g_
adcs @acc[4], @acc[4], xzr
umulh @t[1], @acc[1], $g_
adc @acc[5], @acc[5], xzr
umulh @t[2], @acc[2], $g_
add @acc[11], @acc[11], @t[6]
umulh @t[3], @acc[3], $g_
asr @t[6], @t[5], #63
umulh @t[4], @acc[4], $g_
mul @acc[0], @acc[0], $g_
mul @acc[1], @acc[1], $g_
mul @acc[2], @acc[2], $g_
adds @acc[0], @acc[0], @acc[11]
mul @acc[3], @acc[3], $g_
adcs @acc[1], @acc[1], @t[0]
mul @acc[4], @acc[4], $g_
adcs @acc[2], @acc[2], @t[1]
mul @acc[5], @acc[5], $g_
adcs @acc[3], @acc[3], @t[2]
adcs @acc[4], @acc[4], @t[3]
adc @acc[5], @acc[5], @t[4]
adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @acc[3], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @t[6]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @t[6]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
___
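# Same two-pass structure: |a|*|f0| and |b|*|g0| are computed with
# conditionally negated multiplicands, summed, shifted right by 62 bits,
# and negated again if the sum came out negative; $f0 and $g0 are
# sign-adjusted to match on the way out.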
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|)
sub @t[7], @t[7], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
umulh @t[0], @acc[0], @t[7]
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], @t[7]
eor @acc[5], @acc[5], @t[6]
umulh @t[2], @acc[2], @t[7]
adcs @acc[4], @acc[4], xzr
umulh @t[3], @acc[3], @t[7]
adc @acc[5], @acc[5], xzr
umulh @t[4], @acc[4], @t[7]
smulh @t[5+$j], @acc[5], @t[7]
mul @acc[0], @acc[0], @t[7]
mul @acc[1], @acc[1], @t[7]
mul @acc[2], @acc[2], @t[7]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[7]
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], @t[7]
adcs @acc[3], @acc[3], @t[2]
mul @acc[5], @acc[5], @t[7]
adcs @acc[4], @acc[4], @t[3]
adcs @acc[5], @acc[5], @t[4]
adc @t[5+$j], @t[5+$j], xzr
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
adcs @acc[4], @acc[4], @acc[10]
adcs @acc[5], @acc[5], @acc[11]
adc @acc[6], @t[5], @t[6]
extr @acc[0], @acc[1], @acc[0], #62
extr @acc[1], @acc[2], @acc[1], #62
extr @acc[2], @acc[3], @acc[2], #62
asr @t[6], @acc[6], #63
extr @acc[3], @acc[4], @acc[3], #62
extr @acc[4], @acc[5], @acc[4], #62
extr @acc[5], @acc[6], @acc[5], #62
eor @acc[0], @acc[0], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
adcs @acc[3], @acc[3], xzr
eor @acc[5], @acc[5], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adc @acc[5], @acc[5], xzr
stp @acc[4], @acc[5], [$out_ptr,#8*4]
eor $f0, $f0, @t[6]
eor $g0, $g0, @t[6]
sub $f0, $f0, @t[6]
sub $g0, $g0, @t[6]
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
___
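# Compose 128-bit approximations of |a| and |b|: the bottom 64 bits as-is
# plus the top-most 64 bits starting at the highest bit set in either
# operand, mirroring __ab_approximation_62 in the reference model above.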
{
my @a = @acc[0..5];
my @b = @acc[6..11];
$code.=<<___;
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp @a[4], @a[5], [$in_ptr,#8*4]
ldp @b[4], @b[5], [$in_ptr,#8*10]
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*8]
.Lab_approximation_62_loaded:
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[3], ne
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
csel @b[4], @b[4], @b[3], ne
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*6]
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[2], ne
orr @t[0], @a[5], @b[5] // ... and ones before that ...
csel @b[4], @b[4], @b[2], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[1], ne
orr @t[0], @a[5], @b[5]
csel @b[4], @b[4], @b[1], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
neg @t[1], @t[0]
lslv @a[5], @a[5], @t[0] // align high limbs to the left
lslv @b[5], @b[5], @t[0]
lsrv @a[4], @a[4], @t[1]
lsrv @b[4], @b[4], @t[1]
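// @t[1] is -@t[0]; "@t[1],asr#6" is all-ones for @t[0] in 1..63 and zero
// for @t[0]==0, discarding the lsrv result when no shift was performed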
and @a[4], @a[4], @t[1], asr#6
and @b[4], @b[4], @t[1], asr#6
orr @a[5], @a[5], @a[4]
orr @b[5], @b[5], @b[4]
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
___
}
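# Branchless divstep: each round conditionally swaps (|a_|,f0,g0) with
# (|b_|,f1,g1) when |a_| is odd and smaller than |b_|, subtracts, halves
# |a_| and doubles f1 and g1, all with csel and masking so that there are
# no data-dependent branches.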
$code.=<<___;
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1
.Loop_62:
sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
subs @t[2], $b_lo, $a_lo // |b_|-|a_|
and @t[0], $b_lo, @t[6]
sbc @t[3], $b_hi, $a_hi
and @t[1], $b_hi, @t[6]
subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
sbcs @t[5], $a_hi, @t[1]
mov @t[1], $g0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $b_hi, $b_hi, $a_hi, hs
csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $a_hi, @t[5], @t[3], hs
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
extr $a_lo, $a_hi, $a_lo, #1
lsr $a_hi, $a_hi, #1
and @t[0], $f1, @t[6]
and @t[1], $g1, @t[6]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
___
print $code;
close STDOUT;