// ftu/blst/elf/ct_inverse_mod_256-armv8.S

.text
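// ct_inverse_mod_256 appears to implement a constant-time, "divsteps"-style
// extended binary GCD for 256-bit inputs. Register usage, as read off the
// code below (the exact caller-side contract is not restated here):
//
//	x0 - pointer to the 512-bit result
//	x1 - pointer to the 256-bit value to invert (copied to |a|)
//	x2 - pointer to the 256-bit modulus (copied to |b|)
//	x3 - pointer to the modulus used for the final top-half adjustment
//
// Each unrolled pass forms 64-bit approximations of |a| and |b|, runs 31
// constant-time inner steps to obtain small signed factors f0,g0,f1,g1,
// then applies them to the full-width |a|,|b| (shifted right by 31 bits)
// and to the cofactors |u|,|v|.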
.globl ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
.inst 0xd503233f // paciasp (raw opcode, presumably for pre-PAC assemblers)
stp x29, x30, [sp,#-80]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
sub sp, sp, #1040
ldp x4, x5, [x1,#8*0]
ldp x6, x7, [x1,#8*2]
add x1, sp, #16+511 // find closest 512-byte-aligned spot
and x1, x1, #-512 // in the frame...
str x0, [sp]
ldp x8, x9, [x2,#8*0]
ldp x10, x11, [x2,#8*2]
stp x4, x5, [x1,#8*0] // copy input to |a|
stp x6, x7, [x1,#8*2]
stp x8, x9, [x1,#8*4] // copy modulus to |b|
stp x10, x11, [x1,#8*6]
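// The 1040-byte frame holds two 256-byte banks for the working set:
// |a| (4 limbs at 0), |b| (4 at 8*4), |u| (5 at 8*8) and |v| (up to 8 at
// 8*13). The 512-byte alignment is what lets `eor ..., #256` flip bit 8
// of the pointer and thereby toggle between the source and destination
// banks on every pass.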
////////////////////////////////////////// first iteration
bl .Lab_approximation_31_256_loaded
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
str x12, [x0,#8*8] // initialize |u| with |f0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to dst |b|
bl __smul_256_n_shift_by_31
str x12, [x0,#8*9] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
ldr x8, [x1,#8*8] // |u|
ldr x9, [x1,#8*13] // |v|
madd x4, x16, x8, xzr // |u|*|f0|
madd x4, x17, x9, x4 // |v|*|g0|
str x4, [x0,#8*4]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*5]
stp x5, x5, [x0,#8*7]
madd x4, x12, x8, xzr // |u|*|f1|
madd x4, x13, x9, x4 // |v|*|g1|
str x4, [x0,#8*9]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*10]
stp x5, x5, [x0,#8*12]
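// After the first two passes the cofactors |u| and |v| still fit in a
// single signed limb, so they are written directly above with plain sign
// extension; the passes below switch to multi-limb updates via
// __smul_256x63.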
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
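// From this point |v| apparently no longer fits in five limbs, so each of
// the following passes finishes its |v| update with __smul_512x63_tail to
// maintain the full 512-bit value.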
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
////////////////////////////////////////// two[!] last iterations
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #47 // 31 + 512 % 31
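// The 15 unrolled passes above consumed 15*31 = 465 inner steps; the
// remaining 47 (for what amounts to 512 steps in total) are performed
// here on the now-exact 64-bit values of |a| and |b|.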
//bl __ab_approximation_62_256 // |a| and |b| are exact,
ldr x7, [x1,#8*0] // just load
ldr x11, [x1,#8*4]
bl __inner_loop_62_256
mov x16, x14
mov x17, x15
ldr x0, [sp] // original out_ptr
bl __smul_256x63
bl __smul_512x63_tail
ldr x30, [x29,#8] // restore return address clobbered by the bl's above
smulh x20, x7, x17 // figure out top-most limb
ldp x8, x9, [x3,#8*0]
adc x23, x23, x25
ldp x10, x11, [x3,#8*2]
add x20, x20, x23 // x20 is 1, 0 or -1
asr x19, x20, #63 // sign as mask
and x23, x8, x19 // add mod<<256 conditionally
and x24, x9, x19
adds x4, x4, x23
and x25, x10, x19
adcs x5, x5, x24
and x26, x11, x19
adcs x6, x6, x25
adcs x7, x22, x26
adc x20, x20, xzr // x20 is 1, 0 or -1
neg x19, x20
orr x20, x20, x19 // excess bit or sign as mask
asr x19, x19, #63 // excess bit as mask
and x8, x8, x20 // mask |mod|
and x9, x9, x20
and x10, x10, x20
and x11, x11, x20
eor x8, x8, x19 // conditionally negate |mod|
eor x9, x9, x19
adds x8, x8, x19, lsr#63
eor x10, x10, x19
adcs x9, x9, xzr
eor x11, x11, x19
adcs x10, x10, xzr
adc x11, x11, xzr
adds x4, x4, x8 // final adjustment for |mod|<<256
adcs x5, x5, x9
adcs x6, x6, x10
stp x4, x5, [x0,#8*4]
adc x7, x7, x11
stp x6, x7, [x0,#8*6]
add sp, sp, #1040
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldr x29, [sp],#80
.inst 0xd50323bf // autiasp
ret
.size ct_inverse_mod_256,.-ct_inverse_mod_256
////////////////////////////////////////////////////////////////////////
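// __smul_256x63 computes the low 256 bits (plus one extra limb left in
// registers for the caller to fold in) of |u|*|f_| + |v|*|g_|, where
// x16/x17 hold the signed ~63-bit factors and x1 points at the bank
// holding |u| (at 8*8) and |v| (at 8*13). Negative factors are handled by
// conditionally negating both the factor and the operand.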
.type __smul_256x63, %function
.align 5
__smul_256x63:
ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|)
asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x6, x7, [x1,#8*2+64]
eor x16, x16, x14 // conditionally negate |f_| (or |g_|)
ldr x22, [x1,#8*4+64]
eor x4, x4, x14 // conditionally negate |u| (or |v|)
sub x16, x16, x14
eor x5, x5, x14
adds x4, x4, x14, lsr#63
eor x6, x6, x14
adcs x5, x5, xzr
eor x7, x7, x14
adcs x6, x6, xzr
eor x22, x22, x14
umulh x19, x4, x16
adcs x7, x7, xzr
umulh x20, x5, x16
adcs x22, x22, xzr
umulh x21, x6, x16
mul x4, x4, x16
cmp x16, #0
mul x5, x5, x16
csel x22, x22, xzr, ne
mul x6, x6, x16
adds x5, x5, x19
mul x24, x7, x16
adcs x6, x6, x20
adcs x24, x24, x21
adc x26, xzr, xzr
ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|)
asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x10, x11, [x1,#8*2+104]
eor x17, x17, x14 // conditionally negate |f_| (or |g_|)
ldr x23, [x1,#8*4+104]
eor x8, x8, x14 // conditionally negate |u| (or |v|)
sub x17, x17, x14
eor x9, x9, x14
adds x8, x8, x14, lsr#63
eor x10, x10, x14
adcs x9, x9, xzr
eor x11, x11, x14
adcs x10, x10, xzr
eor x23, x23, x14
umulh x19, x8, x17
adcs x11, x11, xzr
umulh x20, x9, x17
adcs x23, x23, xzr
umulh x21, x10, x17
adc x15, xzr, xzr // used in __smul_512x63_tail
mul x8, x8, x17
cmp x17, #0
mul x9, x9, x17
csel x23, x23, xzr, ne
mul x10, x10, x17
adds x9, x9, x19
mul x25, x11, x17
adcs x10, x10, x20
adcs x25, x25, x21
adc x26, x26, xzr
adds x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
stp x4, x5, [x0,#8*0]
adcs x24, x24, x25
stp x6, x24, [x0,#8*2]
ret
.size __smul_256x63,.-__smul_256x63
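// __smul_512x63_tail extends the preceding __smul_256x63 call: it loads
// the rest of |v| (limbs at 8*18..8*20), finishes the |v|*|g1| and
// |u|*|f1| chains, and stores the upper four limbs of the accumulated
// product at [x0,#8*4..8*7].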
.type __smul_512x63_tail, %function
.align 5
__smul_512x63_tail:
umulh x24, x7, x16
ldp x5, x6, [x1,#8*18] // load rest of |v|
adc x26, x26, xzr
ldr x7, [x1,#8*20]
and x22, x22, x16
umulh x11, x11, x17 // resume |v|*|g1| chain
sub x24, x24, x22 // tie up |u|*|f1| chain
asr x25, x24, #63
eor x5, x5, x14 // conditionally negate rest of |v|
eor x6, x6, x14
adds x5, x5, x15
eor x7, x7, x14
adcs x6, x6, xzr
umulh x19, x23, x17
adc x7, x7, xzr
umulh x20, x5, x17
add x11, x11, x26
umulh x21, x6, x17
mul x4, x23, x17
mul x5, x5, x17
adds x4, x4, x11
mul x6, x6, x17
adcs x5, x5, x19
mul x22, x7, x17
adcs x6, x6, x20
adcs x22, x22, x21
adc x23, xzr, xzr // used in the final step
adds x4, x4, x24
adcs x5, x5, x25
adcs x6, x6, x25
stp x4, x5, [x0,#8*4]
adcs x22, x22, x25 // carry is used in the final step
stp x6, x22, [x0,#8*6]
ret
.size __smul_512x63_tail,.-__smul_512x63_tail
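// __smul_256_n_shift_by_31 computes (|a|*|f0| + |b|*|g0|) >> 31 over the
// 256-bit |a| at [x1] and |b| at [x1,#8*4], forces the result positive,
// and flips the signs of f0/g0 (x12/x13) accordingly, which is why the
// callers speak of "corrected" factors.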
.type __smul_256_n_shift_by_31, %function
.align 5
__smul_256_n_shift_by_31:
ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|)
asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x6, x7, [x1,#8*2+0]
eor x25, x12, x24 // conditionally negate |f0| (or |g0|)
eor x4, x4, x24 // conditionally negate |a| (or |b|)
sub x25, x25, x24
eor x5, x5, x24
adds x4, x4, x24, lsr#63
eor x6, x6, x24
adcs x5, x5, xzr
eor x7, x7, x24
umulh x19, x4, x25
adcs x6, x6, xzr
umulh x20, x5, x25
adc x7, x7, xzr
umulh x21, x6, x25
and x24, x24, x25
umulh x22, x7, x25
neg x24, x24
mul x4, x4, x25
mul x5, x5, x25
mul x6, x6, x25
adds x5, x5, x19
mul x7, x7, x25
adcs x6, x6, x20
adcs x7, x7, x21
adc x22, x22, x24
ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|)
asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x10, x11, [x1,#8*2+32]
eor x25, x13, x24 // conditionally negate |f0| (or |g0|)
eor x8, x8, x24 // conditionally negate |a| (or |b|)
sub x25, x25, x24
eor x9, x9, x24
adds x8, x8, x24, lsr#63
eor x10, x10, x24
adcs x9, x9, xzr
eor x11, x11, x24
umulh x19, x8, x25
adcs x10, x10, xzr
umulh x20, x9, x25
adc x11, x11, xzr
umulh x21, x10, x25
and x24, x24, x25
umulh x23, x11, x25
neg x24, x24
mul x8, x8, x25
mul x9, x9, x25
mul x10, x10, x25
adds x9, x9, x19
mul x11, x11, x25
adcs x10, x10, x20
adcs x11, x11, x21
adc x23, x23, x24
adds x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
adcs x7, x7, x11
adc x8, x22, x23
extr x4, x5, x4, #31
extr x5, x6, x5, #31
extr x6, x7, x6, #31
asr x23, x8, #63 // result's sign as mask
extr x7, x8, x7, #31
eor x4, x4, x23 // ensure the result is positive
eor x5, x5, x23
adds x4, x4, x23, lsr#63
eor x6, x6, x23
adcs x5, x5, xzr
eor x7, x7, x23
adcs x6, x6, xzr
stp x4, x5, [x0,#8*0]
adc x7, x7, xzr
stp x6, x7, [x0,#8*2]
eor x12, x12, x23 // adjust |f/g| accordingly
eor x13, x13, x23
sub x12, x12, x23
sub x13, x13, x23
ret
.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
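// __ab_approximation_31_256 builds 64-bit approximations of |a| and |b|:
// the top 33 bits come from the highest non-zero limbs (the same limb
// index is chosen for both values), the low 31 bits are taken verbatim
// from the least significant limbs, and control then branches into
// __inner_loop_31_256.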
.type __ab_approximation_31_256, %function
.align 4
__ab_approximation_31_256:
ldp x6, x7, [x1,#8*2]
ldp x10, x11, [x1,#8*6]
ldp x4, x5, [x1,#8*0]
ldp x8, x9, [x1,#8*4]
.Lab_approximation_31_256_loaded:
orr x19, x7, x11 // check top-most limbs, ...
cmp x19, #0
csel x7, x7, x6, ne
csel x11, x11, x10, ne
csel x6, x6, x5, ne
orr x19, x7, x11 // and ones before top-most, ...
csel x10, x10, x9, ne
cmp x19, #0
csel x7, x7, x6, ne
csel x11, x11, x10, ne
csel x6, x6, x4, ne
orr x19, x7, x11 // and one more, ...
csel x10, x10, x8, ne
clz x19, x19
cmp x19, #64
csel x19, x19, xzr, ne
csel x7, x7, x6, ne
csel x11, x11, x10, ne
neg x20, x19
lslv x7, x7, x19 // align high limbs to the left
lslv x11, x11, x19
lsrv x6, x6, x20
lsrv x10, x10, x20
and x6, x6, x20, asr#6
and x10, x10, x20, asr#6
orr x7, x7, x6
orr x11, x11, x10
bfxil x7, x4, #0, #31
bfxil x11, x8, #0, #31
b __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
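// __inner_loop_31_256 runs 31 constant-time steps on the approximations.
// The transition factors are kept packed two per register, f in the low
// half and g in the high half of x13 (f0,g0) and x15 (f1,g1), each half
// biased by 0x7FFFFFFF so it stays non-negative; the bias is compensated
// after every update and stripped when the halves are unpacked into
// x12..x15 at the end.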
.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
mov x2, #31
mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov x23,#0x7FFFFFFF7FFFFFFF
.Loop_31_256:
sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
and x19, x11, x22
sub x20, x11, x7 // |b_|-|a_|
subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x19, x15
csel x11, x11, x7, hs // |b_| = |a_|
csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x15, x15, x13, hs // exchange |fg0| and |fg1|
csel x13, x13, x19, hs
lsr x7, x7, #1
and x19, x15, x22
and x20, x23, x22
sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add x15, x15, x15 // |f1|<<=1
add x13, x13, x20
sub x15, x15, x23
cbnz x2, .Loop_31_256
mov x23, #0x7FFFFFFF
ubfx x12, x13, #0, #32
ubfx x13, x13, #32, #32
ubfx x14, x15, #0, #32
ubfx x15, x15, #32, #32
sub x12, x12, x23 // remove bias
sub x13, x13, x23
sub x14, x14, x23
sub x15, x15, x23
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
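// __inner_loop_62_256 is the unpacked variant used for the final pass:
// the iteration count is supplied in x2 (47 here) and f0,g0,f1,g1 live in
// x12..x15 as plain 64-bit values. Each step amounts to the following
// (a branchless sketch, not part of the source):
//
//	if (a & 1) {
//		if (a < b) { swap(a, b); swap(f0, f1); swap(g0, g1); }
//		a -= b; f0 -= f1; g0 -= g1;
//	}
//	a >>= 1; f1 <<= 1; g1 <<= 1;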
.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
mov x12, #1 // |f0|=1
mov x13, #0 // |g0|=0
mov x14, #0 // |f1|=0
mov x15, #1 // |g1|=1
.Loop_62_256:
sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
and x19, x11, x22
sub x20, x11, x7 // |b_|-|a_|
subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x19, x12
csel x11, x11, x7, hs // |b_| = |a_|
csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
mov x20, x13
csel x12, x12, x14, hs // exchange |f0| and |f1|
csel x14, x14, x19, hs
csel x13, x13, x15, hs // exchange |g0| and |g1|
csel x15, x15, x20, hs
lsr x7, x7, #1
and x19, x14, x22
and x20, x15, x22
add x14, x14, x14 // |f1|<<=1
add x15, x15, x15 // |g1|<<=1
sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256