.text

.globl	ct_inverse_mod_383
.type	ct_inverse_mod_383, %function
.align	5
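// Constant-time inversion modulo a 383-bit modulus, in the divsteps
// style (see the x86_64 commentary referenced further down): 766 = 2*383
// significant bits are consumed as twelve 62-bit batches plus a final
// 22-bit one (766 % 62 = 22). Each batch derives signed transition
// factors |f0|g0|f1|g1| from 128-bit approximations of |a| and |b| and
// then applies them to the full-width vectors. As used below (the exact
// C prototype lives in the caller): x0 - output pointer, x1 - value to
// invert, x2 - modulus, x3 - pointer saved alongside x0 and reloaded at
// the end as "n_ptr" for the final conditional addition.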
ct_inverse_mod_383:
	.inst	0xd503233f		// paciasp
	stp	x29, x30, [sp,#-128]!
	add	x29, sp, #0
	stp	x19, x20, [sp,#16]
	stp	x21, x22, [sp,#32]
	stp	x23, x24, [sp,#48]
	stp	x25, x26, [sp,#64]
	stp	x27, x28, [sp,#80]
	sub	sp, sp, #1040

	ldp	x22, x4, [x1,#8*0]
	ldp	x5, x6, [x1,#8*2]
	ldp	x7, x8, [x1,#8*4]

	add	x1, sp, #16+511		// find closest 512-byte-aligned spot
	and	x1, x1, #-512		// in the frame...
	stp	x0, x3, [sp]

	ldp	x9, x10, [x2,#8*0]
	ldp	x11, x12, [x2,#8*2]
	ldp	x13, x14, [x2,#8*4]

	stp	x22, x4, [x1,#8*0]	// copy input to |a|
	stp	x5, x6, [x1,#8*2]
	stp	x7, x8, [x1,#8*4]
	stp	x9, x10, [x1,#8*6]	// copy modulus to |b|
	stp	x11, x12, [x1,#8*8]
	stp	x13, x14, [x1,#8*10]

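	// The scratch frame holds two 256-byte halves, each laid out as
	// |a| at +8*0, |b| at +8*6, |u| at +8*12 and |v| at +8*18 (the
	// offsets used throughout); |v| eventually grows to 12 limbs,
	// which is what __smul_767x63_tail completes. One half is the
	// source, the other the destination, and the roles are swapped
	// each iteration with `eor x1, x1, #256` ("flip-flop" below).
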
	////////////////////////////////////////// first iteration
	mov	x2, #62
	bl	.Lab_approximation_62_loaded

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	str	x15, [x0,#8*12]		// initialize |u| with |f0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to dst |b|
	bl	__smul_383_n_shift_by_62
	str	x15, [x0,#8*12]		// initialize |v| with |f1|

	////////////////////////////////////////// second iteration
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	ldr	x7, [x1,#8*12]		// |u|
	ldr	x8, [x1,#8*18]		// |v|
	mul	x3, x20, x7		// |u|*|f0|
	smulh	x4, x20, x7
	mul	x5, x21, x8		// |v|*|g0|
	smulh	x6, x21, x8
	adds	x3, x3, x5
	adc	x4, x4, x6
	stp	x3, x4, [x0,#8*6]
	asr	x5, x4, #63		// sign extension
	stp	x5, x5, [x0,#8*8]
	stp	x5, x5, [x0,#8*10]

	mul	x3, x15, x7		// |u|*|f1|
	smulh	x4, x15, x7
	mul	x5, x16, x8		// |v|*|g1|
	smulh	x6, x16, x8
	adds	x3, x3, x5
	adc	x4, x4, x6
	stp	x3, x4, [x0,#8*12]
	asr	x5, x4, #63		// sign extension
	stp	x5, x5, [x0,#8*14]
	stp	x5, x5, [x0,#8*16]

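	// Iterations three through eleven below repeat one fixed pattern:
	// approximate, fold 62 bits out of |a| and |b| with
	// __smul_383_n_shift_by_62, then update |u| and |v| with the
	// corrected factors via __smul_383x63. The sixth iteration
	// sign-extends |v|'s limbs, and from the seventh on the wider |v|
	// update is finished by __smul_767x63_tail.
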
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63
	asr	x27, x27, #63		// sign extension
	stp	x27, x27, [x0,#8*6]
	stp	x27, x27, [x0,#8*8]
	stp	x27, x27, [x0,#8*10]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63
	bl	__smul_767x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63
	bl	__smul_767x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63
	bl	__smul_767x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63
	bl	__smul_767x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	bl	__ab_approximation_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	bl	__smul_383_n_shift_by_62
	mov	x20, x15		// corrected |f0|
	mov	x21, x16		// corrected |g0|

	mov	x15, x17		// |f1|
	mov	x16, x19		// |g1|
	add	x0, x0, #8*6		// pointer to destination |b|
	bl	__smul_383_n_shift_by_62

	add	x0, x0, #8*6		// pointer to destination |u|
	bl	__smul_383x63

	mov	x20, x15		// corrected |f1|
	mov	x21, x16		// corrected |g1|
	add	x0, x0, #8*6		// pointer to destination |v|
	bl	__smul_383x63
	bl	__smul_767x63_tail

	////////////////////////////////////////// iteration before last
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #62
	//bl	__ab_approximation_62	// |a| and |b| are exact,
	ldp	x3, x8, [x1,#8*0]	// just load
	ldp	x9, x14, [x1,#8*6]
	bl	__inner_loop_62

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
	str	x3, [x0,#8*0]
	str	x9, [x0,#8*6]

	mov	x20, x15		// exact |f0|
	mov	x21, x16		// exact |g0|
	mov	x15, x17
	mov	x16, x19
	add	x0, x0, #8*12		// pointer to dst |u|
	bl	__smul_383x63

	mov	x20, x15		// exact |f1|
	mov	x21, x16		// exact |g1|
	add	x0, x0, #8*6		// pointer to dst |v|
	bl	__smul_383x63
	bl	__smul_767x63_tail

	////////////////////////////////////////// last iteration
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
	mov	x2, #22			// 766 % 62
	//bl	__ab_approximation_62	// |a| and |b| are exact,
	ldr	x3, [x1,#8*0]		// just load
	eor	x8, x8, x8
	ldr	x9, [x1,#8*6]
	eor	x14, x14, x14
	bl	__inner_loop_62

	mov	x20, x17
	mov	x21, x19
	ldp	x0, x15, [sp]		// original out_ptr and n_ptr
	bl	__smul_383x63
	bl	__smul_767x63_tail
	ldr	x30, [x29,#8]

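	// The result produced by the final __smul_383x63/__smul_767x63_tail
	// pair may be negative; the sign of its top limb (x8) becomes a
	// mask so that the modulus, effectively shifted left by 384 bits,
	// is added branchlessly only when needed.
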
	asr	x22, x8, #63		// sign as mask
	ldp	x9, x10, [x15,#8*0]
	ldp	x11, x12, [x15,#8*2]
	ldp	x13, x14, [x15,#8*4]

	and	x9, x9, x22		// add mod<<384 conditionally
	and	x10, x10, x22
	adds	x3, x3, x9
	and	x11, x11, x22
	adcs	x4, x4, x10
	and	x12, x12, x22
	adcs	x5, x5, x11
	and	x13, x13, x22
	adcs	x6, x6, x12
	and	x14, x14, x22
	stp	x3, x4, [x0,#8*6]
	adcs	x7, x7, x13
	stp	x5, x6, [x0,#8*8]
	adc	x8, x8, x14
	stp	x7, x8, [x0,#8*10]

	add	sp, sp, #1040
	ldp	x19, x20, [x29,#16]
	ldp	x21, x22, [x29,#32]
	ldp	x23, x24, [x29,#48]
	ldp	x25, x26, [x29,#64]
	ldp	x27, x28, [x29,#80]
	ldr	x29, [sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	ct_inverse_mod_383,.-ct_inverse_mod_383

////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
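// __smul_383x63 accumulates dst = |u|*|f_| + |v|*|g_|: two 383-bit
// vectors (at fixed offsets 96 and 144 from x1) times two signed factors
// of at most 63 bits (x20, x21). Factors and vectors are conditionally
// negated by their sign masks so the schoolbook mul/umulh products are
// unsigned; only six result limbs are stored, with carries handed over
// in x19 and x28 (plus the live top limbs and masks) for
// __smul_767x63_tail to pick up.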
.type	__smul_383x63, %function
.align	5
__smul_383x63:
	ldp	x3, x4, [x1,#8*0+96]	// load |u| (or |v|)
	asr	x17, x20, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	x5, x6, [x1,#8*2+96]
	eor	x20, x20, x17		// conditionally negate |f_| (or |g_|)
	ldp	x7, x8, [x1,#8*4+96]

	eor	x3, x3, x17		// conditionally negate |u| (or |v|)
	sub	x20, x20, x17
	eor	x4, x4, x17
	adds	x3, x3, x17, lsr#63
	eor	x5, x5, x17
	adcs	x4, x4, xzr
	eor	x6, x6, x17
	adcs	x5, x5, xzr
	eor	x7, x7, x17
	adcs	x6, x6, xzr
	umulh	x22, x3, x20
	eor	x8, x8, x17
	umulh	x23, x4, x20
	adcs	x7, x7, xzr
	umulh	x24, x5, x20
	adcs	x8, x8, xzr
	umulh	x25, x6, x20
	umulh	x26, x7, x20
	mul	x3, x3, x20
	mul	x4, x4, x20
	mul	x5, x5, x20
	adds	x4, x4, x22
	mul	x6, x6, x20
	adcs	x5, x5, x23
	mul	x7, x7, x20
	adcs	x6, x6, x24
	mul	x27, x8, x20
	adcs	x7, x7, x25
	adcs	x27, x27, x26
	adc	x2, xzr, xzr
	ldp	x9, x10, [x1,#8*0+144]	// load |u| (or |v|)
	asr	x17, x21, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	x11, x12, [x1,#8*2+144]
	eor	x21, x21, x17		// conditionally negate |f_| (or |g_|)
	ldp	x13, x14, [x1,#8*4+144]

	eor	x9, x9, x17		// conditionally negate |u| (or |v|)
	sub	x21, x21, x17
	eor	x10, x10, x17
	adds	x9, x9, x17, lsr#63
	eor	x11, x11, x17
	adcs	x10, x10, xzr
	eor	x12, x12, x17
	adcs	x11, x11, xzr
	eor	x13, x13, x17
	adcs	x12, x12, xzr
	umulh	x22, x9, x21
	eor	x14, x14, x17
	umulh	x23, x10, x21
	adcs	x13, x13, xzr
	umulh	x24, x11, x21
	adcs	x14, x14, xzr
	umulh	x25, x12, x21
	adc	x19, xzr, xzr		// used in __smul_767x63_tail
	umulh	x26, x13, x21
	mul	x9, x9, x21
	mul	x10, x10, x21
	mul	x11, x11, x21
	adds	x10, x10, x22
	mul	x12, x12, x21
	adcs	x11, x11, x23
	mul	x13, x13, x21
	adcs	x12, x12, x24
	mul	x28, x14, x21
	adcs	x13, x13, x25
	adcs	x28, x28, x26
	adc	x2, x2, xzr

	adds	x3, x3, x9
	adcs	x4, x4, x10
	adcs	x5, x5, x11
	adcs	x6, x6, x12
	stp	x3, x4, [x0,#8*0]
	adcs	x7, x7, x13
	stp	x5, x6, [x0,#8*2]
	adcs	x27, x27, x28
	stp	x7, x27, [x0,#8*4]
	adc	x28, x2, xzr		// used in __smul_767x63_tail

	ret
.size	__smul_383x63,.-__smul_383x63

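// __smul_767x63_tail completes the wide |v| update begun in
// __smul_383x63: it forms the signed high parts of the top-limb products
// (smulh), multiplies the remaining six limbs of |v| (at x1+8*24, negated
// with the still-live sign mask x17) by |g_| in x21, folds in the carries
// handed over in x19/x28, and stores destination limbs 6 through 11.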
.type	__smul_767x63_tail, %function
.align	5
__smul_767x63_tail:
	smulh	x27, x8, x20
	ldp	x3, x4, [x1,#8*24]	// load rest of |v|
	umulh	x14, x14, x21
	ldp	x5, x6, [x1,#8*26]
	ldp	x7, x8, [x1,#8*28]

	eor	x3, x3, x17		// conditionally negate rest of |v|
	eor	x4, x4, x17
	eor	x5, x5, x17
	adds	x3, x3, x19
	eor	x6, x6, x17
	adcs	x4, x4, xzr
	eor	x7, x7, x17
	adcs	x5, x5, xzr
	eor	x8, x8, x17
	adcs	x6, x6, xzr
	umulh	x22, x3, x21
	adcs	x7, x7, xzr
	umulh	x23, x4, x21
	adc	x8, x8, xzr

	umulh	x24, x5, x21
	add	x14, x14, x28
	umulh	x25, x6, x21
	asr	x28, x27, #63
	umulh	x26, x7, x21
	mul	x3, x3, x21
	mul	x4, x4, x21
	mul	x5, x5, x21
	adds	x3, x3, x14
	mul	x6, x6, x21
	adcs	x4, x4, x22
	mul	x7, x7, x21
	adcs	x5, x5, x23
	mul	x8, x8, x21
	adcs	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, x26

	adds	x3, x3, x27
	adcs	x4, x4, x28
	adcs	x5, x5, x28
	adcs	x6, x6, x28
	stp	x3, x4, [x0,#8*6]
	adcs	x7, x7, x28
	stp	x5, x6, [x0,#8*8]
	adc	x8, x8, x28
	stp	x7, x8, [x0,#8*10]

	ret
.size	__smul_767x63_tail,.-__smul_767x63_tail

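// __smul_383_n_shift_by_62 returns (|a|*|f0| + |b|*|g0|) >> 62 for the
// 6-limb vectors at x1 and x1+48. Inputs are conditionally negated by
// the factors' sign masks, the 62-bit shift is the extr chain below, and
// the result's sign is finally stripped off and folded back into x15/x16,
// which is why callers reload them as the "corrected" |f0|/|g0|.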
.type	__smul_383_n_shift_by_62, %function
.align	5
__smul_383_n_shift_by_62:
	ldp	x3, x4, [x1,#8*0+0]	// load |a| (or |b|)
	asr	x28, x15, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	x5, x6, [x1,#8*2+0]
	eor	x2, x15, x28		// conditionally negate |f0| (or |g0|)
	ldp	x7, x8, [x1,#8*4+0]

	eor	x3, x3, x28		// conditionally negate |a| (or |b|)
	sub	x2, x2, x28
	eor	x4, x4, x28
	adds	x3, x3, x28, lsr#63
	eor	x5, x5, x28
	adcs	x4, x4, xzr
	eor	x6, x6, x28
	adcs	x5, x5, xzr
	eor	x7, x7, x28
	umulh	x22, x3, x2
	adcs	x6, x6, xzr
	umulh	x23, x4, x2
	eor	x8, x8, x28
	umulh	x24, x5, x2
	adcs	x7, x7, xzr
	umulh	x25, x6, x2
	adc	x8, x8, xzr

	umulh	x26, x7, x2
	smulh	x27, x8, x2
	mul	x3, x3, x2
	mul	x4, x4, x2
	mul	x5, x5, x2
	adds	x4, x4, x22
	mul	x6, x6, x2
	adcs	x5, x5, x23
	mul	x7, x7, x2
	adcs	x6, x6, x24
	mul	x8, x8, x2
	adcs	x7, x7, x25
	adcs	x8, x8, x26
	adc	x27, x27, xzr
	ldp	x9, x10, [x1,#8*0+48]	// load |a| (or |b|)
	asr	x28, x16, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	x11, x12, [x1,#8*2+48]
	eor	x2, x16, x28		// conditionally negate |f0| (or |g0|)
	ldp	x13, x14, [x1,#8*4+48]

	eor	x9, x9, x28		// conditionally negate |a| (or |b|)
	sub	x2, x2, x28
	eor	x10, x10, x28
	adds	x9, x9, x28, lsr#63
	eor	x11, x11, x28
	adcs	x10, x10, xzr
	eor	x12, x12, x28
	adcs	x11, x11, xzr
	eor	x13, x13, x28
	umulh	x22, x9, x2
	adcs	x12, x12, xzr
	umulh	x23, x10, x2
	eor	x14, x14, x28
	umulh	x24, x11, x2
	adcs	x13, x13, xzr
	umulh	x25, x12, x2
	adc	x14, x14, xzr

	umulh	x26, x13, x2
	smulh	x28, x14, x2
	mul	x9, x9, x2
	mul	x10, x10, x2
	mul	x11, x11, x2
	adds	x10, x10, x22
	mul	x12, x12, x2
	adcs	x11, x11, x23
	mul	x13, x13, x2
	adcs	x12, x12, x24
	mul	x14, x14, x2
	adcs	x13, x13, x25
	adcs	x14, x14, x26
	adc	x28, x28, xzr
	adds	x3, x3, x9
	adcs	x4, x4, x10
	adcs	x5, x5, x11
	adcs	x6, x6, x12
	adcs	x7, x7, x13
	adcs	x8, x8, x14
	adc	x9, x27, x28

	extr	x3, x4, x3, #62
	extr	x4, x5, x4, #62
	extr	x5, x6, x5, #62
	asr	x28, x9, #63
	extr	x6, x7, x6, #62
	extr	x7, x8, x7, #62
	extr	x8, x9, x8, #62

	eor	x3, x3, x28
	eor	x4, x4, x28
	adds	x3, x3, x28, lsr#63
	eor	x5, x5, x28
	adcs	x4, x4, xzr
	eor	x6, x6, x28
	adcs	x5, x5, xzr
	eor	x7, x7, x28
	adcs	x6, x6, xzr
	eor	x8, x8, x28
	stp	x3, x4, [x0,#8*0]
	adcs	x7, x7, xzr
	stp	x5, x6, [x0,#8*2]
	adc	x8, x8, xzr
	stp	x7, x8, [x0,#8*4]

	eor	x15, x15, x28
	eor	x16, x16, x28
	sub	x15, x15, x28
	sub	x16, x16, x28

	ret
.size	__smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62

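// __ab_approximation_62 condenses |a| and |b| into 128-bit
// approximations: a branchless csel ladder locates the topmost non-zero
// limb pair, which is left-aligned (lslv/lsrv) and merged with bits from
// the limb below, while the bottom limbs are taken as-is. The inner loop
// then runs on just x3/x8 (|a|) and x9/x14 (|b|).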
.type	__ab_approximation_62, %function
.align	4
__ab_approximation_62:
	ldp	x7, x8, [x1,#8*4]
	ldp	x13, x14, [x1,#8*10]
	ldp	x5, x6, [x1,#8*2]
	ldp	x11, x12, [x1,#8*8]

.Lab_approximation_62_loaded:
	orr	x22, x8, x14		// check top-most limbs, ...
	cmp	x22, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x6, ne
	orr	x22, x8, x14		// ... ones before top-most, ...
	csel	x13, x13, x12, ne

	ldp	x3, x4, [x1,#8*0]
	ldp	x9, x10, [x1,#8*6]

	cmp	x22, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x5, ne
	orr	x22, x8, x14		// ... and ones before that ...
	csel	x13, x13, x11, ne

	cmp	x22, #0
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	csel	x7, x7, x4, ne
	orr	x22, x8, x14
	csel	x13, x13, x10, ne

	clz	x22, x22
	cmp	x22, #64
	csel	x22, x22, xzr, ne
	csel	x8, x8, x7, ne
	csel	x14, x14, x13, ne
	neg	x23, x22

	lslv	x8, x8, x22		// align high limbs to the left
	lslv	x14, x14, x22
	lsrv	x7, x7, x23
	lsrv	x13, x13, x23
	and	x7, x7, x23, asr#6
	and	x13, x13, x23, asr#6
	orr	x8, x8, x7
	orr	x14, x14, x13

	b	__inner_loop_62
	ret
.size	__ab_approximation_62,.-__ab_approximation_62

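// Per iteration the loop below performs, in C-like pseudocode (a sketch
// of the branchless selection, not an authoritative specification):
//
//	if (a & 1) {
//		if (a < b)		// detected via the borrow flag
//			swap(a, b), swap(f0, f1), swap(g0, g1);
//		a -= b, f0 -= f1, g0 -= g1;
//	}
//	a >>= 1, f1 <<= 1, g1 <<= 1;
//
// realized entirely with masks (sbfx) and csel, so the instruction and
// memory-access trace is independent of the values of |a| and |b|.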
.type	__inner_loop_62, %function
.align	4
__inner_loop_62:
	mov	x15, #1			// |f0|=1
	mov	x16, #0			// |g0|=0
	mov	x17, #0			// |f1|=0
	mov	x19, #1			// |g1|=1

.Loop_62:
	sbfx	x28, x3, #0, #1		// if |a_| is odd, then we'll be subtracting
	sub	x2, x2, #1
	subs	x24, x9, x3		// |b_|-|a_|
	and	x22, x9, x28
	sbc	x25, x14, x8
	and	x23, x14, x28
	subs	x26, x3, x22		// |a_|-|b_| (or |a_|-0 if |a_| was even)
	mov	x22, x15
	sbcs	x27, x8, x23
	mov	x23, x16
	csel	x9, x9, x3, hs		// |b_| = |a_|
	csel	x14, x14, x8, hs
	csel	x3, x26, x24, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	x8, x27, x25, hs
	csel	x15, x15, x17, hs	// exchange |f0| and |f1|
	csel	x17, x17, x22, hs
	csel	x16, x16, x19, hs	// exchange |g0| and |g1|
	csel	x19, x19, x23, hs
	extr	x3, x8, x3, #1
	lsr	x8, x8, #1
	and	x22, x17, x28
	and	x23, x19, x28
	add	x17, x17, x17		// |f1|<<=1
	add	x19, x19, x19		// |g1|<<=1
	sub	x15, x15, x22		// |f0|-=|f1| (or |f0|-=0 if |a_| was even)
	sub	x16, x16, x23		// |g0|-=|g1| (or |g0|-=0 ...)
	cbnz	x2, .Loop_62

	ret
.size	__inner_loop_62,.-__inner_loop_62