// ftu/blst/elf/ct_inverse_mod_384-armv8.S
.text
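////////////////////////////////////////////////////////////////////////
// ct_inverse_mod_383 computes a modular inverse of the 383-bit value at
// x1 modulo the modulus at x2, in constant time, using a binary extended
// GCD with 62-bit batched inner loops (12 batches of 62 iterations plus
// a final one of 22, i.e. 766 = 2*383 in total). The 768-bit result at
// x0 is the inverse up to a fixed power-of-two scaling, which the caller
// is expected to remove; x3 is an extra pointer used only for the final
// conditional correction. See the commentary in
// ctx_inverse_mod_384-x86_64 for the full derivation.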
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
.inst 0xd503233f // paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #1040
ldp x22, x4, [x1,#8*0]
ldp x5, x6, [x1,#8*2]
ldp x7, x8, [x1,#8*4]
add x1, sp, #16+511 // find closest 512-byte-aligned spot
and x1, x1, #-512 // in the frame...
stp x0, x3, [sp] // offload out_ptr and n_ptr for the final correction
ldp x9, x10, [x2,#8*0]
ldp x11, x12, [x2,#8*2]
ldp x13, x14, [x2,#8*4]
stp x22, x4, [x1,#8*0] // copy input to |a|
stp x5, x6, [x1,#8*2]
stp x7, x8, [x1,#8*4]
stp x9, x10, [x1,#8*6] // copy modulus to |b|
stp x11, x12, [x1,#8*8]
stp x13, x14, [x1,#8*10]
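// The 512-byte-aligned scratch area holds two 256-byte halves, each laid
// out as |a| (+0, 48 bytes), |b| (+48, 48 bytes), |u| (+96, 48 bytes) and
// |v| (+144, 96 bytes). x1 addresses the current source half and x0 the
// destination; eor-ing a pointer with #256 flip-flops between the halves.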
////////////////////////////////////////// first iteration
mov x2, #62
bl .Lab_approximation_62_loaded
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str x15, [x0,#8*12] // initialize |u| with |f0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str x15, [x0,#8*12] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
ldr x7, [x1,#8*12] // |u|
ldr x8, [x1,#8*18] // |v|
mul x3, x20, x7 // |u|*|f0|
smulh x4, x20, x7
mul x5, x21, x8 // |v|*|g0|
smulh x6, x21, x8
adds x3, x3, x5
adc x4, x4, x6
stp x3, x4, [x0,#8*6]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*8]
stp x5, x5, [x0,#8*10]
mul x3, x15, x7 // |u|*|f1|
smulh x4, x15, x7
mul x5, x16, x8 // |v|*|g1|
smulh x6, x16, x8
adds x3, x3, x5
adc x4, x4, x6
stp x3, x4, [x0,#8*12]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*14]
stp x5, x5, [x0,#8*16]
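////////////////////////////////////////// third iteration onward
// Each of the following blocks repeats the same step: approximate
// |a|/|b|, update them with __smul_383_n_shift_by_62, then update
// |u|/|v| with __smul_383x63 using the corrected factors.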
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
asr x27, x27, #63 // sign extension
stp x27, x27, [x0,#8*6]
stp x27, x27, [x0,#8*8]
stp x27, x27, [x0,#8*10]
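// |v| has just been sign-extended to its full 767-bit width; from the
// next iteration on __smul_767x63_tail completes the upper half of the
// |v| update.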
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// iteration before last
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp x3, x8, [x1,#8*0] // just load
ldp x9, x14, [x1,#8*6]
bl __inner_loop_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
str x3, [x0,#8*0]
str x9, [x0,#8*6]
mov x20, x15 // exact |f0|
mov x21, x16 // exact |g0|
mov x15, x17
mov x16, x19
add x0, x0, #8*12 // pointer to dst |u|
bl __smul_383x63
mov x20, x15 // exact |f1|
mov x21, x16 // exact |g1|
add x0, x0, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr x3, [x1,#8*0] // just load
eor x8, x8, x8
ldr x9, [x1,#8*6]
eor x14, x14, x14
bl __inner_loop_62
mov x20, x17
mov x21, x19
ldp x0, x15, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
asr x22, x8, #63 // sign as mask
ldp x9, x10, [x15,#8*0]
ldp x11, x12, [x15,#8*2]
ldp x13, x14, [x15,#8*4]
and x9, x9, x22 // add mod<<384 conditionally
and x10, x10, x22
adds x3, x3, x9
and x11, x11, x22
adcs x4, x4, x10
and x12, x12, x22
adcs x5, x5, x11
and x13, x13, x22
adcs x6, x6, x12
and x14, x14, x22
stp x3, x4, [x0,#8*6]
adcs x7, x7, x13
stp x5, x6, [x0,#8*8]
adc x8, x8, x14
stp x7, x8, [x0,#8*10]
add sp, sp, #1040
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
.inst 0xd50323bf // autiasp
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
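// __smul_383x63 computes |u|*|f_| + |v|*|g_|, where |u| and |v| are the
// 383-bit columns at x1+96 and x1+144 and |f_| (x20), |g_| (x21) are
// signed factors of at most 63 bits in absolute value. Factors and
// operands are conditionally negated so that the multiplications are
// unsigned; the low six limbs of the sum are stored at x0, while the
// carries needed by __smul_767x63_tail are left in x19 and x28.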
.type __smul_383x63, %function
.align 5
__smul_383x63:
ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|)
asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x5, x6, [x1,#8*2+96]
eor x20, x20, x17 // conditionally negate |f_| (or |g_|)
ldp x7, x8, [x1,#8*4+96]
eor x3, x3, x17 // conditionally negate |u| (or |v|)
sub x20, x20, x17
eor x4, x4, x17
adds x3, x3, x17, lsr#63
eor x5, x5, x17
adcs x4, x4, xzr
eor x6, x6, x17
adcs x5, x5, xzr
eor x7, x7, x17
adcs x6, x6, xzr
umulh x22, x3, x20
eor x8, x8, x17
umulh x23, x4, x20
adcs x7, x7, xzr
umulh x24, x5, x20
adcs x8, x8, xzr
umulh x25, x6, x20
umulh x26, x7, x20
mul x3, x3, x20
mul x4, x4, x20
mul x5, x5, x20
adds x4, x4, x22
mul x6, x6, x20
adcs x5, x5, x23
mul x7, x7, x20
adcs x6, x6, x24
mul x27, x8, x20
adcs x7, x7, x25
adcs x27,x27,x26
adc x2, xzr, xzr
ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|)
asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x11, x12, [x1,#8*2+144]
eor x21, x21, x17 // conditionally negate |f_| (or |g_|)
ldp x13, x14, [x1,#8*4+144]
eor x9, x9, x17 // conditionally negate |u| (or |v|)
sub x21, x21, x17
eor x10, x10, x17
adds x9, x9, x17, lsr#63
eor x11, x11, x17
adcs x10, x10, xzr
eor x12, x12, x17
adcs x11, x11, xzr
eor x13, x13, x17
adcs x12, x12, xzr
umulh x22, x9, x21
eor x14, x14, x17
umulh x23, x10, x21
adcs x13, x13, xzr
umulh x24, x11, x21
adcs x14, x14, xzr
umulh x25, x12, x21
adc x19, xzr, xzr // used in __smul_767x63_tail
umulh x26, x13, x21
mul x9, x9, x21
mul x10, x10, x21
mul x11, x11, x21
adds x10, x10, x22
mul x12, x12, x21
adcs x11, x11, x23
mul x13, x13, x21
adcs x12, x12, x24
mul x28, x14, x21
adcs x13, x13, x25
adcs x28,x28,x26
adc x2, x2, xzr
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
stp x3, x4, [x0,#8*0]
adcs x7, x7, x13
stp x5, x6, [x0,#8*2]
adcs x27, x27, x28
stp x7, x27, [x0,#8*4]
adc x28, x2, xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
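// __smul_767x63_tail completes the |v| update once |v| has grown past
// 383 bits: it multiplies the upper six limbs of |v| (at x1+8*24) by
// |g_| (x21), folds in the high halves of the last |u|*|f_| and
// |v|*|g_| limb products together with the carries left by
// __smul_383x63, and stores the upper six limbs of the 767-bit result
// at x0+8*6.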
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh x27, x8, x20
ldp x3, x4, [x1,#8*24] // load rest of |v|
umulh x14, x14, x21
ldp x5, x6, [x1,#8*26]
ldp x7, x8, [x1,#8*28]
eor x3, x3, x17 // conditionally negate rest of |v|
eor x4, x4, x17
eor x5, x5, x17
adds x3, x3, x19
eor x6, x6, x17
adcs x4, x4, xzr
eor x7, x7, x17
adcs x5, x5, xzr
eor x8, x8, x17
adcs x6, x6, xzr
umulh x22, x3, x21
adcs x7, x7, xzr
umulh x23, x4, x21
adc x8, x8, xzr
umulh x24, x5, x21
add x14, x14, x28
umulh x25, x6, x21
asr x28, x27, #63
umulh x26, x7, x21
mul x3, x3, x21
mul x4, x4, x21
mul x5, x5, x21
adds x3, x3, x14
mul x6, x6, x21
adcs x4, x4, x22
mul x7, x7, x21
adcs x5, x5, x23
mul x8, x8, x21
adcs x6, x6, x24
adcs x7, x7, x25
adc x8, x8, x26
adds x3, x3, x27
adcs x4, x4, x28
adcs x5, x5, x28
adcs x6, x6, x28
stp x3, x4, [x0,#8*6]
adcs x7, x7, x28
stp x5, x6, [x0,#8*8]
adc x8, x8, x28
stp x7, x8, [x0,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
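// __smul_383_n_shift_by_62 computes (|a|*|f0| + |b|*|g0|) >> 62, where
// |a| and |b| are the 383-bit columns at x1+0 and x1+48 and |f0| (x15),
// |g0| (x16) are signed 64-bit factors. The absolute value of the
// result is stored at x0, and |f0|/|g0| are negated on return if the
// result was negative, so the caller receives "corrected" factors.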
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|)
asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x5, x6, [x1,#8*2+0]
eor x2, x15, x28 // conditionally negate |f0| (or |g0|)
ldp x7, x8, [x1,#8*4+0]
eor x3, x3, x28 // conditionally negate |a| (or |b|)
sub x2, x2, x28
eor x4, x4, x28
adds x3, x3, x28, lsr#63
eor x5, x5, x28
adcs x4, x4, xzr
eor x6, x6, x28
adcs x5, x5, xzr
eor x7, x7, x28
umulh x22, x3, x2
adcs x6, x6, xzr
umulh x23, x4, x2
eor x8, x8, x28
umulh x24, x5, x2
adcs x7, x7, xzr
umulh x25, x6, x2
adc x8, x8, xzr
umulh x26, x7, x2
smulh x27, x8, x2
mul x3, x3, x2
mul x4, x4, x2
mul x5, x5, x2
adds x4, x4, x22
mul x6, x6, x2
adcs x5, x5, x23
mul x7, x7, x2
adcs x6, x6, x24
mul x8, x8, x2
adcs x7, x7, x25
adcs x8, x8, x26
adc x27, x27, xzr
ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|)
asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x11, x12, [x1,#8*2+48]
eor x2, x16, x28 // conditionally negate |f0| (or |g0|)
ldp x13, x14, [x1,#8*4+48]
eor x9, x9, x28 // conditionally negate |a| (or |b|)
sub x2, x2, x28
eor x10, x10, x28
adds x9, x9, x28, lsr#63
eor x11, x11, x28
adcs x10, x10, xzr
eor x12, x12, x28
adcs x11, x11, xzr
eor x13, x13, x28
umulh x22, x9, x2
adcs x12, x12, xzr
umulh x23, x10, x2
eor x14, x14, x28
umulh x24, x11, x2
adcs x13, x13, xzr
umulh x25, x12, x2
adc x14, x14, xzr
umulh x26, x13, x2
smulh x28, x14, x2
mul x9, x9, x2
mul x10, x10, x2
mul x11, x11, x2
adds x10, x10, x22
mul x12, x12, x2
adcs x11, x11, x23
mul x13, x13, x2
adcs x12, x12, x24
mul x14, x14, x2
adcs x13, x13, x25
adcs x14, x14, x26
adc x28, x28, xzr
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
adcs x7, x7, x13
adcs x8, x8, x14
adc x9, x27, x28
extr x3, x4, x3, #62
extr x4, x5, x4, #62
extr x5, x6, x5, #62
asr x28, x9, #63
extr x6, x7, x6, #62
extr x7, x8, x7, #62
extr x8, x9, x8, #62
eor x3, x3, x28
eor x4, x4, x28
adds x3, x3, x28, lsr#63
eor x5, x5, x28
adcs x4, x4, xzr
eor x6, x6, x28
adcs x5, x5, xzr
eor x7, x7, x28
adcs x6, x6, xzr
eor x8, x8, x28
stp x3, x4, [x0,#8*0]
adcs x7, x7, xzr
stp x5, x6, [x0,#8*2]
adc x8, x8, xzr
stp x7, x8, [x0,#8*4]
eor x15, x15, x28
eor x16, x16, x28
sub x15, x15, x28
sub x16, x16, x28
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
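// __ab_approximation_62 condenses the 383-bit |a| and |b| into 64-bit
// approximations: the most significant non-zero limbs, taken at a common
// bit position for both values, are left-aligned and returned in x8/x14,
// while the exact least significant limbs are returned in x3/x9. It then
// tail-calls __inner_loop_62.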
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp x7, x8, [x1,#8*4]
ldp x13, x14, [x1,#8*10]
ldp x5, x6, [x1,#8*2]
ldp x11, x12, [x1,#8*8]
.Lab_approximation_62_loaded:
orr x22, x8, x14 // check top-most limbs, ...
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x6, ne
orr x22, x8, x14 // ... ones before top-most, ...
csel x13, x13, x12, ne
ldp x3, x4, [x1,#8*0]
ldp x9, x10, [x1,#8*6]
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x5, ne
orr x22, x8, x14 // ... and ones before that ...
csel x13, x13, x11, ne
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x4, ne
orr x22, x8, x14
csel x13, x13, x10, ne
clz x22, x22
cmp x22, #64
csel x22, x22, xzr, ne
csel x8, x8, x7, ne
csel x14, x14, x13, ne
neg x23, x22
lslv x8, x8, x22 // align high limbs to the left
lslv x14, x14, x22
lsrv x7, x7, x23
lsrv x13, x13, x23
and x7, x7, x23, asr#6
and x13, x13, x23, asr#6
orr x8, x8, x7
orr x14, x14, x13
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
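// __inner_loop_62 runs x2 iterations of the constant-time binary-GCD
// inner loop on the approximations in x3/x8 (|a_|) and x9/x14 (|b_|):
// when |a_| is odd it conditionally subtracts |b_| (swapping the pair
// if |a_| < |b_|), then halves |a_|, updating the transition factors
// |f0|/|g0| (x15/x16) and |f1|/|g1| (x17/x19) accordingly. All choices
// are made with csel, so the control flow is data-independent.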
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov x15, #1 // |f0|=1
mov x16, #0 // |g0|=0
mov x17, #0 // |f1|=0
mov x19, #1 // |g1|=1
.Loop_62:
sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
subs x24, x9, x3 // |b_|-|a_|
and x22, x9, x28
sbc x25, x14, x8
and x23, x14, x28
subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x22, x15
sbcs x27, x8, x23
mov x23, x16
csel x9, x9, x3, hs // |b_| = |a_|
csel x14, x14, x8, hs
csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x8, x27, x25, hs
csel x15, x15, x17, hs // exchange |f0| and |f1|
csel x17, x17, x22, hs
csel x16, x16, x19, hs // exchange |g0| and |g1|
csel x19, x19, x23, hs
extr x3, x8, x3, #1
lsr x8, x8, #1
and x22, x17, x28
and x23, x19, x28
add x17, x17, x17 // |f1|<<=1
add x19, x19, x19 // |g1|<<=1
sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62