// AArch64 (ARMv8) assembly, GNU as syntax.
// Constant-time quadratic-residue test modulo a 384-bit modulus:
// ct_is_square_mod_384() plus its private helper routines.
.text

// int ct_is_square_mod_384(const limb_t inp[6], const limb_t mod[6]);
//
// Constant-time quadratic-residuosity test.  Runs 768 bits' worth of a
// binary-GCD-style reduction on |inp| and |mod| (24 outer iterations of
// 30 bits each, plus a 48-bit tail), accumulating the Legendre-symbol
// parity in x2 (|L|).  Returns (L & 1) ^ 1 in x0.
//
// In:       x0 = inp (6 little-endian 64-bit limbs), x1 = mod (6 limbs)
// Returns:  x0 = 0 or 1
// Stack:    128-byte frame record (x29/x30 + x19-x28) plus 512 bytes of
//           scratch holding two |b|(offset 0)/|a|(offset 8*6) working
//           areas on a 256-byte-aligned boundary, toggled via `eor #128`.
.globl ct_is_square_mod_384
.type ct_is_square_mod_384, %function
.align 5
ct_is_square_mod_384:
	.inst 0xd503233f		// paciasp (return-address signing; NOP on older cores)
	stp x29, x30, [sp,#-128]!	// frame record + room for callee-saved regs
	add x29, sp, #0
	stp x19, x20, [sp,#16]		// save callee-saved x19-x28 (AAPCS64)
	stp x21, x22, [sp,#32]
	stp x23, x24, [sp,#48]
	stp x25, x26, [sp,#64]
	stp x27, x28, [sp,#80]
	sub sp, sp, #512		// scratch for the |a|/|b| working vectors

	ldp x3, x4, [x0,#8*0] // load input
	ldp x5, x6, [x0,#8*2]
	ldp x7, x8, [x0,#8*4]

	add x0, sp, #255 // find closest 256-byte-aligned spot
	and x0, x0, #-256 // in the frame...

	ldp x9, x10, [x1,#8*0] // load modulus
	ldp x11, x12, [x1,#8*2]
	ldp x13, x14, [x1,#8*4]

	stp x3, x4, [x0,#8*6] // copy input to |a|
	stp x5, x6, [x0,#8*8]
	stp x7, x8, [x0,#8*10]
	stp x9, x10, [x0,#8*0] // copy modulus to |b|
	stp x11, x12, [x0,#8*2]
	stp x13, x14, [x0,#8*4]

	eor x2, x2, x2 // init the Legendre symbol accumulator |L|
	mov x15, #24 // 24 is 768/30-1, outer-loop trip count
	b .Loop_is_square

.align 4
.Loop_is_square:
	bl __ab_approximation_30	// -> x16,x17,x19,x20 = |f0|,|g0|,|f1|,|g1|
	sub x15, x15, #1

	eor x1, x0, #128 // pointer to dst |b|
	bl __smul_384_n_shift_by_30	// |b| = |(b*g1 + a*f1) >> 30|, sign mask -> x27
	mov x19, x16 // |f0|
	mov x20, x17 // |g0|
	add x1, x1, #8*6 // pointer to dst |a|
	bl __smul_384_n_shift_by_30	// |a| = |(b*g0 + a*f0) >> 30|, sign mask -> x27

	ldp x9, x10, [x1,#-8*6]		// re-load bottom limbs of the new |b|
	eor x0, x0, #128 // flip-flop src |a|b|
	and x27, x27, x9 // if |a| was negative,
	add x2, x2, x27, lsr#1 // adjust |L| (negation rule depends on b mod 4)

	cbnz x15, .Loop_is_square

	////////////////////////////////////////// last iteration
	//bl __ab_approximation_30 // |a| and |b| are exact,
	//ldr x8, [x0,#8*6] // just load
	// NOTE(review): x8 is assumed to already hold the live |a| word for
	// __inner_loop_48 (left there by the preceding calls) — confirm
	// against the upstream source.
	mov x14, x9 // ldr x14, [x0,#8*0]
	mov x15, #48 // 48 is 768%30 + 30
	bl __inner_loop_48
	ldr x30, [x29,#8]		// restore link register (clobbered by bl)

	and x0, x2, #1			// result = (L & 1) ^ 1:
	eor x0, x0, #1			// 1 when accumulated symbol is even

	add sp, sp, #512		// drop scratch, restore callee-saved regs
	ldp x19, x20, [x29,#16]
	ldp x21, x22, [x29,#32]
	ldp x23, x24, [x29,#48]
	ldp x25, x26, [x29,#64]
	ldp x27, x28, [x29,#80]
	ldr x29, [sp],#128
	.inst 0xd50323bf		// autiasp (pairs with paciasp at entry)
	ret
.size ct_is_square_mod_384,.-ct_is_square_mod_384
// __smul_384_n_shift_by_30
//
// Computes one update step of the working vectors:
//     result = (src[0..5]*x20 + src[6..11]*x19) >> 30   (signed),
// stores |result| (6 limbs) at dst, and leaves the pre-negation sign of
// the result in x27 as an all-ones/all-zeros mask (caller folds it into
// the Legendre accumulator).
//
// In:       x0 = src (12 limbs: |b| at offset 0, |a| at offset 8*6)
//           x1 = dst (6 limbs)
//           x19, x20 = signed factors (|f1|/|g1| or |f0|/|g0|)
// Out:      dst[0..5] = result magnitude; x27 = sign mask of result
// Clobbers: x3-x14, x19-x28, flags
.type __smul_384_n_shift_by_30, %function
.align 5
__smul_384_n_shift_by_30:
	// First half: |b| (src[0..5]) times x20, accumulated into x3..x8
	// with top word in x26.  The factor and the vector are both
	// conditionally negated so the multiply is unsigned.
	ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|)
	asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s)
	ldp x5, x6, [x0,#8*2+0]
	eor x20, x20, x27 // conditionally negate |g1| (or |f1|)
	ldp x7, x8, [x0,#8*4+0]

	eor x3, x3, x27 // conditionally negate |b| (or |a|)
	sub x20, x20, x27
	eor x4, x4, x27
	adds x3, x3, x27, lsr#63	// +1 completes the two's-complement negation
	eor x5, x5, x27
	adcs x4, x4, xzr
	eor x6, x6, x27
	adcs x5, x5, xzr
	eor x7, x7, x27
	umulh x21, x3, x20		// high halves of limb*factor products
	adcs x6, x6, xzr
	umulh x22, x4, x20
	eor x8, x8, x27
	umulh x23, x5, x20
	adcs x7, x7, xzr
	umulh x24, x6, x20
	adc x8, x8, xzr

	umulh x25, x7, x20
	and x28, x20, x27		// correction term: -(factor & sign)<<384
	umulh x26, x8, x20
	neg x28, x28
	mul x3, x3, x20			// low halves, folded with the highs below
	mul x4, x4, x20
	mul x5, x5, x20
	adds x4, x4, x21
	mul x6, x6, x20
	adcs x5, x5, x22
	mul x7, x7, x20
	adcs x6, x6, x23
	mul x8, x8, x20
	adcs x7, x7, x24
	adcs x8, x8 ,x25
	adc x26, x26, x28

	// Second half: |a| (src[6..11]) times x19, accumulated into
	// x9..x14 with top word in x27, same conditional-negation scheme.
	ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|)
	asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s)
	ldp x11, x12, [x0,#8*2+48]
	eor x19, x19, x27 // conditionally negate |g1| (or |f1|)
	ldp x13, x14, [x0,#8*4+48]

	eor x9, x9, x27 // conditionally negate |b| (or |a|)
	sub x19, x19, x27
	eor x10, x10, x27
	adds x9, x9, x27, lsr#63
	eor x11, x11, x27
	adcs x10, x10, xzr
	eor x12, x12, x27
	adcs x11, x11, xzr
	eor x13, x13, x27
	umulh x21, x9, x19
	adcs x12, x12, xzr
	umulh x22, x10, x19
	eor x14, x14, x27
	umulh x23, x11, x19
	adcs x13, x13, xzr
	umulh x24, x12, x19
	adc x14, x14, xzr

	umulh x25, x13, x19
	and x28, x19, x27		// same sign correction as above
	umulh x27, x14, x19		// x27 now holds the top product word
	neg x28, x28
	mul x9, x9, x19
	mul x10, x10, x19
	mul x11, x11, x19
	adds x10, x10, x21
	mul x12, x12, x19
	adcs x11, x11, x22
	mul x13, x13, x19
	adcs x12, x12, x23
	mul x14, x14, x19
	adcs x13, x13, x24
	adcs x14, x14 ,x25
	adc x27, x27, x28

	// Sum the two partial products: (x3..x8,x9) = a-part + b-part.
	adds x3, x3, x9
	adcs x4, x4, x10
	adcs x5, x5, x11
	adcs x6, x6, x12
	adcs x7, x7, x13
	adcs x8, x8, x14
	adc x9, x26, x27		// signed top word of the 448-bit sum

	// Arithmetic shift right by 30 across the limb chain.
	extr x3, x4, x3, #30
	extr x4, x5, x4, #30
	extr x5, x6, x5, #30
	asr x27, x9, #63		// sign of the result as a mask
	extr x6, x7, x6, #30
	extr x7, x8, x7, #30
	extr x8, x9, x8, #30

	// Conditionally negate to the absolute value and store; x27 keeps
	// the sign mask for the caller.
	eor x3, x3, x27
	eor x4, x4, x27
	adds x3, x3, x27, lsr#63
	eor x5, x5, x27
	adcs x4, x4, xzr
	eor x6, x6, x27
	adcs x5, x5, xzr
	eor x7, x7, x27
	adcs x6, x6, xzr
	eor x8, x8, x27
	stp x3, x4, [x1,#8*0]
	adcs x7, x7, xzr
	stp x5, x6, [x1,#8*2]
	adc x8, x8, xzr
	stp x7, x8, [x1,#8*4]

	ret
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
// __ab_approximation_30
//
// Condenses the 384-bit |a| and |b| into 64-bit approximations: the 32
// most significant bits (taken from the highest non-zero limb pair,
// selected in constant time) concatenated with the 32 least significant
// bits of each value.  Falls straight through into __inner_loop_30,
// which derives the next batch of update factors from the approximations.
//
// In:       x0 = src (|b| at offset 0, |a| at offset 8*6);
//           |a| limbs expected live in x3..x8, b[0],b[1] in x9,x10
//           (see "|a| is still in registers" below)
// Out:      (via __inner_loop_30) x16,x17,x19,x20 = |f0|,|g0|,|f1|,|g1|
// Clobbers: x8, x14, x21-x28, flags
.type __ab_approximation_30, %function
.align 4
__ab_approximation_30:
	ldp x13, x14, [x0,#8*4] // |a| is still in registers
	ldp x11, x12, [x0,#8*2]	// x11..x14 = b[2..5]

	// Constant-time scan for the highest limb position where either
	// |a| or |b| is non-zero: x8/x14 track the candidate top limbs,
	// x7/x13 the limbs just below them.
	orr x21, x8, x14 // check top-most limbs, ...
	cmp x21, #0
	csel x8, x8, x7, ne
	csel x14, x14, x13, ne
	csel x7, x7, x6, ne
	orr x21, x8, x14 // ... ones before top-most, ...
	csel x13, x13, x12, ne

	cmp x21, #0
	csel x8, x8, x7, ne
	csel x14, x14, x13, ne
	csel x7, x7, x5, ne
	orr x21, x8, x14 // ... and ones before that ...
	csel x13, x13, x11, ne

	cmp x21, #0
	csel x8, x8, x7, ne
	csel x14, x14, x13, ne
	csel x7, x7, x4, ne
	orr x21, x8, x14 // and one more, ...
	csel x13, x13, x10, ne

	cmp x21, #0
	csel x8, x8, x7, ne
	csel x14, x14, x13, ne
	csel x7, x7, x3, ne
	orr x21, x8, x14
	csel x13, x13, x9, ne

	// Normalize: shift the selected limb pair left so the joint top
	// bit of |a|,|b| is at bit 63 (clz of the OR gives the shift).
	clz x21, x21
	cmp x21, #64
	csel x21, x21, xzr, ne		// clz==64 (both zero) -> shift 0
	csel x8, x8, x7, ne
	csel x14, x14, x13, ne
	neg x22, x21

	lslv x8, x8, x21 // align high limbs to the left
	lslv x14, x14, x21
	lsrv x7, x7, x22		// bits promoted from the limb below
	lsrv x13, x13, x22
	and x7, x7, x22, asr#6		// mask is 0 when shift is 0
	and x13, x13, x22, asr#6
	orr x8, x8, x7
	orr x14, x14, x13

	// Splice the exact low 32 bits of each value into the bottom half.
	bfxil x8, x3, #0, #32
	bfxil x14, x9, #0, #32

	b __inner_loop_30		// tail-branch; inner loop does the ret
	ret				// unreachable (kept from original)
.size __ab_approximation_30,.-__ab_approximation_30
// __inner_loop_30
//
// 30 constant-time iterations of the binary-GCD ("divstep") inner loop
// on the 64-bit approximations a_ (x8) and b_ (x14), while tracking the
// transformation factors.  f0/g0 are packed into x17 and f1/g1 into x20
// as two biased 32-bit halves (bias 0x7FFFFFFF, so packed arithmetic
// never borrows across the halves); the Legendre parity accumulates in
// x2.  Every branch is replaced by csel so the timing is data-independent.
//
// In:       x8 = a_ approximation, x14 = b_ approximation, x2 = |L|
// Out:      x16,x17,x19,x20 = |f0|,|g0|,|f1|,|g1| (unbiased, signed);
//           x2 updated
// Clobbers: x8, x14, x21-x25, x27, x28, flags
.type __inner_loop_30, %function
.align 4
__inner_loop_30:
	mov x28, #30			// iteration counter
	mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
	mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1
	mov x27,#0x7FFFFFFF7FFFFFFF	// packed bias constant

.Loop_30:
	sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting
	and x25, x8, x14
	sub x28, x28, #1
	and x21, x14, x24		// b_ if a_ odd, else 0

	sub x22, x14, x8 // |b_|-|a_|
	subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even)
	add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1
	mov x21, x20
	csel x14, x14, x8, hs // |b_| = |a_|
	csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel x20, x20, x17, hs // exchange |fg0| and |fg1|
	csel x17, x17, x21, hs
	csel x2, x2, x25, hs		// take the swapped-L variant on borrow
	lsr x8, x8, #1			// a_ is now even: halve it
	and x21, x20, x24		// fg1 if a_ was odd, else 0
	and x22, x27, x24		// re-bias term, same condition
	add x23, x14, #2
	sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
	add x20, x20, x20 // |f1|<<=1
	add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5
	add x17, x17, x22		// restore the bias after the subtract
	sub x20, x20, x27		// remove the doubled bias after the shift

	cbnz x28, .Loop_30

	// Unpack the four factors and strip the 0x7FFFFFFF bias.
	mov x27, #0x7FFFFFFF
	ubfx x16, x17, #0, #32
	ubfx x17, x17, #32, #32
	ubfx x19, x20, #0, #32
	ubfx x20, x20, #32, #32
	sub x16, x16, x27 // remove the bias
	sub x17, x17, x27
	sub x19, x19, x27
	sub x20, x20, x27

	ret
.size __inner_loop_30,.-__inner_loop_30
// __inner_loop_48
//
// Tail version of the inner loop: x15 constant-time divstep iterations
// (the caller passes 48) on the exact single-word a_ (x8) and b_ (x14),
// updating only the Legendre accumulator x2 — no f/g factor tracking,
// since no further multi-limb updates follow.
//
// In:       x8 = a_, x14 = b_, x15 = iteration count, x2 = |L|
// Out:      x2 updated; x8, x14 reduced
// Clobbers: x15, x21-x25, flags
.type __inner_loop_48, %function
.align 4
__inner_loop_48:
.Loop_48:
	sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting
	and x25, x8, x14
	sub x15, x15, #1
	and x21, x14, x24		// b_ if a_ odd, else 0
	sub x22, x14, x8 // |b_|-|a_|
	subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even)
	add x25, x2, x25, lsr#1		// L + (a_ & b_) >> 1
	csel x14, x14, x8, hs // |b_| = |a_|
	csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel x2, x2, x25, hs		// take the swapped-L variant on borrow
	add x23, x14, #2
	lsr x8, x8, #1			// a_ is now even: halve it
	add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5

	cbnz x15, .Loop_48

	ret
.size __inner_loop_48,.-__inner_loop_48