// ftu/blst/elf/mul_mont_256-armv8.S
.text
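// AArch64 Montgomery arithmetic for 256-bit moduli: multiplication
// (mul_mont_sparse_256), squaring (sqr_mont_sparse_256), conversion out
// of the Montgomery domain (from_mont_256), 512-to-256-bit reduction
// (redc_mont_256) and the shared helper __mul_by_1_mont_256. "Sparse"
// appears to denote moduli that don't use all 256 bits (top bits clear,
// e.g. a 255-bit group order), so intermediate sums stay within the
// accumulator limbs used here.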
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,%function
.align 5
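// mul_mont_sparse_256(ret, a, b, p, n0) computes the Montgomery
// product ret = a*b/2^256 mod p. Per AAPCS64: x0 = ret, x1 = a,
// x2 = b, x3 = p, x4 = n0 = -p^-1 mod 2^64. Each multiplication by
// a word of b[] is interleaved with one Montgomery reduction step.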
mul_mont_sparse_256:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldp x10,x11,[x1]
ldr x9, [x2]
ldp x12,x13,[x1,#16]
mul x19,x10,x9
ldp x5,x6,[x3]
mul x20,x11,x9
ldp x7,x8,[x3,#16]
mul x21,x12,x9
mul x22,x13,x9
umulh x14,x10,x9
umulh x15,x11,x9
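// p[] is loaded, so x3 is reused for the reduction factor
// m = n0*acc[0] mod 2^64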
mul x3,x4,x19
umulh x16,x12,x9
umulh x17,x13,x9
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,xzr, x17
mul x17,x8,x3
ldr x9,[x2,8*1]
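// The reduction's mul x14,x5,x3 is elided: acc[0] + m*p[0] == 0
// (mod 2^64) by construction of m, so the low half would cancel
// acc[0] exactly, and its carry out is 1 unless acc[0] is zero.
// That is precisely the flag subs xzr,x19,#1 reconstructs.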
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
ldr x9,[x2,8*2]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
ldr x9,[x2,8*3]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
adcs x20,x21,x15
adcs x21,x22,x16
adcs x22,x23,x17
adc x23,xzr,xzr
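// final reduction: subtract p once, keep the difference unless it
// borrows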
subs x14,x19,x5
sbcs x15,x20,x6
sbcs x16,x21,x7
sbcs x17,x22,x8
sbcs xzr, x23,xzr
csel x19,x19,x14,lo
csel x20,x20,x15,lo
csel x21,x21,x16,lo
csel x22,x22,x17,lo
stp x19,x20,[x0]
stp x21,x22,[x0,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.size mul_mont_sparse_256,.-mul_mont_sparse_256
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,%function
.align 5
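// sqr_mont_sparse_256(ret, a, p, n0) computes ret = a^2/2^256 mod p,
// with x0 = ret, x1 = a, x2 = p and n0 moved from x3 into x4. The
// square is formed as the doubled cross products plus the diagonal
// terms, then Montgomery-reduced via __mul_by_1_mont_256.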
sqr_mont_sparse_256:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x5,x6,[x1]
ldp x7,x8,[x1,#16]
mov x4,x3
////////////////////////////////////////////////////////////////
//  |  |  |  |  |  |a1*a0|  |
//  |  |  |  |  |a2*a0|  |  |
//  |  |a3*a2|a3*a0|  |  |  |
//  |  |  |a2*a1|  |  |  |  |
//  |  |a3*a1|  |  |  |  |  |
// *|  |  |  |  |  |  |  | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
//  |--+--+--+--+--+--+--+--|
//  |A7|A6|A5|A4|A3|A2|A1|A0|, where A0-A3 are x10-x13 and
//                            A4-A7 are x19-x22
//
// The "can't overflow" comments below mark carries into the high
// part of a multiplication result, which can't overflow because
// the high part can never be all ones.
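// Written out, with a = a3*2^192 + a2*2^128 + a1*2^64 + a0, the
// diagram above computes
//   a^2 = a0^2 + a1^2*2^128 + a2^2*2^256 + a3^2*2^384
//       + 2*(a1*a0*2^64 + a2*a0*2^128 + (a3*a0 + a2*a1)*2^192
//            + a3*a1*2^256 + a3*a2*2^320)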
mul x11,x6,x5 // a[1]*a[0]
umulh x15,x6,x5
mul x12,x7,x5 // a[2]*a[0]
umulh x16,x7,x5
mul x13,x8,x5 // a[3]*a[0]
umulh x19,x8,x5
adds x12,x12,x15 // accumulate high parts of multiplication
mul x14,x7,x6 // a[2]*a[1]
umulh x15,x7,x6
adcs x13,x13,x16
mul x16,x8,x6 // a[3]*a[1]
umulh x17,x8,x6
adc x19,x19,xzr // can't overflow
mul x20,x8,x7 // a[3]*a[2]
umulh x21,x8,x7
adds x15,x15,x16 // accumulate high parts of multiplication
mul x10,x5,x5 // a[0]*a[0]
adc x16,x17,xzr // can't overflow
adds x13,x13,x14 // accumulate low parts of multiplication
umulh x5,x5,x5
adcs x19,x19,x15
mul x15,x6,x6 // a[1]*a[1]
adcs x20,x20,x16
umulh x6,x6,x6
adc x21,x21,xzr // can't overflow
adds x11,x11,x11 // acc[1-6]*=2
mul x16,x7,x7 // a[2]*a[2]
adcs x12,x12,x12
umulh x7,x7,x7
adcs x13,x13,x13
mul x17,x8,x8 // a[3]*a[3]
adcs x19,x19,x19
umulh x8,x8,x8
adcs x20,x20,x20
adcs x21,x21,x21
adc x22,xzr,xzr
adds x11,x11,x5 // +a[i]*a[i]
adcs x12,x12,x15
adcs x13,x13,x6
adcs x19,x19,x16
adcs x20,x20,x7
adcs x21,x21,x17
adc x22,x22,x8
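// Montgomery-reduce the low 256 bits (a multiplication by 1 in the
// Montgomery domain), then fold in the upper half of the square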
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
adds x10,x10,x19 // accumulate upper half
adcs x11,x11,x20
adcs x12,x12,x21
adcs x13,x13,x22
adc x19,xzr,xzr
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
sbcs xzr, x19,xzr
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf // autiasp
ret
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,%function
.align 5
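// from_mont_256(ret, a, p, n0) converts out of the Montgomery
// domain: ret = a/2^256 mod p, with x0 = ret, x1 = a, x2 = p and
// n0 moved from x3 into x4.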
from_mont_256:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x4,x3
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
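// the reduced value is at most p, so one conditional subtraction
// suffices and no carry word is needed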
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldr x29,[sp],#16
.inst 0xd50323bf // autiasp
ret
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,%function
.align 5
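// redc_mont_256(ret, a[8], p, n0) performs a full Montgomery
// reduction of a 512-bit input: ret = a/2^256 mod p, with x0 = ret,
// x1 = a, x2 = p and n0 moved from x3 into x4. The low 256 bits go
// through __mul_by_1_mont_256, then the high 256 bits are added.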
redc_mont_256:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x4,x3
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
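// add the upper 256 bits of the input; x9 collects the carry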
ldp x14,x15,[x1,#32]
ldp x16,x17,[x1,#48]
adds x10,x10,x14
adcs x11,x11,x15
adcs x12,x12,x16
adcs x13,x13,x17
adc x9,xzr,xzr
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
sbcs xzr, x9,xzr
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldr x29,[sp],#16
.inst 0xd50323bf // autiasp
ret
.size redc_mont_256,.-redc_mont_256
.type __mul_by_1_mont_256,%function
.align 5
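// __mul_by_1_mont_256: internal helper. Runs four Montgomery
// reduction steps on the accumulator in x10-x13, i.e. multiplies it
// by 1 in the Montgomery domain, dividing by 2^256 mod p. Expects
// the modulus pointer in x2 and n0 in x4; loads p[] into x5-x8
// (which callers reuse for the final conditional subtraction) and
// returns the result in x10-x13.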
__mul_by_1_mont_256:
mul x3,x4,x10
ldp x5,x6,[x2]
ldp x7,x8,[x2,#16]
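// as in mul_mont_sparse_256 above, the p[0] multiplication is
// elided: its low half cancels acc[0] (here x10), and
// subs xzr,x10,#1 reconstructs the carry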
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
adc x13,x9,x17
ret
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256