// ftu/blst/elf/add_mod_384-armv8.S
// 2022-09-09 02:47:49 -04:00
//
// 932 lines
// 17 KiB
// ArmAsm

.text
.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
// void add_mod_384(ret=x0, a=x1, b=x2, mod=x3)
// ret = (a + b) mod m; all operands are 384-bit values held as six
// little-endian 64-bit limbs.
add_mod_384:
.inst 0xd503233f	// paciasp, encoded raw for assemblers without PAuth support
stp x29,x30,[sp,#-48]!	// frame: fp/lr + room for callee-saved x19-x22
add x29,sp,#0
stp x19,x20,[sp,#16]	// x19-x22 are callee-saved per AAPCS64
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]		// modulus limbs m[0..5] -> x4-x9 (expected by helper)
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __add_mod_384	// result limbs left in x10-x15
ldr x30,[sp,#8]		// restore lr clobbered by bl
stp x10,x11,[x0]	// store result
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48	// pop frame
.inst 0xd50323bf	// autiasp
ret
.size add_mod_384,.-add_mod_384
.type __add_mod_384,%function
.align 5
// Internal: 384-bit modular addition.
// In:  x1 -> a[6], x2 -> b[6]; modulus limbs pre-loaded in x4-x9 by caller.
// Out: x10-x15 = (a + b) mod m.
// Clobbers: x3, x16, x17, x19-x22, flags.
__add_mod_384:
ldp x10,x11,[x1]	// a limbs -> x10..x15
ldp x16,x17,[x2]	// b limbs -> x16,x17,x19..x22
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
// Secondary entry point for callers that already have both operands in registers.
__add_mod_384_ab_are_loaded:
adds x10,x10,x16	// sum = a + b, carry rippling through the limbs
adcs x11,x11,x17
adcs x12,x12,x19
adcs x13,x13,x20
adcs x14,x14,x21
adcs x15,x15,x22
adc x3,xzr,xzr		// x3 = carry out of bit 383 (sum can be 385 bits)
subs x16,x10,x4		// trial reduction: x16..x22 = sum - m
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr		// fold in carry-out; C clear (lo) iff sum < m
csel x10,x10,x16,lo	// constant-time select: keep sum if < m, else sum - m
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.size __add_mod_384,.-__add_mod_384
.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
// void add_mod_384x(ret=x0, a=x1, b=x2, mod=x3)
// Fp2 addition: adds the two 384-bit halves (offsets 0 and 48 bytes,
// i.e. real and imaginary parts) independently mod m.
add_mod_384x:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]		// modulus -> x4-x9, reused for both halves
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __add_mod_384	// first half: a[0] + b[0]
stp x10,x11,[x0]
add x1,x1,#48		// advance operand pointers to second half
stp x12,x13,[x0,#16]
add x2,x2,#48
stp x14,x15,[x0,#32]
bl __add_mod_384	// second half: a[1] + b[1]
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size add_mod_384x,.-add_mod_384x
.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
// void rshift_mod_384(ret=x0, a=x1, count=x2, mod=x3)
// ret = a / 2^count mod m, performed as count successive modular halvings.
// Assumes count >= 1 (loop decrements before testing).
rshift_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// a -> x10-x15 (kept in registers across iterations)
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]		// modulus -> x4-x9
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
.Loop_rshift_mod_384:
sub x2,x2,#1
bl __rshift_mod_384	// one modular division by 2
cbnz x2,.Loop_rshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size rshift_mod_384,.-rshift_mod_384
.type __rshift_mod_384,%function
.align 5
// Internal: one modular halving, x10-x15 = (x10-x15) / 2 mod m.
// If the value is odd, m (odd for field primes) is added first so the
// result is even, then the 385-bit quantity is shifted right by one.
// Modulus in x4-x9. Clobbers x16,x17,x19-x22, flags. Branch-free.
__rshift_mod_384:
sbfx x22,x10,#0,#1	// x22 = all-ones if a is odd, else zero
and x16,x22,x4		// mask modulus: m if odd, 0 if even
and x17,x22,x5
adds x10,x10,x16	// a += (a odd ? m : 0); result is even
and x19,x22,x6
adcs x11,x11,x17
and x20,x22,x7
adcs x12,x12,x19
and x21,x22,x8
adcs x13,x13,x20
and x22,x22,x9
adcs x14,x14,x21
extr x10,x11,x10,#1 // a[0:5] >>= 1
adcs x15,x15,x22
extr x11,x12,x11,#1
adc x22,xzr,xzr		// x22 = carry out of the addition (bit 384)
extr x12,x13,x12,#1
extr x13,x14,x13,#1
extr x14,x15,x14,#1
extr x15,x22,x15,#1	// shift the carry bit into the top limb
ret
.size __rshift_mod_384,.-__rshift_mod_384
.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
// void div_by_2_mod_384(ret=x0, a=x1, mod=x2)
// ret = a / 2 mod m. Note: modulus pointer is the THIRD argument (x2)
// here, unlike rshift_mod_384 where it is x3.
div_by_2_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// a -> x10-x15
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]		// modulus -> x4-x9
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __rshift_mod_384	// single modular halving
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size div_by_2_mod_384,.-div_by_2_mod_384
.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
// void lshift_mod_384(ret=x0, a=x1, count=x2, mod=x3)
// ret = a * 2^count mod m, as count successive modular doublings.
// Assumes count >= 1 (loop decrements before testing).
lshift_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// a -> x10-x15, kept live across iterations
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]		// modulus -> x4-x9
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
.Loop_lshift_mod_384:
sub x2,x2,#1
bl __lshift_mod_384	// one modular doubling
cbnz x2,.Loop_lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size lshift_mod_384,.-lshift_mod_384
.type __lshift_mod_384,%function
.align 5
// Internal: one modular doubling, x10-x15 = 2*(x10-x15) mod m.
// Same add-then-conditionally-reduce pattern as __add_mod_384 with b == a.
// Modulus in x4-x9. Clobbers x3,x16,x17,x19-x22, flags. Branch-free.
__lshift_mod_384:
adds x10,x10,x10	// double: a + a with carry ripple
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr		// x3 = carry out of bit 383
subs x16,x10,x4		// trial: 2a - m
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr		// C clear (lo) iff 2a < m
csel x10,x10,x16,lo	// constant-time select: 2a or 2a - m
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.size __lshift_mod_384,.-__lshift_mod_384
.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
// void mul_by_3_mod_384(ret=x0, a=x1, mod=x2)
// ret = 3*a mod m, computed as (2*a mod m) + a mod m.
mul_by_3_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// a -> x10-x15
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]		// modulus -> x4-x9 (third arg is x2 here)
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384	// x10-x15 = 2*a mod m
ldp x16,x17,[x1]	// reload a as second addend into x16,x17,x19-x22
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded	// x10-x15 = 2*a + a mod m
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384
.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
// void mul_by_8_mod_384(ret=x0, a=x1, mod=x2)
// ret = 8*a mod m, via three modular doublings.
mul_by_8_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// a -> x10-x15
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]		// modulus -> x4-x9
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384	// 2*a
bl __lshift_mod_384	// 4*a
bl __lshift_mod_384	// 8*a
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384
.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
// void mul_by_3_mod_384x(ret=x0, a=x1, mod=x2)
// Fp2 variant: ret = 3*a mod m applied to both 384-bit halves
// (offsets 0 and 48 bytes), each as 2*a + a.
mul_by_3_mod_384x:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// first half of a -> x10-x15
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]		// modulus -> x4-x9, reused for both halves
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384	// 2*a[0]
ldp x16,x17,[x1]	// reload a[0] as second addend
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded	// 3*a[0]
stp x10,x11,[x0]	// store first half, load second half of a
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384	// 2*a[1]
ldp x16,x17,[x1,#48]	// reload a[1] as second addend
ldp x19,x20,[x1,#64]
ldp x21,x22,[x1,#80]
bl __add_mod_384_ab_are_loaded	// 3*a[1]
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x
.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
// void mul_by_8_mod_384x(ret=x0, a=x1, mod=x2)
// Fp2 variant: ret = 8*a mod m on both 384-bit halves, three doublings each.
mul_by_8_mod_384x:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// first half of a -> x10-x15
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]		// modulus -> x4-x9
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384	// 2*a[0]
bl __lshift_mod_384	// 4*a[0]
bl __lshift_mod_384	// 8*a[0]
stp x10,x11,[x0]	// store first half, load second half
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384	// 2*a[1]
bl __lshift_mod_384	// 4*a[1]
bl __lshift_mod_384	// 8*a[1]
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x
.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
// void cneg_mod_384(ret=x0, a=x1, flag=x2, mod=x3)
// Conditional negation: ret = (flag != 0 && a != 0) ? m - a : a.
// The a != 0 guard keeps the result canonical (avoids producing m for a == 0).
// Branch-free / constant-time with respect to both flag and a.
cneg_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]	// a -> x10-x15
ldp x4,x5,[x3]		// modulus -> x4-x9
ldp x12,x13,[x1,#16]
ldp x6,x7,[x3,#16]
subs x16,x4,x10		// x16,x17,x19-x22 = m - a
ldp x14,x15,[x1,#32]
ldp x8,x9,[x3,#32]
orr x3,x10,x11		// accumulate OR of a's limbs for a zero test
sbcs x17,x5,x11
orr x3,x3,x12
sbcs x19,x6,x12
orr x3,x3,x13
sbcs x20,x7,x13
orr x3,x3,x14
sbcs x21,x8,x14
orr x3,x3,x15
sbc x22,x9,x15
cmp x3,#0
csetm x3,ne		// x3 = all-ones mask iff a != 0
ands x2,x2,x3		// Z set iff flag == 0 or a == 0
csel x10,x10,x16,eq	// keep a when not negating, else take m - a
csel x11,x11,x17,eq
csel x12,x12,x19,eq
csel x13,x13,x20,eq
stp x10,x11,[x0]	// stores interleaved with remaining selects
csel x14,x14,x21,eq
stp x12,x13,[x0,#16]
csel x15,x15,x22,eq
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size cneg_mod_384,.-cneg_mod_384
.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
// void sub_mod_384(ret=x0, a=x1, b=x2, mod=x3)
// ret = (a - b) mod m.
sub_mod_384:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]		// modulus -> x4-x9 (expected by helper)
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384	// result limbs in x10-x15
ldr x30,[sp,#8]		// restore lr clobbered by bl
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size sub_mod_384,.-sub_mod_384
.type __sub_mod_384,%function
.align 5
// Internal: 384-bit modular subtraction.
// In:  x1 -> a[6], x2 -> b[6]; modulus pre-loaded in x4-x9.
// Out: x10-x15 = (a - b) mod m: subtract, then add m back masked by
//      the borrow so the correction is branch-free.
// Clobbers: x3, x16, x17, x19-x22, flags.
__sub_mod_384:
ldp x10,x11,[x1]	// a limbs -> x10..x15
ldp x16,x17,[x2]	// b limbs -> x16,x17,x19..x22
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
subs x10,x10,x16	// a - b with borrow ripple
sbcs x11,x11,x17
sbcs x12,x12,x19
sbcs x13,x13,x20
sbcs x14,x14,x21
sbcs x15,x15,x22
sbc x3,xzr,xzr		// x3 = all-ones iff a < b (borrow), else zero
and x16,x4,x3		// modulus masked by the borrow
and x17,x5,x3
adds x10,x10,x16	// add m back only when the subtraction underflowed
and x19,x6,x3
adcs x11,x11,x17
and x20,x7,x3
adcs x12,x12,x19
and x21,x8,x3
adcs x13,x13,x20
and x22,x9,x3
adcs x14,x14,x21
adc x15,x15,x22
ret
.size __sub_mod_384,.-__sub_mod_384
.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
// void sub_mod_384x(ret=x0, a=x1, b=x2, mod=x3)
// Fp2 subtraction: subtracts both 384-bit halves (offsets 0 and 48) mod m.
sub_mod_384x:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]		// modulus -> x4-x9, reused for both halves
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384	// first half: a[0] - b[0]
stp x10,x11,[x0]
add x1,x1,#48		// advance operand pointers to second half
stp x12,x13,[x0,#16]
add x2,x2,#48
stp x14,x15,[x0,#32]
bl __sub_mod_384	// second half: a[1] - b[1]
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size sub_mod_384x,.-sub_mod_384x
.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
// void mul_by_1_plus_i_mod_384x(ret=x0, a=x1, mod=x2)
// Multiply an Fp2 element by (1 + i):
//   ret->re = a->re - a->im,  ret->im = a->re + a->im  (both mod m).
mul_by_1_plus_i_mod_384x:
.inst 0xd503233f	// paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x2]		// modulus -> x4-x9
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
add x2,x2,#48
bl __sub_mod_384 // a->re - a->im
ldp x16,x17,[x1]	// a->re as second addend for the upcoming add
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
stp x10,x11,[x0]	// store real part; load a->im as first addend
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __add_mod_384_ab_are_loaded // a->re + a->im
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]	// store imaginary part
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf	// autiasp
ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
// limb_t sgn0_pty_mod_384(a=x0, mod=x1)
// Returns parity and "sign" of a mod m packed in x0:
//   bit 0 = a & 1 (parity), bit 1 = 1 iff 2*a >= m (i.e. a > m/2).
// Leaf function: no frame, only volatile registers used. Branch-free.
sgn0_pty_mod_384:
ldp x10,x11,[x0]	// a -> x10-x15 (note: a pointer arrives in x0)
ldp x12,x13,[x0,#16]
ldp x14,x15,[x0,#32]
ldp x4,x5,[x1]		// modulus -> x4-x9
ldp x6,x7,[x1,#16]
ldp x8,x9,[x1,#32]
and x0,x10,#1		// bit 0 of result = parity of a
adds x10,x10,x10	// 2*a, carry-out into x3
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr
subs x10,x10,x4		// 2*a - m; final x3 all-ones iff 2*a < m
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x3,x3,xzr
mvn x3,x3		// invert: nonzero iff 2*a >= m
and x3,x3,#2		// isolate as bit 1
orr x0,x0,x3		// pack sign | parity
ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384
.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
// limb_t sgn0_pty_mod_384x(a=x0, mod=x1)
// Fp2 variant of sgn0_pty_mod_384. Computes parity and sign (2*v >= m)
// of both halves, plus zero-tests, and combines them:
//   bit 0 = parity of a->re, or of a->im when a->re == 0
//   bit 1 = sign of a->im, or of a->re when a->im == 0
// Leaf function, branch-free.
sgn0_pty_mod_384x:
ldp x10,x11,[x0]	// a->re -> x10-x15
ldp x12,x13,[x0,#16]
ldp x14,x15,[x0,#32]
ldp x4,x5,[x1]		// modulus -> x4-x9 (reused for both halves)
ldp x6,x7,[x1,#16]
ldp x8,x9,[x1,#32]
and x2,x10,#1		// x2 bit 0 = parity of a->re
orr x3,x10,x11		// x3 = OR of a->re limbs (zero test), built while doubling
adds x10,x10,x10	// 2 * a->re
orr x3,x3,x12
adcs x11,x11,x11
orr x3,x3,x13
adcs x12,x12,x12
orr x3,x3,x14
adcs x13,x13,x13
orr x3,x3,x15
adcs x14,x14,x14
adcs x15,x15,x15
adc x16,xzr,xzr		// carry out of the doubling
subs x10,x10,x4		// 2*re - m; final x16 all-ones iff 2*re < m
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x16,x16,xzr
ldp x10,x11,[x0,#48]	// a->im -> x10-x15
ldp x12,x13,[x0,#64]
ldp x14,x15,[x0,#80]
mvn x16,x16		// sign(re): nonzero iff 2*re >= m
and x16,x16,#2
orr x2,x2,x16		// x2 = prty(re) | sgn(re)<<1
and x0,x10,#1		// x0 bit 0 = parity of a->im
orr x1,x10,x11		// x1 = OR of a->im limbs (zero test)
adds x10,x10,x10	// 2 * a->im
orr x1,x1,x12
adcs x11,x11,x11
orr x1,x1,x13
adcs x12,x12,x12
orr x1,x1,x14
adcs x13,x13,x13
orr x1,x1,x15
adcs x14,x14,x14
adcs x15,x15,x15
adc x16,xzr,xzr
subs x10,x10,x4		// 2*im - m; final x16 all-ones iff 2*im < m
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x16,x16,xzr
mvn x16,x16		// sign(im)
and x16,x16,#2
orr x0,x0,x16		// x0 = prty(im) | sgn(im)<<1
cmp x3,#0
csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re)
cmp x1,#0
csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)
and x3,x3,#1
and x1,x1,#2
orr x0,x1,x3 // pack sign and parity
ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
.globl vec_select_48
.hidden vec_select_48
.type vec_select_48,%function
.align 5
// void vec_select_48(ret=x0, a=x1, b=x2, sel=x3)
// Constant-time 48-byte select: ret = sel ? a : b, with no data-dependent
// branches. Uses NEON BIT (bitwise insert if true) with a (sel == 0) mask.
vec_select_48:
dup v6.2d, x3		// broadcast selector
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48	// a -> v0-v2
cmeq v6.2d, v6.2d, #0	// v6 = all-ones iff sel == 0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48	// b -> v3-v5
bit v0.16b, v3.16b, v6.16b	// where sel==0, overwrite a's bits with b's
bit v1.16b, v4.16b, v6.16b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0]
ret
.size vec_select_48,.-vec_select_48
.globl vec_select_96
.hidden vec_select_96
.type vec_select_96,%function
.align 5
// void vec_select_96(ret=x0, a=x1, b=x2, sel=x3)
// Constant-time 96-byte select: ret = sel ? a : b. Two 48-byte chunks,
// loads interleaved with the BIT merges to hide latency.
vec_select_96:
dup v6.2d, x3		// broadcast selector
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0	// v6 = all-ones iff sel == 0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b	// where sel==0, take b's bits
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48	// second chunk of a
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48	// second chunk of b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_96,.-vec_select_96
.globl vec_select_192
.hidden vec_select_192
.type vec_select_192,%function
.align 5
// void vec_select_192(ret=x0, a=x1, b=x2, sel=x3)
// Constant-time 192-byte select: ret = sel ? a : b. Four 48-byte chunks,
// ping-ponging between the v0-v5 and v16-v21 register groups.
vec_select_192:
dup v6.2d, x3		// broadcast selector
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0	// v6 = all-ones iff sel == 0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b	// where sel==0, take b's bits
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48	// chunk 1 out
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48	// chunk 2 out
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48	// chunk 3 out
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]	// chunk 4 out
ret
.size vec_select_192,.-vec_select_192
.globl vec_select_144
.hidden vec_select_144
.type vec_select_144,%function
.align 5
// void vec_select_144(ret=x0, a=x1, b=x2, sel=x3)
// Constant-time 144-byte select: ret = sel ? a : b. Three 48-byte chunks.
vec_select_144:
dup v6.2d, x3		// broadcast selector
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0	// v6 = all-ones iff sel == 0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b	// where sel==0, take b's bits
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48	// chunk 1 out
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48	// chunk 2 out
bit v0.16b, v3.16b, v6.16b
bit v1.16b, v4.16b, v6.16b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0]	// chunk 3 out
ret
.size vec_select_144,.-vec_select_144
.globl vec_select_288
.hidden vec_select_288
.type vec_select_288,%function
.align 5
// void vec_select_288(ret=x0, a=x1, b=x2, sel=x3)
// Constant-time 288-byte select: ret = sel ? a : b. Six 48-byte chunks,
// alternating the v0-v5 and v16-v21 register groups to overlap loads,
// merges, and stores.
vec_select_288:
dup v6.2d, x3		// broadcast selector
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0	// v6 = all-ones iff sel == 0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b	// where sel==0, take b's bits
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48	// chunk 1 out
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48	// chunk 2 out
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48	// chunk 3 out
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48	// chunk 4 out
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48	// chunk 5 out
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]	// chunk 6 out
ret
.size vec_select_288,.-vec_select_288
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
// void vec_prefetch(ptr=x0, len=x1)
// Issues up to 7 L1 "keep" prefetches covering [ptr, ptr+len-1], fully
// unrolled and branch-free. Once the cursor would pass the last byte it is
// clamped there and the stride zeroed, so extra prfm's just re-hit that line.
vec_prefetch:
add x1, x1, x0		// x1 = address of last byte (ptr + len - 1)
sub x1, x1, #1
mov x2, #64		// stride = one cache line (assumed 64 B)
prfm pldl1keep, [x0]	// prefetch #1
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi	// clamp cursor to last byte once past the end
csel x2, xzr, x2, hi	// and stop advancing
prfm pldl1keep, [x0]	// prefetch #2
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]	// prefetch #3
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]	// prefetch #4
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]	// prefetch #5
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]	// prefetch #6
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
prfm pldl1keep, [x0]	// prefetch #7 (last line)
ret
.size vec_prefetch,.-vec_prefetch