#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command line: <flavour> [<output>]
#
# With a flavour other than "void" the generated text is piped through
# arm-xlate.pl, which receives the flavour and the output name.  Otherwise
# the text is written as-is, to <output> when one is given and to the
# inherited STDOUT when it is not.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then in ../../perlasm/.
    # $dir keeps its trailing separator and is empty when $0 carries no
    # directory part, in which case the lookup is relative to the cwd.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failed pipe open used to go unnoticed and produced no translated
    # output at all; fail loudly instead.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined $output && length $output) {
    # Three-argument open: the name is never interpreted as a mode.  The
    # open is skipped when no name was given, so STDOUT stays as inherited.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Register allocation shared by every routine generated below.
#
# The first four argument registers double as the pointer/scalar arguments
# of the routines (result, first operand, second operand, modulus; several
# routines use them differently, see the individual headers).
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);

@mod=map("x$_",(4..9));            # six limbs of the modulus
@a=map("x$_",(10..15));            # six limbs of the accumulator/result
# Six scratch limbs.  x18 is skipped (presumably because it is the
# platform-reserved register); x19-x22 are what the public entry points
# save and restore around their bodies.
@b=map("x$_",(16,17,19..22));
# The carry word reuses the modulus pointer register, so $n_ptr is dead
# once any helper that writes $carry has run.
$carry=$n_ptr;

# add_mod_384: result at [$r_ptr], operands at [$a_ptr] and [$b_ptr],
# modulus at [$n_ptr]; every value is six 64-bit limbs.
# Loads the modulus into @mod, lets __add_mod_384 load the operands and
# form the sum with one conditional subtraction of the modulus, then
# stores @a at [$r_ptr].  x19-x22 are spilled because @b maps onto them;
# x30 is reloaded from the frame after the nested call.
$code.=<<___;
.text

.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
add_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @mod[2],@mod[3],[$n_ptr,#16]
    ldp @mod[4],@mod[5],[$n_ptr,#32]

    bl __add_mod_384
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size add_mod_384,.-add_mod_384

___

# __add_mod_384: internal helper, expects the modulus already in @mod.
# Loads @a from [$a_ptr] and @b from [$b_ptr], adds them with the carry-out
# kept in $carry, subtracts the modulus into @b, and selects with csel:
# "lo" (the subtraction borrowed) keeps the plain sum, otherwise the
# reduced value is taken.  Result is left in @a.
# Clobbers @b and $carry, i.e. the register that held $n_ptr.
# The secondary entry __add_mod_384_ab_are_loaded skips the loads for
# callers that have already filled @a and @b.
$code.=<<___;
.type __add_mod_384,%function
.align 5
__add_mod_384:
    ldp @a[0],@a[1],[$a_ptr]
    ldp @b[0],@b[1],[$b_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @b[2],@b[3],[$b_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]
    ldp @b[4],@b[5],[$b_ptr,#32]

__add_mod_384_ab_are_loaded:
    adds @a[0],@a[0],@b[0]
    adcs @a[1],@a[1],@b[1]
    adcs @a[2],@a[2],@b[2]
    adcs @a[3],@a[3],@b[3]
    adcs @a[4],@a[4],@b[4]
    adcs @a[5],@a[5],@b[5]
    adc $carry,xzr,xzr

    subs @b[0],@a[0],@mod[0]
    sbcs @b[1],@a[1],@mod[1]
    sbcs @b[2],@a[2],@mod[2]
    sbcs @b[3],@a[3],@mod[3]
    sbcs @b[4],@a[4],@mod[4]
    sbcs @b[5],@a[5],@mod[5]
    sbcs xzr,$carry,xzr

    csel @a[0],@a[0],@b[0],lo
    csel @a[1],@a[1],@b[1],lo
    csel @a[2],@a[2],@b[2],lo
    csel @a[3],@a[3],@b[3],lo
    csel @a[4],@a[4],@b[4],lo
    csel @a[5],@a[5],@b[5],lo

    ret
.size __add_mod_384,.-__add_mod_384

___

# add_mod_384x: same contract as add_mod_384 applied to two consecutive
# 48-byte halves of each operand (offsets 0 and 48).
# The modulus is loaded once; the second __add_mod_384 call relies on @mod
# still being live, because the first call has overwritten $n_ptr (it is
# the $carry register).  $a_ptr and $b_ptr are advanced by 48 between the
# two calls, interleaved with the stores of the first half.
$code.=<<___;
.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
add_mod_384x:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @mod[2],@mod[3],[$n_ptr,#16]
    ldp @mod[4],@mod[5],[$n_ptr,#32]

    bl __add_mod_384

    stp @a[0],@a[1],[$r_ptr]
    add $a_ptr,$a_ptr,#48
    stp @a[2],@a[3],[$r_ptr,#16]
    add $b_ptr,$b_ptr,#48
    stp @a[4],@a[5],[$r_ptr,#32]

    bl __add_mod_384
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#80]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size add_mod_384x,.-add_mod_384x

___

# rshift_mod_384: result at [$r_ptr], operand at [$a_ptr], iteration count
# in $b_ptr, modulus at [$n_ptr].
# Applies __rshift_mod_384 (one modular halving) $b_ptr times to the value
# held in @a and stores it.  The counter is decremented before the first
# call and tested afterwards, so a count of 0 would wrap around rather
# than return the input unchanged -- callers presumably never pass 0.
$code.=<<___;
.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
rshift_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @mod[2],@mod[3],[$n_ptr,#16]
    ldp @mod[4],@mod[5],[$n_ptr,#32]

.Loop_rshift_mod_384:
    sub $b_ptr,$b_ptr,#1
    bl __rshift_mod_384
    cbnz $b_ptr,.Loop_rshift_mod_384

    ldr x30,[sp,#8]
    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size rshift_mod_384,.-rshift_mod_384

___

# __rshift_mod_384: internal helper, halves the value in @a modulo @mod.
# sbfx turns bit 0 of @a[0] into an all-ones/all-zeros mask; the masked
# modulus is added to @a (so an odd value becomes even), the carry-out is
# captured in @b[5], and the 7-limb quantity is shifted right by one bit
# with a chain of extr instructions.  The extr steps are interleaved with
# the tail of the addition; each one only reads limbs that are already
# final.  Clobbers @b; leaves the result in @a.
$code.=<<___;
.type __rshift_mod_384,%function
.align 5
__rshift_mod_384:
    sbfx @b[5],@a[0],#0,#1
    and @b[0],@b[5],@mod[0]
    and @b[1],@b[5],@mod[1]
    adds @a[0],@a[0],@b[0]
    and @b[2],@b[5],@mod[2]
    adcs @a[1],@a[1],@b[1]
    and @b[3],@b[5],@mod[3]
    adcs @a[2],@a[2],@b[2]
    and @b[4],@b[5],@mod[4]
    adcs @a[3],@a[3],@b[3]
    and @b[5],@b[5],@mod[5]
    adcs @a[4],@a[4],@b[4]
    extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1
    adcs @a[5],@a[5],@b[5]
    extr @a[1],@a[2],@a[1],#1
    adc @b[5],xzr,xzr
    extr @a[2],@a[3],@a[2],#1
    extr @a[3],@a[4],@a[3],#1
    extr @a[4],@a[5],@a[4],#1
    extr @a[5],@b[5],@a[5],#1
    ret
.size __rshift_mod_384,.-__rshift_mod_384

___

# div_by_2_mod_384: result at [$r_ptr], operand at [$a_ptr]; note that the
# modulus pointer arrives in the third argument register ($b_ptr) here,
# not in $n_ptr.  Performs a single __rshift_mod_384 step and stores @a.
$code.=<<___;
.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
div_by_2_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$b_ptr]
    ldp @mod[2],@mod[3],[$b_ptr,#16]
    ldp @mod[4],@mod[5],[$b_ptr,#32]

    bl __rshift_mod_384

    ldr x30,[sp,#8]
    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size div_by_2_mod_384,.-div_by_2_mod_384

___

# lshift_mod_384: result at [$r_ptr], operand at [$a_ptr], iteration count
# in $b_ptr, modulus at [$n_ptr].
# Applies __lshift_mod_384 (one modular doubling) $b_ptr times.  @mod is
# loaded before the loop because the helper overwrites $n_ptr via $carry.
# As in rshift_mod_384, the counter is decremented before the first call,
# so a count of 0 would wrap -- callers presumably never pass 0.
$code.=<<___;
.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
lshift_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @mod[2],@mod[3],[$n_ptr,#16]
    ldp @mod[4],@mod[5],[$n_ptr,#32]

.Loop_lshift_mod_384:
    sub $b_ptr,$b_ptr,#1
    bl __lshift_mod_384
    cbnz $b_ptr,.Loop_lshift_mod_384

    ldr x30,[sp,#8]
    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size lshift_mod_384,.-lshift_mod_384

___

# __lshift_mod_384: internal helper, doubles the value in @a modulo @mod.
# @a is added to itself with the carry-out kept in $carry, the modulus is
# subtracted into @b, and csel keeps the doubled value when that
# subtraction borrowed ("lo") or the reduced one otherwise.
# Clobbers @b and $carry (the $n_ptr register).
$code.=<<___;
.type __lshift_mod_384,%function
.align 5
__lshift_mod_384:
    adds @a[0],@a[0],@a[0]
    adcs @a[1],@a[1],@a[1]
    adcs @a[2],@a[2],@a[2]
    adcs @a[3],@a[3],@a[3]
    adcs @a[4],@a[4],@a[4]
    adcs @a[5],@a[5],@a[5]
    adc $carry,xzr,xzr

    subs @b[0],@a[0],@mod[0]
    sbcs @b[1],@a[1],@mod[1]
    sbcs @b[2],@a[2],@mod[2]
    sbcs @b[3],@a[3],@mod[3]
    sbcs @b[4],@a[4],@mod[4]
    sbcs @b[5],@a[5],@mod[5]
    sbcs xzr,$carry,xzr

    csel @a[0],@a[0],@b[0],lo
    csel @a[1],@a[1],@b[1],lo
    csel @a[2],@a[2],@b[2],lo
    csel @a[3],@a[3],@b[3],lo
    csel @a[4],@a[4],@b[4],lo
    csel @a[5],@a[5],@b[5],lo

    ret
.size __lshift_mod_384,.-__lshift_mod_384

___

# mul_by_3_mod_384: result at [$r_ptr], operand at [$a_ptr], modulus
# pointer in the third argument register ($b_ptr).
# Computes 2*a with __lshift_mod_384, reloads the original operand into @b
# and adds it through __add_mod_384_ab_are_loaded, giving 2*a + a.
$code.=<<___;
.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
mul_by_3_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$b_ptr]
    ldp @mod[2],@mod[3],[$b_ptr,#16]
    ldp @mod[4],@mod[5],[$b_ptr,#32]

    bl __lshift_mod_384

    ldp @b[0],@b[1],[$a_ptr]
    ldp @b[2],@b[3],[$a_ptr,#16]
    ldp @b[4],@b[5],[$a_ptr,#32]

    bl __add_mod_384_ab_are_loaded
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384

___

# mul_by_8_mod_384: result at [$r_ptr], operand at [$a_ptr], modulus
# pointer in the third argument register ($b_ptr).
# Three consecutive modular doublings (__lshift_mod_384) of the value in @a.
$code.=<<___;
.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
mul_by_8_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$b_ptr]
    ldp @mod[2],@mod[3],[$b_ptr,#16]
    ldp @mod[4],@mod[5],[$b_ptr,#32]

    bl __lshift_mod_384
    bl __lshift_mod_384
    bl __lshift_mod_384
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384

___

# mul_by_3_mod_384x: mul_by_3_mod_384 applied to both 48-byte halves of the
# operand (offsets 0 and 48); modulus pointer in $b_ptr.
# The stores of the first result are interleaved with the loads of the
# second half; each limb pair is stored before its registers are reloaded.
$code.=<<___;
.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
mul_by_3_mod_384x:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$b_ptr]
    ldp @mod[2],@mod[3],[$b_ptr,#16]
    ldp @mod[4],@mod[5],[$b_ptr,#32]

    bl __lshift_mod_384

    ldp @b[0],@b[1],[$a_ptr]
    ldp @b[2],@b[3],[$a_ptr,#16]
    ldp @b[4],@b[5],[$a_ptr,#32]

    bl __add_mod_384_ab_are_loaded

    stp @a[0],@a[1],[$r_ptr]
    ldp @a[0],@a[1],[$a_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#16]
    ldp @a[2],@a[3],[$a_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#32]
    ldp @a[4],@a[5],[$a_ptr,#80]

    bl __lshift_mod_384

    ldp @b[0],@b[1],[$a_ptr,#48]
    ldp @b[2],@b[3],[$a_ptr,#64]
    ldp @b[4],@b[5],[$a_ptr,#80]

    bl __add_mod_384_ab_are_loaded
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#80]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x

___

# mul_by_8_mod_384x: mul_by_8_mod_384 applied to both 48-byte halves of the
# operand (offsets 0 and 48); modulus pointer in $b_ptr.
# Each half goes through three __lshift_mod_384 doublings; the first
# result is stored while the second half is being loaded.
$code.=<<___;
.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
mul_by_8_mod_384x:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]

    ldp @mod[0],@mod[1],[$b_ptr]
    ldp @mod[2],@mod[3],[$b_ptr,#16]
    ldp @mod[4],@mod[5],[$b_ptr,#32]

    bl __lshift_mod_384
    bl __lshift_mod_384
    bl __lshift_mod_384

    stp @a[0],@a[1],[$r_ptr]
    ldp @a[0],@a[1],[$a_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#16]
    ldp @a[2],@a[3],[$a_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#32]
    ldp @a[4],@a[5],[$a_ptr,#80]

    bl __lshift_mod_384
    bl __lshift_mod_384
    bl __lshift_mod_384
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#80]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x

___

# cneg_mod_384: result at [$r_ptr], operand at [$a_ptr], flag in $b_ptr,
# modulus at [$n_ptr].
# Computes mod - a into @b while OR-ing all limbs of a into $carry (the
# last load through $n_ptr is issued before $carry is first written, since
# both are the same register).  $carry is then turned into an all-ones
# mask when a is non-zero and AND-ed with the flag; if that is zero ("eq")
# the original a is stored, otherwise mod - a.  So a zero operand stays
# zero regardless of the flag.
# No nested call is made, hence x30 is not reloaded before the epilogue.
$code.=<<___;
.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
cneg_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @a[0],@a[1],[$a_ptr]
    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @mod[2],@mod[3],[$n_ptr,#16]

    subs @b[0],@mod[0],@a[0]
    ldp @a[4],@a[5],[$a_ptr,#32]
    ldp @mod[4],@mod[5],[$n_ptr,#32]
    orr $carry,@a[0],@a[1]
    sbcs @b[1],@mod[1],@a[1]
    orr $carry,$carry,@a[2]
    sbcs @b[2],@mod[2],@a[2]
    orr $carry,$carry,@a[3]
    sbcs @b[3],@mod[3],@a[3]
    orr $carry,$carry,@a[4]
    sbcs @b[4],@mod[4],@a[4]
    orr $carry,$carry,@a[5]
    sbc @b[5],@mod[5],@a[5]

    cmp $carry,#0
    csetm $carry,ne
    ands $b_ptr,$b_ptr,$carry

    csel @a[0],@a[0],@b[0],eq
    csel @a[1],@a[1],@b[1],eq
    csel @a[2],@a[2],@b[2],eq
    csel @a[3],@a[3],@b[3],eq
    stp @a[0],@a[1],[$r_ptr]
    csel @a[4],@a[4],@b[4],eq
    stp @a[2],@a[3],[$r_ptr,#16]
    csel @a[5],@a[5],@b[5],eq
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size cneg_mod_384,.-cneg_mod_384

___

# sub_mod_384: result at [$r_ptr], operands at [$a_ptr] and [$b_ptr],
# modulus at [$n_ptr].
# Loads the modulus into @mod, lets __sub_mod_384 compute a - b with a
# conditional add-back of the modulus, and stores @a at [$r_ptr].
$code.=<<___;
.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
sub_mod_384:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @mod[2],@mod[3],[$n_ptr,#16]
    ldp @mod[4],@mod[5],[$n_ptr,#32]

    bl __sub_mod_384
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr]
    stp @a[2],@a[3],[$r_ptr,#16]
    stp @a[4],@a[5],[$r_ptr,#32]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size sub_mod_384,.-sub_mod_384

___

# __sub_mod_384: internal helper, expects the modulus already in @mod.
# Loads @a from [$a_ptr] and @b from [$b_ptr] and subtracts; "sbc
# $carry,xzr,xzr" turns the final borrow into an all-ones/all-zeros mask,
# which gates the modulus that is then added back to @a.
# Result in @a; clobbers @b and $carry (the $n_ptr register).
$code.=<<___;
.type __sub_mod_384,%function
.align 5
__sub_mod_384:
    ldp @a[0],@a[1],[$a_ptr]
    ldp @b[0],@b[1],[$b_ptr]
    ldp @a[2],@a[3],[$a_ptr,#16]
    ldp @b[2],@b[3],[$b_ptr,#16]
    ldp @a[4],@a[5],[$a_ptr,#32]
    ldp @b[4],@b[5],[$b_ptr,#32]

    subs @a[0],@a[0],@b[0]
    sbcs @a[1],@a[1],@b[1]
    sbcs @a[2],@a[2],@b[2]
    sbcs @a[3],@a[3],@b[3]
    sbcs @a[4],@a[4],@b[4]
    sbcs @a[5],@a[5],@b[5]
    sbc $carry,xzr,xzr

    and @b[0],@mod[0],$carry
    and @b[1],@mod[1],$carry
    adds @a[0],@a[0],@b[0]
    and @b[2],@mod[2],$carry
    adcs @a[1],@a[1],@b[1]
    and @b[3],@mod[3],$carry
    adcs @a[2],@a[2],@b[2]
    and @b[4],@mod[4],$carry
    adcs @a[3],@a[3],@b[3]
    and @b[5],@mod[5],$carry
    adcs @a[4],@a[4],@b[4]
    adc @a[5],@a[5],@b[5]

    ret
.size __sub_mod_384,.-__sub_mod_384

___

# sub_mod_384x: sub_mod_384 applied to two consecutive 48-byte halves of
# each operand (offsets 0 and 48).
# @mod is loaded once up front; the second __sub_mod_384 call depends on it
# because the first call has overwritten $n_ptr through $carry.
$code.=<<___;
.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
sub_mod_384x:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @mod[0],@mod[1],[$n_ptr]
    ldp @mod[2],@mod[3],[$n_ptr,#16]
    ldp @mod[4],@mod[5],[$n_ptr,#32]

    bl __sub_mod_384

    stp @a[0],@a[1],[$r_ptr]
    add $a_ptr,$a_ptr,#48
    stp @a[2],@a[3],[$r_ptr,#16]
    add $b_ptr,$b_ptr,#48
    stp @a[4],@a[5],[$r_ptr,#32]

    bl __sub_mod_384
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#80]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size sub_mod_384x,.-sub_mod_384x

___

# mul_by_1_plus_i_mod_384x: result at [$r_ptr] (two 48-byte halves),
# operand at [$a_ptr] (re at offset 0, im at offset 48 per the inline
# comments), modulus pointer in $b_ptr.
# After the modulus is loaded, $b_ptr is repointed at the im half so that
# __sub_mod_384 yields re - im.  re is then loaded into @b before the first
# half of the result is stored, and im into @a, so
# __add_mod_384_ab_are_loaded yields re + im for the second half.
$code.=<<___;
.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
mul_by_1_plus_i_mod_384x:
    paciasp
    stp x29,x30,[sp,#-48]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]

    ldp @mod[0],@mod[1],[$b_ptr]
    ldp @mod[2],@mod[3],[$b_ptr,#16]
    ldp @mod[4],@mod[5],[$b_ptr,#32]
    add $b_ptr,$a_ptr,#48

    bl __sub_mod_384 // a->re - a->im

    ldp @b[0],@b[1],[$a_ptr]
    ldp @b[2],@b[3],[$a_ptr,#16]
    ldp @b[4],@b[5],[$a_ptr,#32]
    stp @a[0],@a[1],[$r_ptr]
    ldp @a[0],@a[1],[$a_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#16]
    ldp @a[2],@a[3],[$a_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#32]
    ldp @a[4],@a[5],[$a_ptr,#80]

    bl __add_mod_384_ab_are_loaded // a->re + a->im
    ldr x30,[sp,#8]

    stp @a[0],@a[1],[$r_ptr,#48]
    stp @a[2],@a[3],[$r_ptr,#64]
    stp @a[4],@a[5],[$r_ptr,#80]

    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldr x29,[sp],#48
    autiasp
    ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x

___

# sgn0_pty_mod_384: operand at [$r_ptr] (first argument register), modulus
# at [$a_ptr] (second); the packed result is returned in $r_ptr.
#   bit 0: least significant bit of the operand (parity);
#   bit 1: set when 2*operand - modulus does not go negative.
# The operand is doubled with its carry-out in $carry, the modulus is
# subtracted and the borrow folded into $carry, which ends up 0 (no net
# borrow) or all-ones; mvn + and #2 turn that into bit 1.
# Leaf routine: no stack frame, only caller-saved registers are written.
$code.=<<___;
.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
sgn0_pty_mod_384:
    ldp @a[0],@a[1],[$r_ptr]
    ldp @a[2],@a[3],[$r_ptr,#16]
    ldp @a[4],@a[5],[$r_ptr,#32]

    ldp @mod[0],@mod[1],[$a_ptr]
    ldp @mod[2],@mod[3],[$a_ptr,#16]
    ldp @mod[4],@mod[5],[$a_ptr,#32]

    and $r_ptr,@a[0],#1
    adds @a[0],@a[0],@a[0]
    adcs @a[1],@a[1],@a[1]
    adcs @a[2],@a[2],@a[2]
    adcs @a[3],@a[3],@a[3]
    adcs @a[4],@a[4],@a[4]
    adcs @a[5],@a[5],@a[5]
    adc $carry,xzr,xzr

    subs @a[0],@a[0],@mod[0]
    sbcs @a[1],@a[1],@mod[1]
    sbcs @a[2],@a[2],@mod[2]
    sbcs @a[3],@a[3],@mod[3]
    sbcs @a[4],@a[4],@mod[4]
    sbcs @a[5],@a[5],@mod[5]
    sbc $carry,$carry,xzr

    mvn $carry,$carry
    and $carry,$carry,#2
    orr $r_ptr,$r_ptr,$carry

    ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384

___

# sgn0_pty_mod_384x: operand at [$r_ptr] as two 48-byte halves (re at
# offset 0, im at offset 48 per the inline comments), modulus at [$a_ptr];
# packed result returned in $r_ptr.
# Each half is processed as in sgn0_pty_mod_384, giving a (parity, sign)
# pair: the pair for re is kept in $b_ptr, the pair for im in $r_ptr.
# Alongside, all limbs of re are OR-ed into $n_ptr and all limbs of im
# into $a_ptr (after the modulus has been loaded) as zero tests.  The
# final selection takes the parity bit from im when re is zero, else from
# re, and the sign bit from im when im is non-zero, else from re.
# Leaf routine: only @b[0] of the scratch set is used, no stack frame.
$code.=<<___;
.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
sgn0_pty_mod_384x:
    ldp @a[0],@a[1],[$r_ptr]
    ldp @a[2],@a[3],[$r_ptr,#16]
    ldp @a[4],@a[5],[$r_ptr,#32]

    ldp @mod[0],@mod[1],[$a_ptr]
    ldp @mod[2],@mod[3],[$a_ptr,#16]
    ldp @mod[4],@mod[5],[$a_ptr,#32]

    and $b_ptr,@a[0],#1
    orr $n_ptr,@a[0],@a[1]
    adds @a[0],@a[0],@a[0]
    orr $n_ptr,$n_ptr,@a[2]
    adcs @a[1],@a[1],@a[1]
    orr $n_ptr,$n_ptr,@a[3]
    adcs @a[2],@a[2],@a[2]
    orr $n_ptr,$n_ptr,@a[4]
    adcs @a[3],@a[3],@a[3]
    orr $n_ptr,$n_ptr,@a[5]
    adcs @a[4],@a[4],@a[4]
    adcs @a[5],@a[5],@a[5]
    adc @b[0],xzr,xzr

    subs @a[0],@a[0],@mod[0]
    sbcs @a[1],@a[1],@mod[1]
    sbcs @a[2],@a[2],@mod[2]
    sbcs @a[3],@a[3],@mod[3]
    sbcs @a[4],@a[4],@mod[4]
    sbcs @a[5],@a[5],@mod[5]
    sbc @b[0],@b[0],xzr

    ldp @a[0],@a[1],[$r_ptr,#48]
    ldp @a[2],@a[3],[$r_ptr,#64]
    ldp @a[4],@a[5],[$r_ptr,#80]

    mvn @b[0],@b[0]
    and @b[0],@b[0],#2
    orr $b_ptr,$b_ptr,@b[0]

    and $r_ptr,@a[0],#1
    orr $a_ptr,@a[0],@a[1]
    adds @a[0],@a[0],@a[0]
    orr $a_ptr,$a_ptr,@a[2]
    adcs @a[1],@a[1],@a[1]
    orr $a_ptr,$a_ptr,@a[3]
    adcs @a[2],@a[2],@a[2]
    orr $a_ptr,$a_ptr,@a[4]
    adcs @a[3],@a[3],@a[3]
    orr $a_ptr,$a_ptr,@a[5]
    adcs @a[4],@a[4],@a[4]
    adcs @a[5],@a[5],@a[5]
    adc @b[0],xzr,xzr

    subs @a[0],@a[0],@mod[0]
    sbcs @a[1],@a[1],@mod[1]
    sbcs @a[2],@a[2],@mod[2]
    sbcs @a[3],@a[3],@mod[3]
    sbcs @a[4],@a[4],@mod[4]
    sbcs @a[5],@a[5],@mod[5]
    sbc @b[0],@b[0],xzr

    mvn @b[0],@b[0]
    and @b[0],@b[0],#2
    orr $r_ptr,$r_ptr,@b[0]

    cmp $n_ptr,#0
    csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re)

    cmp $a_ptr,#0
    csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)

    and $n_ptr,$n_ptr,#1
    and $a_ptr,$a_ptr,#2
    orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity

    ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
___

if (1) {
# vec_select($sz): appends to $code a routine vec_select_$sz that copies
# $sz bytes to [$r_ptr] from either [$a_ptr] or [$b_ptr], chosen by the
# scalar in $n_ptr without branching on it.
#
# v6 is built as a mask that is all-ones when $n_ptr is zero (dup + cmeq
# against #0); "bit" then inserts the [$b_ptr] data into the [$a_ptr] data
# wherever the mask is set.  Hence: $n_ptr != 0 selects [$a_ptr],
# $n_ptr == 0 selects [$b_ptr].
#
# Data moves in 48-byte steps (three 16-byte vectors per source), so $sz is
# expected to be a multiple of 48, as are all sizes requested below.  Two
# banks of six registers (v0-v5 and v16-v21) are rotated each step so the
# loads for the next chunk can be issued while the current one is being
# merged and stored.
sub vec_select {
my $sz = shift;
my @v=map("v$_",(0..5,16..21));

$code.=<<___;
.globl vec_select_$sz
.hidden vec_select_$sz
.type vec_select_$sz,%function
.align 5
vec_select_$sz:
    dup v6.2d, $n_ptr
    ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
    cmeq v6.2d, v6.2d, #0
    ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
___
# One iteration per 48-byte chunk except the last, which needs no further
# loads.  The counter is lexical so the caller's \$i is left alone.
for(my $i=0; $i<$sz-48; $i+=48) {
$code.=<<___;
    bit @v[0].16b, @v[3].16b, v6.16b
    ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
    bit @v[1].16b, @v[4].16b, v6.16b
    ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
    bit @v[2].16b, @v[5].16b, v6.16b
    st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
___
    @v = @v[6..11,0..5];    # swap register banks
}
$code.=<<___;
    bit @v[0].16b, @v[3].16b, v6.16b
    bit @v[1].16b, @v[4].16b, v6.16b
    bit @v[2].16b, @v[5].16b, v6.16b
    st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
    ret
.size vec_select_$sz,.-vec_select_$sz
___
}
vec_select(48);
vec_select(96);
vec_select(192);
vec_select(144);
vec_select(288);
}

{
# vec_prefetch: first argument register is the start address, the second
# arrives holding a byte count and is converted to the address of the last
# byte (start + count - 1).
# Issues seven "prfm pldl1keep" hints.  After each hint the address is
# advanced by $step (64 bytes, presumably one cache line) and clamped to
# the last byte; once clamped, $step is zeroed so the remaining hints keep
# hitting that same address instead of running past the buffer.
my ($inp, $end, $step) = map("x$_", (0..2));

$code.=<<___;
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
vec_prefetch:
    add $end, $end, $inp
    sub $end, $end, #1
    mov $step, #64
___
# Five identical hint/advance/clamp steps, emitted unrolled.
for (1..5) {
$code.=<<___;
    prfm pldl1keep, [$inp]
    add $inp, $inp, $step
    cmp $inp, $end
    csel $inp, $end, $inp, hi
    csel $step, xzr, $step, hi
___
}
# Sixth step no longer needs to update $step; the seventh hint is the last.
$code.=<<___;
    prfm pldl1keep, [$inp]
    add $inp, $inp, $step
    cmp $inp, $end
    csel $inp, $end, $inp, hi
    prfm pldl1keep, [$inp]
    ret
.size vec_prefetch,.-vec_prefetch
___
}

# Emit everything accumulated in $code.  Closing STDOUT flushes buffered
# output and, when STDOUT is the pipe to arm-xlate.pl, waits for that
# child: a false return with $! unset means the child exited non-zero
# (status in $?).  Either failure would otherwise leave a truncated or
# missing output file behind without any diagnostic.
print $code;

close STDOUT
    or die($! ? "error closing STDOUT: $!"
              : "arm-xlate.pl exited with status $?");