#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for AArch64 assembly implementing 384-bit (6 x 64-bit limb)
# modular helpers: add/sub/shift/negate, small-constant multiplications,
# sign/parity extraction, constant-time vector select and a prefetch helper.
#
# Usage: perl <this script> [flavour [output]]
#   flavour - passed through to arm-xlate.pl; when empty or "void" the raw
#             perlasm output is emitted without translation.
#   output  - output file name; when omitted the text goes to the STDOUT
#             inherited from the caller.

$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm/.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed below is piped through the translator.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # Only redirect when a file name was actually supplied; otherwise keep
    # writing to the STDOUT we were started with.
    open STDOUT,">$output"
        or die "can't open $output: $!";
}

# Argument registers of the generated routines (x0..x3).
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);

# Register allocation for the 6-limb operands.
@mod=map("x$_",(4..9));             # modulus limbs
@a=map("x$_",(10..15));             # first operand / result limbs
@b=map("x$_",(16,17,19..22));       # second operand / scratch; x18 is skipped
                                    # (platform register in the AArch64 PCS)
# NOTE: $carry aliases $n_ptr (x3), so the modulus has to be loaded into
# @mod before any instruction writes $carry.
$carry=$n_ptr;

# Common shape of the public routines below: the prologue signs the return
# address (paciasp), pushes x29/x30 and saves x19-x22 (used by @b); after a
# `bl` to an internal helper x30 is reloaded from the frame ([sp,#8]); the
# epilogue restores x19-x22/x29 and authenticates the return address.

########################################################################
# add_mod_384: x0 = result, x1 = a, x2 = b, x3 = modulus.
# Stores the output of __add_mod_384 (a + b with one conditional
# subtraction of the modulus) to [x0].
$code.=<<___;
.text

.globl	add_mod_384
.hidden	add_mod_384
.type	add_mod_384,%function
.align	5
add_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__add_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	add_mod_384,.-add_mod_384
___

# __add_mod_384 (internal): loads a from [x1] and b from [x2], then falls
# through to __add_mod_384_ab_are_loaded, which expects a in @a, b in @b and
# the modulus in @mod.  It computes the 385-bit sum (top bit in $carry),
# subtracts the modulus into @b and keeps the unsubtracted sum only when
# that subtraction borrowed ("lo").  Result is left in @a; @b and $carry
# are clobbered.
$code.=<<___;

.type	__add_mod_384,%function
.align	5
__add_mod_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@b[0],@b[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@b[2],@b[3],[$b_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@b[4],@b[5],[$b_ptr,#32]

__add_mod_384_ab_are_loaded:
	adds	@a[0],@a[0],@b[0]
	adcs	@a[1],@a[1],@b[1]
	adcs	@a[2],@a[2],@b[2]
	adcs	@a[3],@a[3],@b[3]
	adcs	@a[4],@a[4],@b[4]
	adcs	@a[5],@a[5],@b[5]
	adc	$carry,xzr,xzr

	subs	@b[0],@a[0],@mod[0]
	sbcs	@b[1],@a[1],@mod[1]
	sbcs	@b[2],@a[2],@mod[2]
	sbcs	@b[3],@a[3],@mod[3]
	sbcs	@b[4],@a[4],@mod[4]
	sbcs	@b[5],@a[5],@mod[5]
	sbcs	xzr,$carry,xzr

	csel	@a[0],@a[0],@b[0],lo
	csel	@a[1],@a[1],@b[1],lo
	csel	@a[2],@a[2],@b[2],lo
	csel	@a[3],@a[3],@b[3],lo
	csel	@a[4],@a[4],@b[4],lo
	csel	@a[5],@a[5],@b[5],lo

	ret
.size	__add_mod_384,.-__add_mod_384
___

# add_mod_384x: same as add_mod_384 applied to two consecutive 48-byte
# values: [x1]+[x2] -> [x0] and [x1+48]+[x2+48] -> [x0+48].
# x1 and x2 are advanced by 48 in the process.
$code.=<<___;

.globl	add_mod_384x
.hidden	add_mod_384x
.type	add_mod_384x,%function
.align	5
add_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__add_mod_384

	stp	@a[0],@a[1],[$r_ptr]
	add	$a_ptr,$a_ptr,#48
	stp	@a[2],@a[3],[$r_ptr,#16]
	add	$b_ptr,$b_ptr,#48
	stp	@a[4],@a[5],[$r_ptr,#32]

	bl	__add_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	add_mod_384x,.-add_mod_384x
___

# rshift_mod_384: x0 = result, x1 = a, x2 = count, x3 = modulus.
# Applies __rshift_mod_384 `count` times.  The counter is decremented
# before it is tested, so a count of 0 is not handled (it would wrap
# around); callers must pass a non-zero count.
$code.=<<___;

.globl	rshift_mod_384
.hidden	rshift_mod_384
.type	rshift_mod_384,%function
.align	5
rshift_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

.Loop_rshift_mod_384:
	sub	$b_ptr,$b_ptr,#1
	bl	__rshift_mod_384
	cbnz	$b_ptr,.Loop_rshift_mod_384

	ldr	x30,[sp,#8]
	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	rshift_mod_384,.-rshift_mod_384
___

# __rshift_mod_384 (internal): one modular halving of the value in @a.
# Bit 0 of a is sign-extended into a mask (sbfx); the modulus is added
# under that mask (i.e. only when a is odd) and the resulting 385-bit
# value is shifted right by one bit with extr.  Clobbers @b.
$code.=<<___;

.type	__rshift_mod_384,%function
.align	5
__rshift_mod_384:
	sbfx	@b[5],@a[0],#0,#1
	and	@b[0],@b[5],@mod[0]
	and	@b[1],@b[5],@mod[1]
	adds	@a[0],@a[0],@b[0]
	and	@b[2],@b[5],@mod[2]
	adcs	@a[1],@a[1],@b[1]
	and	@b[3],@b[5],@mod[3]
	adcs	@a[2],@a[2],@b[2]
	and	@b[4],@b[5],@mod[4]
	adcs	@a[3],@a[3],@b[3]
	and	@b[5],@b[5],@mod[5]
	adcs	@a[4],@a[4],@b[4]
	extr	@a[0],@a[1],@a[0],#1	// a[0:5] >>= 1
	adcs	@a[5],@a[5],@b[5]
	extr	@a[1],@a[2],@a[1],#1
	adc	@b[5],xzr,xzr
	extr	@a[2],@a[3],@a[2],#1
	extr	@a[3],@a[4],@a[3],#1
	extr	@a[4],@a[5],@a[4],#1
	extr	@a[5],@b[5],@a[5],#1
	ret
.size	__rshift_mod_384,.-__rshift_mod_384
___

# div_by_2_mod_384: x0 = result, x1 = a, x2 = modulus (note: the modulus
# is the third argument here).  A single __rshift_mod_384 step.
$code.=<<___;

.globl	div_by_2_mod_384
.hidden	div_by_2_mod_384
.type	div_by_2_mod_384,%function
.align	5
div_by_2_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__rshift_mod_384

	ldr	x30,[sp,#8]
	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	div_by_2_mod_384,.-div_by_2_mod_384
___

# lshift_mod_384: x0 = result, x1 = a, x2 = count, x3 = modulus.
# Applies __lshift_mod_384 `count` times.  As with rshift_mod_384 the
# counter is decremented before the test, so count must be non-zero.
$code.=<<___;

.globl	lshift_mod_384
.hidden	lshift_mod_384
.type	lshift_mod_384,%function
.align	5
lshift_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

.Loop_lshift_mod_384:
	sub	$b_ptr,$b_ptr,#1
	bl	__lshift_mod_384
	cbnz	$b_ptr,.Loop_lshift_mod_384

	ldr	x30,[sp,#8]
	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	lshift_mod_384,.-lshift_mod_384
___

# __lshift_mod_384 (internal): one modular doubling of the value in @a.
# a is added to itself (top bit in $carry), the modulus is subtracted
# into @b and the doubled value is kept only when that subtraction
# borrowed.  Clobbers @b and $carry (x3).
$code.=<<___;

.type	__lshift_mod_384,%function
.align	5
__lshift_mod_384:
	adds	@a[0],@a[0],@a[0]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$carry,xzr,xzr

	subs	@b[0],@a[0],@mod[0]
	sbcs	@b[1],@a[1],@mod[1]
	sbcs	@b[2],@a[2],@mod[2]
	sbcs	@b[3],@a[3],@mod[3]
	sbcs	@b[4],@a[4],@mod[4]
	sbcs	@b[5],@a[5],@mod[5]
	sbcs	xzr,$carry,xzr

	csel	@a[0],@a[0],@b[0],lo
	csel	@a[1],@a[1],@b[1],lo
	csel	@a[2],@a[2],@b[2],lo
	csel	@a[3],@a[3],@b[3],lo
	csel	@a[4],@a[4],@b[4],lo
	csel	@a[5],@a[5],@b[5],lo

	ret
.size	__lshift_mod_384,.-__lshift_mod_384
___

# mul_by_3_mod_384: x0 = result, x1 = a, x2 = modulus.
# Doubles a once, then reloads the original a into @b and adds it.
$code.=<<___;

.globl	mul_by_3_mod_384
.hidden	mul_by_3_mod_384
.type	mul_by_3_mod_384,%function
.align	5
mul_by_3_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384

	ldp	@b[0],@b[1],[$a_ptr]
	ldp	@b[2],@b[3],[$a_ptr,#16]
	ldp	@b[4],@b[5],[$a_ptr,#32]

	bl	__add_mod_384_ab_are_loaded
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_3_mod_384,.-mul_by_3_mod_384
___

# mul_by_8_mod_384: x0 = result, x1 = a, x2 = modulus.
# Three consecutive modular doublings.
$code.=<<___;

.globl	mul_by_8_mod_384
.hidden	mul_by_8_mod_384
.type	mul_by_8_mod_384,%function
.align	5
mul_by_8_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384
	bl	__lshift_mod_384
	bl	__lshift_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_8_mod_384,.-mul_by_8_mod_384
___

# mul_by_3_mod_384x: mul_by_3_mod_384 applied to the two consecutive
# 48-byte values at [x1] and [x1+48]; results go to [x0] and [x0+48].
# Stores of the first result are interleaved with loads of the second
# input.
$code.=<<___;

.globl	mul_by_3_mod_384x
.hidden	mul_by_3_mod_384x
.type	mul_by_3_mod_384x,%function
.align	5
mul_by_3_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384

	ldp	@b[0],@b[1],[$a_ptr]
	ldp	@b[2],@b[3],[$a_ptr,#16]
	ldp	@b[4],@b[5],[$a_ptr,#32]

	bl	__add_mod_384_ab_are_loaded

	stp	@a[0],@a[1],[$r_ptr]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#32]
	ldp	@a[4],@a[5],[$a_ptr,#80]

	bl	__lshift_mod_384

	ldp	@b[0],@b[1],[$a_ptr,#48]
	ldp	@b[2],@b[3],[$a_ptr,#64]
	ldp	@b[4],@b[5],[$a_ptr,#80]

	bl	__add_mod_384_ab_are_loaded
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_3_mod_384x,.-mul_by_3_mod_384x
___

# mul_by_8_mod_384x: mul_by_8_mod_384 applied to the two consecutive
# 48-byte values at [x1] and [x1+48]; results go to [x0] and [x0+48].
$code.=<<___;

.globl	mul_by_8_mod_384x
.hidden	mul_by_8_mod_384x
.type	mul_by_8_mod_384x,%function
.align	5
mul_by_8_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384
	bl	__lshift_mod_384
	bl	__lshift_mod_384

	stp	@a[0],@a[1],[$r_ptr]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#32]
	ldp	@a[4],@a[5],[$a_ptr,#80]

	bl	__lshift_mod_384
	bl	__lshift_mod_384
	bl	__lshift_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_8_mod_384x,.-mul_by_8_mod_384x
___

# cneg_mod_384: x0 = result, x1 = a, x2 = flag, x3 = modulus.
# Computes modulus - a into @b while OR-ing all limbs of a into $carry.
# The negated value is selected only when both the flag and a are
# non-zero; otherwise a is stored unchanged.  $carry (x3) is first written
# after the last modulus load, so the aliasing with $n_ptr is safe.
$code.=<<___;

.globl	cneg_mod_384
.hidden	cneg_mod_384
.type	cneg_mod_384,%function
.align	5
cneg_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]

	subs	@b[0],@mod[0],@a[0]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]
	orr	$carry,@a[0],@a[1]
	sbcs	@b[1],@mod[1],@a[1]
	orr	$carry,$carry,@a[2]
	sbcs	@b[2],@mod[2],@a[2]
	orr	$carry,$carry,@a[3]
	sbcs	@b[3],@mod[3],@a[3]
	orr	$carry,$carry,@a[4]
	sbcs	@b[4],@mod[4],@a[4]
	orr	$carry,$carry,@a[5]
	sbc	@b[5],@mod[5],@a[5]

	cmp	$carry,#0
	csetm	$carry,ne
	ands	$b_ptr,$b_ptr,$carry

	csel	@a[0],@a[0],@b[0],eq
	csel	@a[1],@a[1],@b[1],eq
	csel	@a[2],@a[2],@b[2],eq
	csel	@a[3],@a[3],@b[3],eq
	stp	@a[0],@a[1],[$r_ptr]
	csel	@a[4],@a[4],@b[4],eq
	stp	@a[2],@a[3],[$r_ptr,#16]
	csel	@a[5],@a[5],@b[5],eq
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	cneg_mod_384,.-cneg_mod_384
___

# sub_mod_384: x0 = result, x1 = a, x2 = b, x3 = modulus.
# Stores the output of __sub_mod_384 to [x0].
$code.=<<___;

.globl	sub_mod_384
.hidden	sub_mod_384
.type	sub_mod_384,%function
.align	5
sub_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__sub_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	sub_mod_384,.-sub_mod_384
___

# __sub_mod_384 (internal): loads a from [x1] and b from [x2], computes
# a - b, turns the final borrow into an all-ones/zero mask in $carry and
# adds the modulus back under that mask.  Result in @a; clobbers @b and
# $carry (x3).
$code.=<<___;

.type	__sub_mod_384,%function
.align	5
__sub_mod_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@b[0],@b[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@b[2],@b[3],[$b_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@b[4],@b[5],[$b_ptr,#32]

	subs	@a[0],@a[0],@b[0]
	sbcs	@a[1],@a[1],@b[1]
	sbcs	@a[2],@a[2],@b[2]
	sbcs	@a[3],@a[3],@b[3]
	sbcs	@a[4],@a[4],@b[4]
	sbcs	@a[5],@a[5],@b[5]
	sbc	$carry,xzr,xzr

	and	@b[0],@mod[0],$carry
	and	@b[1],@mod[1],$carry
	adds	@a[0],@a[0],@b[0]
	and	@b[2],@mod[2],$carry
	adcs	@a[1],@a[1],@b[1]
	and	@b[3],@mod[3],$carry
	adcs	@a[2],@a[2],@b[2]
	and	@b[4],@mod[4],$carry
	adcs	@a[3],@a[3],@b[3]
	and	@b[5],@mod[5],$carry
	adcs	@a[4],@a[4],@b[4]
	adc	@a[5],@a[5],@b[5]

	ret
.size	__sub_mod_384,.-__sub_mod_384
___

# sub_mod_384x: same as sub_mod_384 applied to two consecutive 48-byte
# values: [x1]-[x2] -> [x0] and [x1+48]-[x2+48] -> [x0+48].
# x1 and x2 are advanced by 48 in the process.
$code.=<<___;

.globl	sub_mod_384x
.hidden	sub_mod_384x
.type	sub_mod_384x,%function
.align	5
sub_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__sub_mod_384

	stp	@a[0],@a[1],[$r_ptr]
	add	$a_ptr,$a_ptr,#48
	stp	@a[2],@a[3],[$r_ptr,#16]
	add	$b_ptr,$b_ptr,#48
	stp	@a[4],@a[5],[$r_ptr,#32]

	bl	__sub_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	sub_mod_384x,.-sub_mod_384x
___

# mul_by_1_plus_i_mod_384x: x0 = result, x1 = a (re at +0, im at +48),
# x2 = modulus.  Writes re - im to [x0] and re + im to [x0+48].
# x2 is repointed at a->im so that __sub_mod_384 can be reused.
$code.=<<___;

.globl	mul_by_1_plus_i_mod_384x
.hidden	mul_by_1_plus_i_mod_384x
.type	mul_by_1_plus_i_mod_384x,%function
.align	5
mul_by_1_plus_i_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]
	add	$b_ptr,$a_ptr,#48

	bl	__sub_mod_384			// a->re - a->im

	ldp	@b[0],@b[1],[$a_ptr]
	ldp	@b[2],@b[3],[$a_ptr,#16]
	ldp	@b[4],@b[5],[$a_ptr,#32]
	stp	@a[0],@a[1],[$r_ptr]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#32]
	ldp	@a[4],@a[5],[$a_ptr,#80]

	bl	__add_mod_384_ab_are_loaded	// a->re + a->im
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
___

# sgn0_pty_mod_384: x0 = a, x1 = modulus; leaf routine, result in x0.
# Bit 0 of the result is the lowest bit of a (parity).  Bit 1 is set when
# subtracting the modulus from 2*a does not borrow, i.e. when 2*a is not
# smaller than the modulus ("sign").
$code.=<<___;

.globl	sgn0_pty_mod_384
.hidden	sgn0_pty_mod_384
.type	sgn0_pty_mod_384,%function
.align	5
sgn0_pty_mod_384:
	ldp	@a[0],@a[1],[$r_ptr]
	ldp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[4],@a[5],[$r_ptr,#32]

	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]

	and	$r_ptr,@a[0],#1
	adds	@a[0],@a[0],@a[0]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$carry,xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$carry,$carry,xzr

	mvn	$carry,$carry
	and	$carry,$carry,#2
	orr	$r_ptr,$r_ptr,$carry

	ret
.size	sgn0_pty_mod_384,.-sgn0_pty_mod_384
___

# sgn0_pty_mod_384x: x0 = a (re at +0, im at +48), x1 = modulus; leaf
# routine, result in x0.  The per-component sign/parity pair is computed
# as in sgn0_pty_mod_384 (re -> x2, im -> x0) while the limbs of each
# component are OR-ed together (re -> x3, im -> x1) to detect zero.
# Parity is taken from re unless re is zero; sign is taken from im unless
# im is zero.
$code.=<<___;

.globl	sgn0_pty_mod_384x
.hidden	sgn0_pty_mod_384x
.type	sgn0_pty_mod_384x,%function
.align	5
sgn0_pty_mod_384x:
	ldp	@a[0],@a[1],[$r_ptr]
	ldp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[4],@a[5],[$r_ptr,#32]

	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]

	and	$b_ptr,@a[0],#1
	orr	$n_ptr,@a[0],@a[1]
	adds	@a[0],@a[0],@a[0]
	orr	$n_ptr,$n_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	orr	$n_ptr,$n_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	orr	$n_ptr,$n_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	orr	$n_ptr,$n_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	@b[0],xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	@b[0],@b[0],xzr

	ldp	@a[0],@a[1],[$r_ptr,#48]
	ldp	@a[2],@a[3],[$r_ptr,#64]
	ldp	@a[4],@a[5],[$r_ptr,#80]

	mvn	@b[0],@b[0]
	and	@b[0],@b[0],#2
	orr	$b_ptr,$b_ptr,@b[0]

	and	$r_ptr,@a[0],#1
	orr	$a_ptr,@a[0],@a[1]
	adds	@a[0],@a[0],@a[0]
	orr	$a_ptr,$a_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	orr	$a_ptr,$a_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	orr	$a_ptr,$a_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	orr	$a_ptr,$a_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	@b[0],xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	@b[0],@b[0],xzr

	mvn	@b[0],@b[0]
	and	@b[0],@b[0],#2
	orr	$r_ptr,$r_ptr,@b[0]

	cmp	$n_ptr,#0
	csel	$n_ptr,$r_ptr,$b_ptr,eq	// a->re==0? prty(a->im) : prty(a->re)

	cmp	$a_ptr,#0
	csel	$a_ptr,$r_ptr,$b_ptr,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)

	and	$n_ptr,$n_ptr,#1
	and	$a_ptr,$a_ptr,#2
	orr	$r_ptr,$a_ptr,$n_ptr	// pack sign and parity

	ret
.size	sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
___

########################################################################
# vec_select($sz): appends the routine vec_select_$sz to $code.
#
# Generated routine: x0 = result, x1 = a, x2 = b, x3 = selector.
# v6 becomes an all-ones mask when the selector is zero (cmeq #0); `bit`
# then overwrites the bytes loaded from a with the bytes loaded from b
# under that mask.  So b is stored when the selector is zero and a
# otherwise, with no data-dependent branch.
#
# Data is processed in 48-byte steps, fully unrolled at generation time.
# Two banks of six vector registers (v0-v5 and v16-v21) are swapped every
# step so the loads for the next step overlap the current select/store.
# $sz is expected to be a multiple of 48 (true for every size requested
# below).
sub vec_select {
my $sz = shift;
my @v=map("v$_",(0..5,16..21));

$code.=<<___;

.globl	vec_select_$sz
.hidden	vec_select_$sz
.type	vec_select_$sz,%function
.align	5
vec_select_$sz:
	dup	v6.2d, $n_ptr
	ld1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
	cmeq	v6.2d, v6.2d, #0
	ld1	{@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
___
# One iteration per 48-byte chunk except the last, which is emitted below
# without the look-ahead loads.
for (my $i=0; $i<$sz-48; $i+=48) {
$code.=<<___;
	bit	@v[0].16b, @v[3].16b, v6.16b
	ld1	{@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
	bit	@v[1].16b, @v[4].16b, v6.16b
	ld1	{@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
	bit	@v[2].16b, @v[5].16b, v6.16b
	st1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
___
	@v = @v[6..11,0..5];    # swap register banks
}
$code.=<<___;
	bit	@v[0].16b, @v[3].16b, v6.16b
	bit	@v[1].16b, @v[4].16b, v6.16b
	bit	@v[2].16b, @v[5].16b, v6.16b
	st1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
	ret
.size	vec_select_$sz,.-vec_select_$sz
___
}

# Emission order is kept as in the original script.
vec_select($_) for (48, 96, 192, 144, 288);

########################################################################
# vec_prefetch: x0 = pointer, x1 = length in bytes.
# Issues up to seven `prfm pldl1keep` hints at 64-byte strides.  The
# address is clamped to the last byte of the buffer (x0 + x1 - 1); once it
# has been clamped the stride is zeroed, so the remaining hints target
# that same last address.
{
my ($inp, $end, $step) = map("x$_", (0..2));

$code.=<<___;

.globl	vec_prefetch
.hidden	vec_prefetch
.type	vec_prefetch,%function
.align	5
vec_prefetch:
	add	$end, $end, $inp
	sub	$end, $end, #1
	mov	$step, #64
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
	csel	$step, xzr, $step, hi
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
	csel	$step, xzr, $step, hi
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
	csel	$step, xzr, $step, hi
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
	csel	$step, xzr, $step, hi
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
	csel	$step, xzr, $step, hi
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
	prfm	pldl1keep, [$inp]
	ret
.size	vec_prefetch,.-vec_prefetch
___
}

print $code;

# Buffered write errors, and (for a piped STDOUT) a non-zero exit status of
# the translator, only surface at close time.  Fail loudly instead of
# leaving truncated assembly behind.  For a pipe whose child merely exited
# non-zero, $! is 0 and the status is in $?.
close STDOUT
    or die($! ? "error closing STDOUT: $!"
              : "output filter exited with wait status $?");