#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command line: <flavour> [<output>].  A flavour other than "void" routes
# everything printed to STDOUT through arm-xlate.pl; otherwise the text is
# written untranslated.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Look for the translator next to this script first, then in the
    # perlasm directory two levels up.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A translator that cannot be started must not go unnoticed.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # Only redirect when an output name was actually given; with no name
    # STDOUT is left as it is.
    open STDOUT,">$output" or die "can't open $output: $!";
}

# Register allocation shared by the routines below.
# x0-x4 are the incoming argument registers.
($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);

@mod = map("x$_",(5..10));	# six limbs, loaded with the modulus
@a   = map("x$_",(11..16));	# six limbs, operand and result
$bi  = "x17";			# current multiplier limb / carry / mask
@acc = map("x$_",(19..25));	# seven accumulator limbs
# NB: the last three scratch registers alias $r_ptr, $a_ptr and $n_ptr.
@tmp = map("x$_",(26..28,0,1,3));

$code.=<<___;
.text

// add_mod_384x384: exported wrapper around __add_mod_384x384.
// In: $r_ptr = result, $a_ptr and $b_ptr = operands, $n_ptr = modulus.
// Builds a 64-byte frame, preserves x19-x24 (used as scratch by the
// helper) and loads the six modulus limbs before the call.
.globl	add_mod_384x384
.type	add_mod_384x384,%function
.align	5
add_mod_384x384:
	paciasp
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__add_mod_384x384
	ldr	x30,[x29,#8]		// return address was clobbered by bl

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	autiasp
	ret
.size	add_mod_384x384,.-add_mod_384x384

// __add_mod_384x384: adds the twelve 64-bit limbs at [$a_ptr] and
// [$b_ptr] and writes twelve limbs to [$r_ptr].
// The lower six limbs are stored as computed; the carry chain continues
// into the upper six limbs, and only that upper half is conditionally
// reduced by the modulus held in the modulus registers.
// Loads and stores are interleaved with the carry chain; they do not
// touch the flags. Pointer registers are not modified.
.type	__add_mod_384x384,%function
.align	5
__add_mod_384x384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	adds	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	adcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	adcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	adcs	@a[3],@a[3],@acc[3]
	stp	@a[0],@a[1],[$r_ptr]
	adcs	@a[4],@a[4],@acc[4]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	adcs	@a[5],@a[5],@acc[5]

	ldp	@acc[0],@acc[1],[$b_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	ldp	@acc[2],@acc[3],[$b_ptr,#64]

	adcs	@a[0],@a[0],@acc[0]
	stp	@a[4],@a[5],[$r_ptr,#32]
	adcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#80]
	adcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#80]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adcs	@a[5],@a[5],@acc[5]
	adc	$bi,xzr,xzr		// carry out of the top limb

	subs	@acc[0],@a[0],@mod[0]	// trial subtraction of the modulus
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,$bi,xzr		// fold in the carry limb

	csel	@a[0],@a[0],@acc[0],lo	// borrow: keep the unreduced sum
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	stp	@a[0],@a[1],[$r_ptr,#48]
	csel	@a[4],@a[4],@acc[4],lo
	stp	@a[2],@a[3],[$r_ptr,#64]
	csel	@a[5],@a[5],@acc[5],lo
	stp	@a[4],@a[5],[$r_ptr,#80]

	ret
.size	__add_mod_384x384,.-__add_mod_384x384

// sub_mod_384x384: exported wrapper around __sub_mod_384x384.
// In: $r_ptr = result, $a_ptr = minuend, $b_ptr = subtrahend,
// $n_ptr = modulus. Same 64-byte frame as add_mod_384x384: x19-x24 are
// preserved and the six modulus limbs are loaded before the call.
.globl	sub_mod_384x384
.type	sub_mod_384x384,%function
.align	5
sub_mod_384x384:
	paciasp
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__sub_mod_384x384
	ldr	x30,[x29,#8]		// return address was clobbered by bl

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	autiasp
	ret
.size	sub_mod_384x384,.-sub_mod_384x384

// __sub_mod_384x384: subtracts the twelve limbs at [$b_ptr] from the
// twelve limbs at [$a_ptr] and writes twelve limbs to [$r_ptr].
// The lower six limbs are stored as computed; the borrow chain runs on
// through the upper six limbs. A final borrow is turned into an all-ones
// mask and the (masked) modulus is added back to the upper half only.
// Pointer registers are not modified.
.type	__sub_mod_384x384,%function
.align	5
__sub_mod_384x384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	subs	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	sbcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	sbcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	sbcs	@a[3],@a[3],@acc[3]
	stp	@a[0],@a[1],[$r_ptr]
	sbcs	@a[4],@a[4],@acc[4]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	sbcs	@a[5],@a[5],@acc[5]

	ldp	@acc[0],@acc[1],[$b_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	ldp	@acc[2],@acc[3],[$b_ptr,#64]

	sbcs	@a[0],@a[0],@acc[0]
	stp	@a[4],@a[5],[$r_ptr,#32]
	sbcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#80]
	sbcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#80]
	sbcs	@a[3],@a[3],@acc[3]
	sbcs	@a[4],@a[4],@acc[4]
	sbcs	@a[5],@a[5],@acc[5]
	sbc	$bi,xzr,xzr		// all ones on borrow, zero otherwise

	and	@acc[0],@mod[0],$bi	// modulus or zero, limb by limb
	and	@acc[1],@mod[1],$bi
	adds	@a[0],@a[0],@acc[0]
	and	@acc[2],@mod[2],$bi
	adcs	@a[1],@a[1],@acc[1]
	and	@acc[3],@mod[3],$bi
	adcs	@a[2],@a[2],@acc[2]
	and	@acc[4],@mod[4],$bi
	adcs	@a[3],@a[3],@acc[3]
	and	@acc[5],@mod[5],$bi
	adcs	@a[4],@a[4],@acc[4]
	stp	@a[0],@a[1],[$r_ptr,#48]
	adc	@a[5],@a[5],@acc[5]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ret
.size	__sub_mod_384x384,.-__sub_mod_384x384

// __add_mod_384: adds the six limbs at [$a_ptr] and [$b_ptr], subtracts
// the modulus held in the modulus registers once if the sum (including
// its carry) is not below it, and stores six limbs to [$r_ptr].
// The result is also left in the operand registers; pointer registers
// are not modified.
.type	__add_mod_384,%function
.align	5
__add_mod_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	adds	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	adcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	adcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adcs	@a[5],@a[5],@acc[5]
	adc	$bi,xzr,xzr		// carry out of the top limb

	subs	@acc[0],@a[0],@mod[0]	// trial subtraction of the modulus
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,$bi,xzr		// fold in the carry limb

	csel	@a[0],@a[0],@acc[0],lo	// borrow: keep the unreduced sum
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	csel	@a[4],@a[4],@acc[4],lo
	stp	@a[0],@a[1],[$r_ptr]
	csel	@a[5],@a[5],@acc[5],lo
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ret
.size	__add_mod_384,.-__add_mod_384

// __sub_mod_384: subtracts the six limbs at [$b_ptr] from the six limbs
// at [$a_ptr]; on borrow the modulus held in the modulus registers is
// added back (selected through an all-ones/zero mask rather than a
// branch). Six limbs are stored to [$r_ptr] and also left in the operand
// registers; pointer registers are not modified.
.type	__sub_mod_384,%function
.align	5
__sub_mod_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	subs	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	sbcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	sbcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	sbcs	@a[3],@a[3],@acc[3]
	sbcs	@a[4],@a[4],@acc[4]
	sbcs	@a[5],@a[5],@acc[5]
	sbc	$bi,xzr,xzr		// all ones on borrow, zero otherwise

	and	@acc[0],@mod[0],$bi	// modulus or zero, limb by limb
	and	@acc[1],@mod[1],$bi
	adds	@a[0],@a[0],@acc[0]
	and	@acc[2],@mod[2],$bi
	adcs	@a[1],@a[1],@acc[1]
	and	@acc[3],@mod[3],$bi
	adcs	@a[2],@a[2],@acc[2]
	and	@acc[4],@mod[4],$bi
	adcs	@a[3],@a[3],@acc[3]
	and	@acc[5],@mod[5],$bi
	adcs	@a[4],@a[4],@acc[4]
	stp	@a[0],@a[1],[$r_ptr]
	adc	@a[5],@a[5],@acc[5]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ret
.size	__sub_mod_384,.-__sub_mod_384

// mul_mont_384x: multiplication of two-component (re, im) values, each
// component six limbs wide, with three wide multiplications:
//	t0 = a->re*b->re, t1 = a->im*b->im,
//	t2 = (a->re+a->im)*(b->re+b->im)
//	ret->re = redc(t0-t1), ret->im = redc(t2-t0-t1)
// In: $r_ptr = ret, $a_ptr = a, $b_ptr = b, $n_ptr = modulus, $n0 is
// passed through to __mul_by_1_mont_384.
// Stack: 128-byte frame preserving x19-x28 plus 288 bytes of scratch
// laid out as t0 at sp, t1 at sp+96, t2 at sp+192.
// NOTE(review): __mul_384 is defined outside this section; the code
// relies on it leaving $a_ptr, $b_ptr, $n_ptr and the saved pointers
// intact - confirm against its definition.
.globl	mul_mont_384x
.hidden	mul_mont_384x
.type	mul_mont_384x,%function
.align	5
mul_mont_384x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#288		// space for 3 768-bit vectors

	mov	@tmp[0],$r_ptr		// save r_ptr
	mov	@tmp[1],$a_ptr		// save a_ptr
	mov	@tmp[2],$b_ptr		// save b_ptr

	sub	$r_ptr,sp,#0		// mul_384(t0, a->re, b->re)
	bl	__mul_384

	add	$a_ptr,$a_ptr,#48	// mul_384(t1, a->im, b->im)
	add	$b_ptr,$b_ptr,#48
	add	$r_ptr,sp,#96
	bl	__mul_384

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	sub	$b_ptr,$a_ptr,#48	// a->im + a->re, parked at sp+240
	add	$r_ptr,sp,#240
	bl	__add_mod_384

	add	$a_ptr,@tmp[2],#0	// b->re + b->im, parked at sp+192
	add	$b_ptr,@tmp[2],#48
	add	$r_ptr,sp,#192		// t2
	bl	__add_mod_384

	add	$a_ptr,$r_ptr,#0
	add	$b_ptr,$r_ptr,#48
	bl	__mul_384		// mul_384(t2, a->re+a->im, b->re+b->im)

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	mov	$a_ptr,$r_ptr
	add	$b_ptr,sp,#0
	bl	__sub_mod_384x384

	add	$b_ptr,sp,#96
	bl	__sub_mod_384x384	// t2 = t2-t0-t1

	add	$a_ptr,sp,#0
	add	$b_ptr,sp,#96
	add	$r_ptr,sp,#0
	bl	__sub_mod_384x384	// t0 = t0-t1

	add	$a_ptr,sp,#0		// ret->re = redc(t0)
	add	$r_ptr,@tmp[0],#0
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384

	add	$a_ptr,sp,#192		// ret->im = redc(t2)
	add	$r_ptr,$r_ptr,#48
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384
	ldr	x30,[x29,#8]

	add	sp,sp,#288
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_mont_384x,.-mul_mont_384x

// sqr_mont_384x: squaring of a two-component (re, im) value:
//	ret->im = 2*mul_mont(a->re, a->im)
//	ret->re = mul_mont(a->re + a->im, a->re - a->im)
// In: $r_ptr = ret, $a_ptr = a, $b_ptr = modulus, $n_ptr = n0 (there is
// no second operand, so the last two arguments arrive one register
// early).
// Stack: 128-byte frame preserving x19-x28, with n0 and the result
// pointer kept at offsets 96 and 104 where __mul_mont_384 reloads them,
// plus 96 bytes of scratch: t0 at sp, t1 at sp+48.
.globl	sqr_mont_384x
.hidden	sqr_mont_384x
.type	sqr_mont_384x,%function
.align	5
sqr_mont_384x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n_ptr,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
	sub	sp,sp,#96		// space for 2 384-bit vectors
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	add	$b_ptr,$a_ptr,#48
	add	$r_ptr,sp,#0
	bl	__add_mod_384		// t0 = a->re + a->im

	add	$r_ptr,sp,#48
	bl	__sub_mod_384		// t1 = a->re - a->im

	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi,[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	bl	__mul_mont_384		// mul_mont_384(ret->im, a->re, a->im)

	adds	@a[0],@a[0],@a[0]	// add with itself
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	@acc[6],xzr,xzr

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,@acc[6],xzr

	// Select the reduced double while already loading t0 and the first
	// limb of t1 for the second multiplication; each operand register
	// is reloaded only after its csel has consumed it.
	csel	@acc[0],@a[0],@acc[0],lo
	csel	@acc[1],@a[1],@acc[1],lo
	csel	@acc[2],@a[2],@acc[2],lo
	ldp	@a[0],@a[1],[sp]
	csel	@acc[3],@a[3],@acc[3],lo
	ldr	$bi,[sp,#48]
	csel	@acc[4],@a[4],@acc[4],lo
	ldp	@a[2],@a[3],[sp,#16]
	csel	@acc[5],@a[5],@acc[5],lo
	ldp	@a[4],@a[5],[sp,#32]

	// __mul_mont_384 returned the result pointer in $b_ptr.
	stp	@acc[0],@acc[1],[$b_ptr,#48]
	stp	@acc[2],@acc[3],[$b_ptr,#64]
	stp	@acc[4],@acc[5],[$b_ptr,#80]

	add	$b_ptr,sp,#48
	bl	__mul_mont_384		// mul_mont_384(ret->re, t0, t1)
	ldr	x30,[x29,#8]

	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_mont_384x,.-sqr_mont_384x

// mul_mont_384: exported Montgomery multiplication of two six-limb
// values.
// In: $r_ptr = ret, $a_ptr = a, $b_ptr = b, $n_ptr = modulus, $n0 = n0.
// Builds a 128-byte frame preserving x19-x28, parks n0 and the result
// pointer at frame offsets 96 and 104 for __mul_mont_384, preloads the
// first operand, the first limb of the second and the modulus, then
// stores the six-limb result through the pointer the helper hands back
// in $b_ptr.
.globl	mul_mont_384
.hidden	mul_mont_384
.type	mul_mont_384,%function
.align	5
mul_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n0,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there

	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi,[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__mul_mont_384
	ldr	x30,[x29,#8]

	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_mont_384,.-mul_mont_384

// __mul_mont_384: word-by-word Montgomery multiplication.
// In:  operand registers = first operand, $bi = limb 0 of the second
//      operand, $b_ptr = second operand, modulus registers = modulus,
//      $n0 = n0, and [x29,#96] / [x29,#104] = n0 / result pointer.
// Out: operand registers = reduced six-limb product, $b_ptr = result
//      pointer, $n0 = n0 reloaded from the frame.
// The scratch registers include x0, x1 and x3, so the incoming pointer
// arguments do not survive the call.
// The low limb of every reduction step is never computed: the low
// product (left commented out) would cancel the low accumulator limb, so
// only its carry is needed, and "subs xzr,limb,#1" sets the carry
// exactly when that limb is non-zero.
// NOTE(review): this presumes n0 is the usual Montgomery constant
// (-1/modulus mod 2^64) - confirm against the callers.
.type	__mul_mont_384,%function
.align	5
__mul_mont_384:
	mul	@acc[0],@a[0],$bi
	mul	@acc[1],@a[1],$bi
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi
	mul	@acc[4],@a[4],$bi
	mul	@acc[5],@a[5],$bi
	mul	$n0,$n0,@acc[0]

	umulh	@tmp[0],@a[0],$bi
	umulh	@tmp[1],@a[1],$bi
	umulh	@tmp[2],@a[2],$bi
	umulh	@tmp[3],@a[3],$bi
	umulh	@tmp[4],@a[4],$bi
	umulh	@tmp[5],@a[5],$bi

	adds	@acc[1],@acc[1],@tmp[0]
	// mul	@tmp[0],@mod[0],$n0
	adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	adc	@acc[6],xzr,@tmp[5]
	mul	@tmp[5],@mod[5],$n0
	mov	$bi,xzr
___
# One unrolled round per remaining limb of the second operand: finish the
# pending reduction step (shifting the accumulator down by one limb), then
# multiply-accumulate by limb $i and start the next reduction step.
for ($i=1;$i<6;$i++) {
$code.=<<___;
	subs	xzr,@acc[0],#1		// adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	umulh	@tmp[5],@mod[5],$n0
	adcs	@acc[6],@acc[6],xzr
	adc	$n0,$bi,xzr
	ldr	$bi,[$b_ptr,8*$i]

	adds	@acc[0],@acc[1],@tmp[0]
	mul	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[2],@tmp[1]
	mul	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[3],@tmp[2]
	mul	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[4],@tmp[3]
	mul	@tmp[3],@a[3],$bi
	adcs	@acc[4],@acc[5],@tmp[4]
	mul	@tmp[4],@a[4],$bi
	adcs	@acc[5],@acc[6],@tmp[5]
	mul	@tmp[5],@a[5],$bi
	adc	@acc[6],$n0,xzr
	ldr	$n0,[x29,#96]

	adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@tmp[2]
	mul	$n0,$n0,@acc[0]
	umulh	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@tmp[4]
	umulh	@tmp[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@tmp[5]
	umulh	@tmp[5],@a[5],$bi
	adcs	@acc[6],@acc[6],xzr
	adc	$bi,xzr,xzr

	adds	@acc[1],@acc[1],@tmp[0]
	// mul	@tmp[0],@mod[0],$n0
	adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	adcs	@acc[6],@acc[6],@tmp[5]
	mul	@tmp[5],@mod[5],$n0
	adc	$bi,$bi,xzr
___
}
# Tail: finish the last reduction step, then one conditional subtraction.
$code.=<<___;
	subs	xzr,@acc[0],#1		// adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	umulh	@tmp[5],@mod[5],$n0
	adcs	@acc[6],@acc[6],xzr
	ldp	$n0,$b_ptr,[x29,#96]	// pull r_ptr
	adc	$bi,$bi,xzr

	adds	@acc[0],@acc[1],@tmp[0]
	adcs	@acc[1],@acc[2],@tmp[1]
	adcs	@acc[2],@acc[3],@tmp[2]
	adcs	@acc[3],@acc[4],@tmp[3]
	adcs	@acc[4],@acc[5],@tmp[4]
	adcs	@acc[5],@acc[6],@tmp[5]
	adc	@acc[6],$bi,xzr

	subs	@tmp[0],@acc[0],@mod[0]
	sbcs	@tmp[1],@acc[1],@mod[1]
	sbcs	@tmp[2],@acc[2],@mod[2]
	sbcs	@tmp[3],@acc[3],@mod[3]
	sbcs	@tmp[4],@acc[4],@mod[4]
	sbcs	@tmp[5],@acc[5],@mod[5]
	sbcs	xzr,@acc[6],xzr

	csel	@a[0],@acc[0],@tmp[0],lo
	csel	@a[1],@acc[1],@tmp[1],lo
	csel	@a[2],@acc[2],@tmp[2],lo
	csel	@a[3],@acc[3],@tmp[3],lo
	csel	@a[4],@acc[4],@tmp[4],lo
	csel	@a[5],@acc[5],@tmp[5],lo
	ret
.size	__mul_mont_384,.-__mul_mont_384

// sqr_mont_384: exported Montgomery squaring of a six-limb value.
// In: $r_ptr = ret, $a_ptr = a, $b_ptr = modulus, $n_ptr = n0 (there is
// no second operand, so the last two arguments arrive one register
// early).
// The twelve-limb square is written to 96 bytes of stack scratch by
// __sqr_384 and then reduced by __mul_by_1_mont_384 followed by
// __redc_tail_mont_384, which stores the result. The modulus is loaded
// only after __sqr_384 because that helper uses the modulus registers as
// scratch.
.globl	sqr_mont_384
.hidden	sqr_mont_384
.type	sqr_mont_384,%function
.align	5
sqr_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#96		// space for 768-bit vector
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	mov	$n_ptr,$r_ptr		// save r_ptr
	mov	$r_ptr,sp

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	bl	__sqr_384

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	mov	$a_ptr,sp
	mov	$r_ptr,$n_ptr		// restore r_ptr
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384
	ldr	x30,[x29,#8]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_mont_384,.-sqr_mont_384

// sqr_n_mul_mont_383: squares the input a given number of times and then
// Montgomery-multiplies the result by a second value.
// In: $r_ptr = ret, $a_ptr = a, $b_ptr = number of squarings,
// $n_ptr = modulus, $n0 = n0, x5 = pointer to the multiplier.
// The counter is decremented before it is tested, so it must be
// non-zero on entry.
// Between squarings the upper half is merely added to the output of
// __mul_by_1_mont_384, with no carry limb and no conditional
// subtraction.
// NOTE(review): skipping the final subtraction presumably relies on the
// modulus being at most 383 bits wide, as the name suggests - confirm.
.globl	sqr_n_mul_mont_383
.hidden	sqr_n_mul_mont_383
.type	sqr_n_mul_mont_383,%function
.align	5
sqr_n_mul_mont_383:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n0,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
	sub	sp,sp,#96		// space for 768-bit vector
	mov	$bi,x5			// save b_ptr

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	mov	$r_ptr,sp
.Loop_sqr_383:
	bl	__sqr_384
	sub	$b_ptr,$b_ptr,#1	// counter

	// __sqr_384 uses the modulus registers as scratch: reload them.
	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	mov	$a_ptr,sp
	bl	__mul_by_1_mont_384

	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]

	adds	@a[0],@a[0],@acc[0]	// just accumulate upper half
	adcs	@a[1],@a[1],@acc[1]
	adcs	@a[2],@a[2],@acc[2]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adc	@a[5],@a[5],@acc[5]

	cbnz	$b_ptr,.Loop_sqr_383

	mov	$b_ptr,$bi		// multiplier pointer saved on entry
	ldr	$bi,[$bi]
	bl	__mul_mont_384
	ldr	x30,[x29,#8]

	// __mul_mont_384 returned the result pointer in $b_ptr.
	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_n_mul_mont_383,.-sqr_n_mul_mont_383
___
{
# The squaring needs ten accumulator limbs: extend the shared seven with
# the first three scratch registers, for this scope only.
my @acc=(@acc,@tmp[0..2]);

$code.=<<___;
// __sqr_384: full twelve-limb square of the six limbs held in the
// operand registers, stored to [$r_ptr].
// The cross products are summed once, doubled, and the six squares of
// the individual limbs are then added on top.
// The operand registers, the modulus registers and $a_ptr are all used
// as scratch and do not survive the call.
.type	__sqr_384,%function
.align	5
__sqr_384:
	mul	@acc[0],@a[1],@a[0]
	mul	@acc[1],@a[2],@a[0]
	mul	@acc[2],@a[3],@a[0]
	mul	@acc[3],@a[4],@a[0]
	mul	@acc[4],@a[5],@a[0]

	umulh	@mod[1],@a[1],@a[0]
	umulh	@mod[2],@a[2],@a[0]
	umulh	@mod[3],@a[3],@a[0]
	umulh	@mod[4],@a[4],@a[0]
	adds	@acc[1],@acc[1],@mod[1]
	umulh	@mod[5],@a[5],@a[0]
	adcs	@acc[2],@acc[2],@mod[2]
	mul	@mod[2],@a[2],@a[1]
	adcs	@acc[3],@acc[3],@mod[3]
	mul	@mod[3],@a[3],@a[1]
	adcs	@acc[4],@acc[4],@mod[4]
	mul	@mod[4],@a[4],@a[1]
	adc	@acc[5],xzr,@mod[5]
	mul	@mod[5],@a[5],@a[1]

	adds	@acc[2],@acc[2],@mod[2]
	umulh	@mod[2],@a[2],@a[1]
	adcs	@acc[3],@acc[3],@mod[3]
	umulh	@mod[3],@a[3],@a[1]
	adcs	@acc[4],@acc[4],@mod[4]
	umulh	@mod[4],@a[4],@a[1]
	adcs	@acc[5],@acc[5],@mod[5]
	umulh	@mod[5],@a[5],@a[1]
	adc	@acc[6],xzr,xzr

	mul	@mod[0],@a[0],@a[0]	// squares start here; limb 0 is done as a multiplicand
	adds	@acc[3],@acc[3],@mod[2]
	umulh	@a[0],@a[0],@a[0]
	adcs	@acc[4],@acc[4],@mod[3]
	mul	@mod[3],@a[3],@a[2]
	adcs	@acc[5],@acc[5],@mod[4]
	mul	@mod[4],@a[4],@a[2]
	adc	@acc[6],@acc[6],@mod[5]
	mul	@mod[5],@a[5],@a[2]

	adds	@acc[4],@acc[4],@mod[3]
	umulh	@mod[3],@a[3],@a[2]
	adcs	@acc[5],@acc[5],@mod[4]
	umulh	@mod[4],@a[4],@a[2]
	adcs	@acc[6],@acc[6],@mod[5]
	umulh	@mod[5],@a[5],@a[2]
	adc	@acc[7],xzr,xzr

	mul	@mod[1],@a[1],@a[1]
	adds	@acc[5],@acc[5],@mod[3]
	umulh	@a[1],@a[1],@a[1]
	adcs	@acc[6],@acc[6],@mod[4]
	mul	@mod[4],@a[4],@a[3]
	adc	@acc[7],@acc[7],@mod[5]
	mul	@mod[5],@a[5],@a[3]

	adds	@acc[6],@acc[6],@mod[4]
	umulh	@mod[4],@a[4],@a[3]
	adcs	@acc[7],@acc[7],@mod[5]
	umulh	@mod[5],@a[5],@a[3]
	adc	@acc[8],xzr,xzr
	mul	@mod[2],@a[2],@a[2]
	adds	@acc[7],@acc[7],@mod[4]
	umulh	@a[2],@a[2],@a[2]
	adc	@acc[8],@acc[8],@mod[5]
	mul	@mod[3],@a[3],@a[3]

	mul	@mod[5],@a[5],@a[4]
	umulh	@a[3],@a[3],@a[3]
	adds	@acc[8],@acc[8],@mod[5]
	umulh	@mod[5],@a[5],@a[4]
	mul	@mod[4],@a[4],@a[4]
	adc	@acc[9],@mod[5],xzr

	adds	@acc[0],@acc[0],@acc[0]	// double the sum of cross products
	adcs	@acc[1],@acc[1],@acc[1]
	adcs	@acc[2],@acc[2],@acc[2]
	adcs	@acc[3],@acc[3],@acc[3]
	adcs	@acc[4],@acc[4],@acc[4]
	adcs	@acc[5],@acc[5],@acc[5]
	adcs	@acc[6],@acc[6],@acc[6]
	adcs	@acc[7],@acc[7],@acc[7]
	umulh	@a[4],@a[4],@a[4]
	adcs	@acc[8],@acc[8],@acc[8]
	mul	@mod[5],@a[5],@a[5]
	adcs	@acc[9],@acc[9],@acc[9]
	umulh	@a[5],@a[5],@a[5]
	adc	$a_ptr,xzr,xzr		// top carry of the doubling

	adds	@acc[0],@acc[0],@a[0]	// add the squares, low/high halves alternating
	adcs	@acc[1],@acc[1],@mod[1]
	adcs	@acc[2],@acc[2],@a[1]
	adcs	@acc[3],@acc[3],@mod[2]
	adcs	@acc[4],@acc[4],@a[2]
	adcs	@acc[5],@acc[5],@mod[3]
	adcs	@acc[6],@acc[6],@a[3]
	stp	@mod[0],@acc[0],[$r_ptr]
	adcs	@acc[7],@acc[7],@mod[4]
	stp	@acc[1],@acc[2],[$r_ptr,#16]
	adcs	@acc[8],@acc[8],@a[4]
	stp	@acc[3],@acc[4],[$r_ptr,#32]
	adcs	@acc[9],@acc[9],@mod[5]
	stp	@acc[5],@acc[6],[$r_ptr,#48]
	adc	@a[5],@a[5],$a_ptr
	stp	@acc[7],@acc[8],[$r_ptr,#64]
	stp	@acc[9],@a[5],[$r_ptr,#80]

	ret
.size	__sqr_384,.-__sqr_384
___
}
$code.=<<___;
// sqr_384: exported plain (non-modular) squaring.
// In: $r_ptr = twelve-limb result, $a_ptr = six-limb operand.
// Builds a 128-byte frame preserving x19-x28, loads the operand and
// lets __sqr_384 compute and store the square.
.globl	sqr_384
.hidden	sqr_384
.type	sqr_384,%function
.align	5
sqr_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	bl	__sqr_384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_384,.-sqr_384

// redc_mont_384: exported Montgomery reduction of a twelve-limb value to
// six limbs.
// In: $r_ptr = ret, $a_ptr = twelve-limb input, $b_ptr = modulus,
// $n_ptr = n0 (there is no second operand, so the last two arguments
// arrive one register early).
// __mul_by_1_mont_384 reduces the lower half, __redc_tail_mont_384 adds
// the upper half, performs the final conditional subtraction and stores
// the result.
.globl	redc_mont_384
.hidden	redc_mont_384
.type	redc_mont_384,%function
.align	5
redc_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	redc_mont_384,.-redc_mont_384

// from_mont_384: exported conversion of a six-limb value out of
// Montgomery form.
// In: $r_ptr = ret, $a_ptr = six-limb input, $b_ptr = modulus,
// $n_ptr = n0 (there is no second operand, so the last two arguments
// arrive one register early).
// Runs __mul_by_1_mont_384 on the input, then subtracts the modulus once
// unless that would borrow, and stores six limbs. Unlike the redc path
// no upper half is read and no carry limb takes part in the comparison.
.globl	from_mont_384
.hidden	from_mont_384
.type	from_mont_384,%function
.align	5
from_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__mul_by_1_mont_384
	ldr	x30,[x29,#8]

	subs	@acc[0],@a[0],@mod[0]	// trial subtraction of the modulus
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]

	csel	@a[0],@a[0],@acc[0],lo	// borrow: keep the value as it is
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	csel	@a[4],@a[4],@acc[4],lo
	csel	@a[5],@a[5],@acc[5],lo

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	from_mont_384,.-from_mont_384

// __mul_by_1_mont_384: six Montgomery reduction steps applied to the six
// limbs at [$a_ptr], i.e. a Montgomery multiplication by one.
// In:  $a_ptr = input, modulus registers = modulus, $n0 = n0.
// Out: operand registers = result, without a final conditional
//      subtraction; nothing is stored to memory and $a_ptr and $r_ptr
//      are left intact for the caller.
// As in __mul_mont_384 the low limb of each step is never computed:
// "subs xzr,limb,#1" yields the carry the commented-out addition would
// have produced (set exactly when the limb is non-zero).
.type	__mul_by_1_mont_384,%function
.align	5
__mul_by_1_mont_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	mul	@tmp[0],$n0,@a[0]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	// mul	@acc[0],@mod[0],@tmp[0]
	mul	@acc[1],@mod[1],@tmp[0]
	mul	@acc[2],@mod[2],@tmp[0]
	mul	@acc[3],@mod[3],@tmp[0]
	mul	@acc[4],@mod[4],@tmp[0]
	mul	@acc[5],@mod[5],@tmp[0]
	subs	xzr,@a[0],#1		// adds	@acc[0],@acc[0],@a[0]
	umulh	@a[0],@mod[0],@tmp[0]
	adcs	@acc[1],@acc[1],@a[1]
	umulh	@a[1],@mod[1],@tmp[0]
	adcs	@acc[2],@acc[2],@a[2]
	umulh	@a[2],@mod[2],@tmp[0]
	adcs	@acc[3],@acc[3],@a[3]
	umulh	@a[3],@mod[3],@tmp[0]
	adcs	@acc[4],@acc[4],@a[4]
	umulh	@a[4],@mod[4],@tmp[0]
	adcs	@acc[5],@acc[5],@a[5]
	umulh	@a[5],@mod[5],@tmp[0]
	adc	@acc[6],xzr,xzr
___
# Five more unrolled reduction steps: fold the previous step's low
# products into the high ones, then start the next step.
for ($i=1;$i<6;$i++) {
$code.=<<___;
	adds	@a[0],@a[0],@acc[1]
	adcs	@a[1],@a[1],@acc[2]
	adcs	@a[2],@a[2],@acc[3]
	mul	@tmp[0],$n0,@a[0]
	adcs	@a[3],@a[3],@acc[4]
	adcs	@a[4],@a[4],@acc[5]
	adc	@a[5],@a[5],@acc[6]

	// mul	@acc[0],@mod[0],@tmp[0]
	mul	@acc[1],@mod[1],@tmp[0]
	mul	@acc[2],@mod[2],@tmp[0]
	mul	@acc[3],@mod[3],@tmp[0]
	mul	@acc[4],@mod[4],@tmp[0]
	mul	@acc[5],@mod[5],@tmp[0]
	subs	xzr,@a[0],#1		// adds	@acc[0],@acc[0],@a[0]
	umulh	@a[0],@mod[0],@tmp[0]
	adcs	@acc[1],@acc[1],@a[1]
	umulh	@a[1],@mod[1],@tmp[0]
	adcs	@acc[2],@acc[2],@a[2]
	umulh	@a[2],@mod[2],@tmp[0]
	adcs	@acc[3],@acc[3],@a[3]
	umulh	@a[3],@mod[3],@tmp[0]
	adcs	@acc[4],@acc[4],@a[4]
	umulh	@a[4],@mod[4],@tmp[0]
	adcs	@acc[5],@acc[5],@a[5]
	umulh	@a[5],@mod[5],@tmp[0]
	adc	@acc[6],xzr,xzr
___
}
$code.=<<___;
	adds	@a[0],@a[0],@acc[1]
	adcs	@a[1],@a[1],@acc[2]
	adcs	@a[2],@a[2],@acc[3]
	adcs	@a[3],@a[3],@acc[4]
	adcs	@a[4],@a[4],@acc[5]
	adc	@a[5],@a[5],@acc[6]

	ret
.size	__mul_by_1_mont_384,.-__mul_by_1_mont_384

// __redc_tail_mont_384: internal helper that finishes a reduction.
//
// In:  the a[] registers hold six limbs, a_ptr points at a buffer whose
//      bytes 48..95 hold six more limbs, the mod[] registers hold the
//      modulus, and r_ptr points at the six-limb output.
// Out: (a[] + upper half), with the modulus subtracted once unless that
//      subtraction borrows, is stored at r_ptr and left in a[].
// The carry out of the addition is kept in acc[6] and takes part in the
// borrow chain, so the comparison is done on the full 385-bit sum.
.type	__redc_tail_mont_384,%function
.align	5
__redc_tail_mont_384:
	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]

	adds	@a[0],@a[0],@acc[0]	// accumulate upper half
	adcs	@a[1],@a[1],@acc[1]
	adcs	@a[2],@a[2],@acc[2]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adcs	@a[5],@a[5],@acc[5]
	adc	@acc[6],xzr,xzr

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,@acc[6],xzr

	// "lo" means the subtraction borrowed: keep the unsubtracted value
	csel	@a[0],@a[0],@acc[0],lo
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	csel	@a[4],@a[4],@acc[4],lo
	csel	@a[5],@a[5],@acc[5],lo

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ret
.size	__redc_tail_mont_384,.-__redc_tail_mont_384

// mul_384: exported wrapper around __mul_384.
//
// The incoming argument registers (r_ptr, a_ptr, b_ptr) are passed through
// untouched. The wrapper only builds a 128-byte frame, saves x19-x28
// (__mul_384 uses the acc[] registers, which live there), calls the helper
// and restores everything. The return address is signed/authenticated with
// paciasp/autiasp.
.globl	mul_384
.hidden	mul_384
.type	mul_384,%function
.align	5
mul_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	bl	__mul_384
	ldr	x30,[x29,#8]		// bl clobbered the link register

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_384,.-mul_384

// __mul_384: internal helper, 384x384 -> 768-bit schoolbook multiplication.
//
// In:  a_ptr and b_ptr point at six-limb operands, r_ptr at a twelve-limb
//      output buffer.
// Out: the product is stored at r_ptr[0..95]. The upper six limbs are also
//      left in acc[0..5].
// Clobbers a[], bi, acc[0..6] and the mod[] registers, which serve as
// scratch for the partial products here.
//
// All limbs of a are loaded before the first store, and limb i+1 of b is
// loaded before output limb i is stored.
.type	__mul_384,%function
.align	5
__mul_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi, [$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	mul	@acc[0],@a[0],$bi
	mul	@acc[1],@a[1],$bi
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi
	mul	@acc[4],@a[4],$bi
	mul	@acc[5],@a[5],$bi

	umulh	@mod[0],@a[0],$bi
	umulh	@mod[1],@a[1],$bi
	umulh	@mod[2],@a[2],$bi
	umulh	@mod[3],@a[3],$bi
	umulh	@mod[4],@a[4],$bi
	umulh	@mod[5],@a[5],$bi
	ldr	$bi,[$b_ptr,8*1]

	str	@acc[0],[$r_ptr]
	adds	@acc[0],@acc[1],@mod[0]
	mul	@mod[0],@a[0],$bi
	adcs	@acc[1],@acc[2],@mod[1]
	mul	@mod[1],@a[1],$bi
	adcs	@acc[2],@acc[3],@mod[2]
	mul	@mod[2],@a[2],$bi
	adcs	@acc[3],@acc[4],@mod[3]
	mul	@mod[3],@a[3],$bi
	adcs	@acc[4],@acc[5],@mod[4]
	mul	@mod[4],@a[4],$bi
	adc	@acc[5],xzr, @mod[5]
	mul	@mod[5],@a[5],$bi
___
# Rows 1..4: add the low halves for b[$i], fetch b[$i+1], emit output limb
# $i, then shift the accumulator down one limb while adding the high halves.
for ($i=1;$i<5;$i++) {
$code.=<<___;
	adds	@acc[0],@acc[0],@mod[0]
	umulh	@mod[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@mod[1]
	umulh	@mod[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@mod[2]
	umulh	@mod[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@mod[3]
	umulh	@mod[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@mod[4]
	umulh	@mod[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@mod[5]
	umulh	@mod[5],@a[5],$bi
	ldr	$bi,[$b_ptr,#8*($i+1)]
	adc	@acc[6],xzr,xzr

	str	@acc[0],[$r_ptr,8*$i]
	adds	@acc[0],@acc[1],@mod[0]
	mul	@mod[0],@a[0],$bi
	adcs	@acc[1],@acc[2],@mod[1]
	mul	@mod[1],@a[1],$bi
	adcs	@acc[2],@acc[3],@mod[2]
	mul	@mod[2],@a[2],$bi
	adcs	@acc[3],@acc[4],@mod[3]
	mul	@mod[3],@a[3],$bi
	adcs	@acc[4],@acc[5],@mod[4]
	mul	@mod[4],@a[4],$bi
	adc	@acc[5],@acc[6],@mod[5]
	mul	@mod[5],@a[5],$bi
___
}
# Last row. $i is deliberately not lexical: it is still 5 here (left over
# from the loop above) and supplies the offset of the final single-limb store.
$code.=<<___;
	adds	@acc[0],@acc[0],@mod[0]
	umulh	@mod[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@mod[1]
	umulh	@mod[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@mod[2]
	umulh	@mod[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@mod[3]
	umulh	@mod[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@mod[4]
	umulh	@mod[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@mod[5]
	umulh	@mod[5],@a[5],$bi
	adc	@acc[6],xzr,xzr

	str	@acc[0],[$r_ptr,8*$i]
	adds	@acc[0],@acc[1],@mod[0]
	adcs	@acc[1],@acc[2],@mod[1]
	adcs	@acc[2],@acc[3],@mod[2]
	adcs	@acc[3],@acc[4],@mod[3]
	adcs	@acc[4],@acc[5],@mod[4]
	adc	@acc[5],@acc[6],@mod[5]

	stp	@acc[0],@acc[1],[$r_ptr,#48]
	stp	@acc[2],@acc[3],[$r_ptr,#64]
	stp	@acc[4],@acc[5],[$r_ptr,#80]

	ret
.size	__mul_384,.-__mul_384

// mul_382x: widening multiplication of two re/im pairs using three
// 384x384 multiplications.
//
// In:  r_ptr = ret (re at offset 0, im at offset 96, twelve limbs each),
//      a_ptr = a and b_ptr = b (re at offset 0, im at offset 48, six limbs
//      each), n_ptr = modulus.
// Computed as:
//      ret->re = a->re*b->re
//      ret->im = (a->re + a->im)*(b->re + b->im)
//      tx      = a->im*b->im            (96-byte stack temporary)
//      ret->im -= tx;  ret->im -= ret->re;  ret->re -= tx
// The subtractions go through __sub_mod_384x384, defined elsewhere in this
// file. The modulus is loaded only after the last __mul_384 call because
// __mul_384 uses the mod[] registers as scratch.
.globl	mul_382x
.hidden	mul_382x
.type	mul_382x,%function
.align	5
mul_382x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#96		// space for two 384-bit vectors

	ldp	@a[0],@a[1],[$a_ptr]
	mov	@tmp[0],$r_ptr		// save r_ptr
	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	mov	@tmp[1],$a_ptr		// save a_ptr
	ldp	@a[2],@a[3],[$a_ptr,#16]
	mov	@tmp[2],$b_ptr		// save b_ptr
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	adds	@mod[0],$a[0],@acc[0]	// t0 = a->re + a->im
	ldp	@acc[4],@acc[5],[$a_ptr,#80]
	adcs	@mod[1],$a[1],@acc[1]
	ldp	@a[0],@a[1],[$b_ptr]
	adcs	@mod[2],$a[2],@acc[2]
	ldp	@acc[0],@acc[1],[$b_ptr,#48]
	adcs	@mod[3],$a[3],@acc[3]
	ldp	@a[2],@a[3],[$b_ptr,#16]
	adcs	@mod[4],$a[4],@acc[4]
	ldp	@acc[2],@acc[3],[$b_ptr,#64]
	adc	@mod[5],$a[5],@acc[5]
	ldp	@a[4],@a[5],[$b_ptr,#32]

	stp	@mod[0],@mod[1],[sp]
	adds	@mod[0],$a[0],@acc[0]	// t1 = b->re + b->im
	ldp	@acc[4],@acc[5],[$b_ptr,#80]
	adcs	@mod[1],$a[1],@acc[1]
	stp	@mod[2],@mod[3],[sp,#16]
	adcs	@mod[2],$a[2],@acc[2]
	adcs	@mod[3],$a[3],@acc[3]
	stp	@mod[4],@mod[5],[sp,#32]
	adcs	@mod[4],$a[4],@acc[4]
	stp	@mod[0],@mod[1],[sp,#48]
	adc	@mod[5],$a[5],@acc[5]
	stp	@mod[2],@mod[3],[sp,#64]
	stp	@mod[4],@mod[5],[sp,#80]

	bl	__mul_384		// mul_384(ret->re, a->re, b->re)

	add	$a_ptr,sp,#0		// mul_384(ret->im, t0, t1)
	add	$b_ptr,sp,#48
	add	$r_ptr,@tmp[0],#96
	bl	__mul_384

	add	$a_ptr,@tmp[1],#48	// mul_384(tx, a->im, b->im)
	add	$b_ptr,@tmp[2],#48
	add	$r_ptr,sp,#0
	bl	__mul_384

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	add	$a_ptr,@tmp[0],#96	// ret->im -= tx
	add	$b_ptr,sp,#0
	add	$r_ptr,@tmp[0],#96
	bl	__sub_mod_384x384

	// NOTE(review): only b_ptr is set up again here, so this presumably
	// relies on __sub_mod_384x384 preserving a_ptr and r_ptr - confirm.
	add	$b_ptr,@tmp[0],#0	// ret->im -= ret->re
	bl	__sub_mod_384x384

	add	$a_ptr,@tmp[0],#0	// ret->re -= tx
	add	$b_ptr,sp,#0
	add	$r_ptr,@tmp[0],#0
	bl	__sub_mod_384x384
	ldr	x30,[x29,#8]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_382x,.-mul_382x

// sqr_382x: widening squaring of a re/im pair using two 384x384
// multiplications.
//
// In:  r_ptr = ret (re at offset 0, im at offset 96, twelve limbs each),
//      a_ptr = a (re at offset 0, im at offset 48), b_ptr = modulus.
// Computed as:
//      t0 = a->re + a->im               (kept in ret bytes 0..47)
//      t1 = a->re - a->im, plus the modulus if the subtraction borrowed
//                                       (kept in ret bytes 48..95)
//      ret->re = t0*t1
//      ret->im = 2*(a->re*a->im)
// The first product is computed in place over t0/t1. The doubling reads
// the low half of ret->im back from memory and takes the high half from
// acc[0..5], where __mul_384 leaves it.
.globl	sqr_382x
.hidden	sqr_382x
.type	sqr_382x,%function
.align	5
sqr_382x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	adds	@mod[0],$a[0],@acc[0]	// t0 = a->re + a->im
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	adcs	@mod[1],$a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	adcs	@mod[2],$a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]
	adcs	@mod[3],$a[3],@acc[3]
	stp	@mod[0],@mod[1],[$r_ptr]
	adcs	@mod[4],$a[4],@acc[4]
	ldp	@mod[0],@mod[1],[$b_ptr]
	adc	@mod[5],$a[5],@acc[5]
	stp	@mod[2],@mod[3],[$r_ptr,#16]

	subs	@a[0],$a[0],@acc[0]	// t1 = a->re - a->im
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	sbcs	@a[1],$a[1],@acc[1]
	stp	@mod[4],@mod[5],[$r_ptr,#32]
	sbcs	@a[2],$a[2],@acc[2]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]
	sbcs	@a[3],$a[3],@acc[3]
	sbcs	@a[4],$a[4],@acc[4]
	sbcs	@a[5],$a[5],@acc[5]
	sbc	@acc[6],xzr,xzr		// all-ones mask if it borrowed

	and	@acc[0],@mod[0],@acc[6]	// add the modulus back under the mask
	and	@acc[1],@mod[1],@acc[6]
	adds	@a[0],@a[0],@acc[0]
	and	@acc[2],@mod[2],@acc[6]
	adcs	@a[1],@a[1],@acc[1]
	and	@acc[3],@mod[3],@acc[6]
	adcs	@a[2],@a[2],@acc[2]
	and	@acc[4],@mod[4],@acc[6]
	adcs	@a[3],@a[3],@acc[3]
	and	@acc[5],@mod[5],@acc[6]
	adcs	@a[4],@a[4],@acc[4]
	stp	@a[0],@a[1],[$r_ptr,#48]
	adc	@a[5],@a[5],@acc[5]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	mov	$n0,$a_ptr		// save a_ptr
	add	$a_ptr,$r_ptr,#0	// mul_384(ret->re, t0, t1)
	add	$b_ptr,$r_ptr,#48
	bl	__mul_384

	add	$a_ptr,$n0,#0		// mul_384(ret->im, a->re, a->im)
	add	$b_ptr,$n0,#48
	add	$r_ptr,$r_ptr,#96
	bl	__mul_384
	ldr	x30,[x29,#8]

	ldp	@a[0],@a[1],[$r_ptr]
	ldp	@a[2],@a[3],[$r_ptr,#16]
	adds	@a[0],@a[0],@a[0]	// add with itself
	ldp	@a[4],@a[5],[$r_ptr,#32]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adcs	@acc[0],@acc[0],@acc[0]
	adcs	@acc[1],@acc[1],@acc[1]
	stp	@a[0],@a[1],[$r_ptr]
	adcs	@acc[2],@acc[2],@acc[2]
	stp	@a[2],@a[3],[$r_ptr,#16]
	adcs	@acc[3],@acc[3],@acc[3]
	stp	@a[4],@a[5],[$r_ptr,#32]
	adcs	@acc[4],@acc[4],@acc[4]
	stp	@acc[0],@acc[1],[$r_ptr,#48]
	adc	@acc[5],@acc[5],@acc[5]
	stp	@acc[2],@acc[3],[$r_ptr,#64]
	stp	@acc[4],@acc[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_382x,.-sqr_382x

// sqr_mont_382x: Montgomery squaring of a re/im pair.
//
// In:  r_ptr = ret (re at offset 0, im at offset 48), a_ptr = a (same
//      layout), b_ptr = modulus. There is no second operand, so the n0
//      factor arrives in the register normally used for n_ptr.
// Computed as:
//      t0 = a->re + a->im               (stack bytes 0..47)
//      t1 = a->re - a->im, stored as-is (stack bytes 48..95), with the
//           borrow kept as an all-ones/zero mask at stack byte 96
//      ret->im = 2*mont(a->re, a->im)
//      ret->re = mont(t0, t1) - (t0 under the mask), plus the modulus if
//                that subtraction borrowed
// n0 and r_ptr are parked at [x29,#96] and [x29,#104]: the helper
// __mul_mont_383_nonred overwrites their registers and reloads them from
// those slots.
.globl	sqr_mont_382x
.hidden	sqr_mont_382x
.type	sqr_mont_382x,%function
.align	5
sqr_mont_382x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n_ptr,$r_ptr,[sp,#96]	// __mul_mont_383_nonred reads them from there
	sub	sp,sp,#112		// space for two 384-bit vectors + word
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	$bi,@acc[1],[$a_ptr,#48]
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]

	adds	@mod[0],$a[0],$bi	// t0 = a->re + a->im
	adcs	@mod[1],$a[1],@acc[1]
	adcs	@mod[2],$a[2],@acc[2]
	adcs	@mod[3],$a[3],@acc[3]
	adcs	@mod[4],$a[4],@acc[4]
	adc	@mod[5],$a[5],@acc[5]

	subs	@acc[0],$a[0],$bi	// t1 = a->re - a->im
	sbcs	@acc[1],$a[1],@acc[1]
	sbcs	@acc[2],$a[2],@acc[2]
	sbcs	@acc[3],$a[3],@acc[3]
	sbcs	@acc[4],$a[4],@acc[4]
	sbcs	@acc[5],$a[5],@acc[5]
	sbc	@acc[6],xzr,xzr		// borrow flag as mask

	stp	@mod[0],@mod[1],[sp]
	stp	@mod[2],@mod[3],[sp,#16]
	stp	@mod[4],@mod[5],[sp,#32]
	stp	@acc[0],@acc[1],[sp,#48]
	stp	@acc[2],@acc[3],[sp,#64]
	stp	@acc[4],@acc[5],[sp,#80]
	str	@acc[6],[sp,#96]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	add	$b_ptr,$a_ptr,#48
	bl	__mul_mont_383_nonred	// mul_mont_384(ret->im, a->re, a->im)

	adds	@acc[0],@a[0],@a[0]	// add with itself
	adcs	@acc[1],@a[1],@a[1]
	adcs	@acc[2],@a[2],@a[2]
	adcs	@acc[3],@a[3],@a[3]
	adcs	@acc[4],@a[4],@a[4]
	adc	@acc[5],@a[5],@a[5]

	// the helper returned r_ptr in the b_ptr register: this is ret->im
	stp	@acc[0],@acc[1],[$b_ptr,#48]
	stp	@acc[2],@acc[3],[$b_ptr,#64]
	stp	@acc[4],@acc[5],[$b_ptr,#80]

	ldp	@a[0],@a[1],[sp]
	ldr	$bi,[sp,#48]
	ldp	@a[2],@a[3],[sp,#16]
	ldp	@a[4],@a[5],[sp,#32]

	add	$b_ptr,sp,#48
	bl	__mul_mont_383_nonred	// mul_mont_384(ret->re, t0, t1)
	ldr	x30,[x29,#8]

	ldr	@acc[6],[sp,#96]	// account for sign from a->re - a->im
	ldp	@acc[0],@acc[1],[sp]
	ldp	@acc[2],@acc[3],[sp,#16]
	ldp	@acc[4],@acc[5],[sp,#32]

	and	@acc[0],@acc[0],@acc[6]
	and	@acc[1],@acc[1],@acc[6]
	and	@acc[2],@acc[2],@acc[6]
	and	@acc[3],@acc[3],@acc[6]
	and	@acc[4],@acc[4],@acc[6]
	and	@acc[5],@acc[5],@acc[6]

	subs	@a[0],@a[0],@acc[0]
	sbcs	@a[1],@a[1],@acc[1]
	sbcs	@a[2],@a[2],@acc[2]
	sbcs	@a[3],@a[3],@acc[3]
	sbcs	@a[4],@a[4],@acc[4]
	sbcs	@a[5],@a[5],@acc[5]
	sbc	@acc[6],xzr,xzr

	and	@acc[0],@mod[0],@acc[6]
	and	@acc[1],@mod[1],@acc[6]
	and	@acc[2],@mod[2],@acc[6]
	and	@acc[3],@mod[3],@acc[6]
	and	@acc[4],@mod[4],@acc[6]
	and	@acc[5],@mod[5],@acc[6]

	adds	@a[0],@a[0],@acc[0]
	adcs	@a[1],@a[1],@acc[1]
	adcs	@a[2],@a[2],@acc[2]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adc	@a[5],@a[5],@acc[5]

	stp	@a[0],@a[1],[$b_ptr]	// ret->re
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	add	sp,sp,#112
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_mont_382x,.-sqr_mont_382x

// __mul_mont_383_nonred: internal helper, interleaved multiply and
// Montgomery reduce with no final conditional subtraction.
//
// In:  the a[] registers hold the first operand, bi holds limb 0 of the
//      second operand and b_ptr points at that operand (limbs 1..5 are
//      loaded from it), the mod[] registers hold the modulus, and the n0
//      register holds the Montgomery factor. The caller's frame must also
//      hold n0 at [x29,#96] and r_ptr at [x29,#104].
// Out: the six result limbs are left in a[], n0 is restored, and the b_ptr
//      register is loaded with the r_ptr value saved at [x29,#104].
//
// Per the register map at the top of this file, tmp[3..5] are x0, x1 and
// x3, so r_ptr, a_ptr and n_ptr are destroyed. The n0 register is
// overwritten with the per-round multiplier and reloaded from the frame
// every round.
.type	__mul_mont_383_nonred,%function
.align	5
__mul_mont_383_nonred:
	mul	@acc[0],@a[0],$bi
	mul	@acc[1],@a[1],$bi
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi
	mul	@acc[4],@a[4],$bi
	mul	@acc[5],@a[5],$bi
	mul	$n0,$n0,@acc[0]

	umulh	@tmp[0],@a[0],$bi
	umulh	@tmp[1],@a[1],$bi
	umulh	@tmp[2],@a[2],$bi
	umulh	@tmp[3],@a[3],$bi
	umulh	@tmp[4],@a[4],$bi
	umulh	@tmp[5],@a[5],$bi

	adds	@acc[1],@acc[1],@tmp[0]
	mul	@tmp[0],@mod[0],$n0
	adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	adc	@acc[6],xzr, @tmp[5]
	mul	@tmp[5],@mod[5],$n0
___
# Rounds 1..5: finish the previous round's reduction (low, then high halves
# of mod*m, dropping one limb), then add a*b[$i] and start the next reduction.
for ($i=1;$i<6;$i++) {
$code.=<<___;
	ldr	$bi,[$b_ptr,8*$i]
	adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	umulh	@tmp[5],@mod[5],$n0
	adc	@acc[6],@acc[6],xzr

	ldr	$n0,[x29,#96]
	adds	@acc[0],@acc[1],@tmp[0]
	mul	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[2],@tmp[1]
	mul	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[3],@tmp[2]
	mul	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[4],@tmp[3]
	mul	@tmp[3],@a[3],$bi
	adcs	@acc[4],@acc[5],@tmp[4]
	mul	@tmp[4],@a[4],$bi
	adcs	@acc[5],@acc[6],@tmp[5]
	mul	@tmp[5],@a[5],$bi
	adc	@acc[6],xzr,xzr

	adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@tmp[2]
	mul	$n0,$n0,@acc[0]
	umulh	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@tmp[4]
	umulh	@tmp[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@tmp[5]
	umulh	@tmp[5],@a[5],$bi
	adc	@acc[6],@acc[6],xzr

	adds	@acc[1],@acc[1],@tmp[0]
	mul	@tmp[0],@mod[0],$n0
	adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	adc	@acc[6],@acc[6],@tmp[5]
	mul	@tmp[5],@mod[5],$n0
___
}
# Tail: complete the last reduction round and hand the result back in a[].
$code.=<<___;
	adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	umulh	@tmp[5],@mod[5],$n0
	adc	@acc[6],@acc[6],xzr
	ldp	$n0,$b_ptr,[x29,#96]		// pull r_ptr

	adds	@a[0],@acc[1],@tmp[0]
	adcs	@a[1],@acc[2],@tmp[1]
	adcs	@a[2],@acc[3],@tmp[2]
	adcs	@a[3],@acc[4],@tmp[3]
	adcs	@a[4],@acc[5],@tmp[4]
	adcs	@a[5],@acc[6],@tmp[5]

	ret
.size	__mul_mont_383_nonred,.-__mul_mont_383_nonred

// sgn0_pty_mont_384: parity and "sign" of a value held in Montgomery form.
//
// In:  first argument register = pointer to the six-limb value, second =
//      pointer to the modulus, third = n0. The registers are shuffled
//      below into the layout __mul_by_1_mont_384 expects.
// Out: x0 bit 0 = lowest bit of the value after __mul_by_1_mont_384,
//      x0 bit 1 = 1 when twice that value minus the modulus does not
//      borrow (i.e. 2*value >= modulus), 0 otherwise.
.globl	sgn0_pty_mont_384
.hidden	sgn0_pty_mont_384
.type	sgn0_pty_mont_384,%function
.align	5
sgn0_pty_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	mov	$n0,$b_ptr
	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]
	mov	$a_ptr,$r_ptr

	bl	__mul_by_1_mont_384
	ldr	x30,[x29,#8]

	and	$r_ptr,@a[0],#1		// parity
	adds	@a[0],@a[0],@a[0]	// double, carry ends up in bi
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$bi,xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$bi,$bi,xzr		// all-ones if the subtraction borrowed

	mvn	$bi,$bi
	and	$bi,$bi,#2
	orr	$r_ptr,$r_ptr,$bi	// pack sign and parity

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sgn0_pty_mont_384,.-sgn0_pty_mont_384

// sgn0_pty_mont_384x: parity and "sign" of a re/im pair held in
// Montgomery form.
//
// In:  first argument register = pointer to the pair (re at offset 0, im
//      at offset 48), second = pointer to the modulus, third = n0.
// Each half goes through __mul_by_1_mont_384 and then the same
// parity / double-and-compare sequence as sgn0_pty_mont_384. While
// doubling, the limbs of each half are OR-ed together to provide a
// zero test.
// Out: x0 bit 0 = parity of re, or of im when re is zero;
//      x0 bit 1 = sign of im, or of re when im is zero.
.globl	sgn0_pty_mont_384x
.hidden	sgn0_pty_mont_384x
.type	sgn0_pty_mont_384x,%function
.align	5
sgn0_pty_mont_384x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	mov	$n0,$b_ptr
	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]
	mov	$a_ptr,$r_ptr

	bl	__mul_by_1_mont_384
	add	$a_ptr,$a_ptr,#48	// advance to a->im for the second call

	and	$b_ptr,@a[0],#1		// parity of a->re
	orr	$n_ptr,@a[0],@a[1]	// n_ptr collects "a->re != 0"
	adds	@a[0],@a[0],@a[0]
	orr	$n_ptr,$n_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	orr	$n_ptr,$n_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	orr	$n_ptr,$n_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	orr	$n_ptr,$n_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$bi,xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$bi,$bi,xzr

	mvn	$bi,$bi
	and	$bi,$bi,#2
	orr	$b_ptr,$b_ptr,$bi	// b_ptr = sign and parity of a->re

	bl	__mul_by_1_mont_384
	ldr	x30,[x29,#8]

	and	$r_ptr,@a[0],#1		// parity of a->im
	orr	$a_ptr,@a[0],@a[1]	// a_ptr collects "a->im != 0"
	adds	@a[0],@a[0],@a[0]
	orr	$a_ptr,$a_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	orr	$a_ptr,$a_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	orr	$a_ptr,$a_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	orr	$a_ptr,$a_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$bi,xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$bi,$bi,xzr

	mvn	$bi,$bi
	and	$bi,$bi,#2
	orr	$r_ptr,$r_ptr,$bi	// r_ptr = sign and parity of a->im

	cmp	$n_ptr,#0
	csel	$n_ptr,$r_ptr,$b_ptr,eq	// a->re==0? prty(a->im) : prty(a->re)

	cmp	$a_ptr,#0
	csel	$a_ptr,$r_ptr,$b_ptr,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)

	and	$n_ptr,$n_ptr,#1
	and	$a_ptr,$a_ptr,#2
	orr	$r_ptr,$a_ptr,$n_ptr	// pack sign and parity

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sgn0_pty_mont_384x,.-sgn0_pty_mont_384x
___

# Disabled alternative to __mul_384: the same 384x384 -> 768-bit product,
# organised column by column (product scanning) rather than row by row.
# Kept for reference only; the "if (0)" means nothing in here is emitted.
#
# @b holds the second operand in $bi plus five of the @mod registers.
# @comba is a three-register column accumulator. After every column it is
# rotated, so the two carry limbs become the new low/middle limbs and the
# register just stored is recycled as the new top limb (cleared by the
# first "adc ...,xzr,xzr" of the next column).
# Each product is issued one step ahead of the addition that consumes it,
# alternating between the @acc[0..1] and @acc[2..3] register pairs.
if (0) {
my @b = ($bi, @mod[0..4]);
my @comba = @acc[4..6];

# column 0: a[0]*b[0]
$code.=<<___;
.type	__mul_384_comba,%function
.align	5
__mul_384_comba:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@b[0],@b[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@b[2],@b[3],[$b_ptr,#16]
	ldp	@b[4],@b[5],[$b_ptr,#32]

	mul	@comba[0],@a[0],@b[0]
	umulh	@comba[1],@a[0],@b[0]
	mul	@acc[0],@a[1],@b[0]
	umulh	@acc[1],@a[1],@b[0]
	str	@comba[0],[$r_ptr]
___
push(@comba,shift(@comba));
# column 1: a[1]*b[0] + a[0]*b[1]
$code.=<<___;
	mul	@acc[2],@a[0],@b[1]
	umulh	@acc[3],@a[0],@b[1]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],xzr, @acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[2],@b[0]
	umulh	@acc[1],@a[2],@b[0]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#8]
___
push(@comba,shift(@comba));
# column 2: a[2]*b[0] + a[1]*b[1] + a[0]*b[2]
$code.=<<___;
	mul	@acc[2],@a[1],@b[1]
	umulh	@acc[3],@a[1],@b[1]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[0],@b[2]
	umulh	@acc[1],@a[0],@b[2]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[3],@b[0]
	umulh	@acc[3],@a[3],@b[0]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#16]
___
push(@comba,shift(@comba));
# column 3
$code.=<<___;
	mul	@acc[0],@a[2],@b[1]
	umulh	@acc[1],@a[2],@b[1]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[1],@b[2]
	umulh	@acc[3],@a[1],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[0],@b[3]
	umulh	@acc[1],@a[0],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[4],@b[0]
	umulh	@acc[3],@a[4],@b[0]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#24]
___
push(@comba,shift(@comba));
# column 4
$code.=<<___;
	mul	@acc[0],@a[3],@b[1]
	umulh	@acc[1],@a[3],@b[1]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[2],@b[2]
	umulh	@acc[3],@a[2],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[1],@b[3]
	umulh	@acc[1],@a[1],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[0],@b[4]
	umulh	@acc[3],@a[0],@b[4]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[5],@b[0]
	umulh	@acc[1],@a[5],@b[0]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#32]
___
push(@comba,shift(@comba));
# column 5 (the a[5]*b[1] product at the end is issued ahead for column 6)
$code.=<<___;
	mul	@acc[2],@a[4],@b[1]
	umulh	@acc[3],@a[4],@b[1]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[3],@b[2]
	umulh	@acc[1],@a[3],@b[2]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[2],@b[3]
	umulh	@acc[3],@a[2],@b[3]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[1],@b[4]
	umulh	@acc[1],@a[1],@b[4]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[0],@b[5]
	umulh	@acc[3],@a[0],@b[5]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[5],@b[1]
	umulh	@acc[1],@a[5],@b[1]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#40]
___
push(@comba,shift(@comba));
# column 6
$code.=<<___;
	mul	@acc[2],@a[4],@b[2]
	umulh	@acc[3],@a[4],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[3],@b[3]
	umulh	@acc[1],@a[3],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[2],@b[4]
	umulh	@acc[3],@a[2],@b[4]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[1],@b[5]
	umulh	@acc[1],@a[1],@b[5]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[5],@b[2]
	umulh	@acc[3],@a[5],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#48]
___
push(@comba,shift(@comba));
# column 7
$code.=<<___;
	mul	@acc[0],@a[4],@b[3]
	umulh	@acc[1],@a[4],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[3],@b[4]
	umulh	@acc[3],@a[3],@b[4]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[2],@b[5]
	umulh	@acc[1],@a[2],@b[5]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[5],@b[3]
	umulh	@acc[3],@a[5],@b[3]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#56]
___
push(@comba,shift(@comba));
# column 8
$code.=<<___;
	mul	@acc[0],@a[4],@b[4]
	umulh	@acc[1],@a[4],@b[4]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[3],@b[5]
	umulh	@acc[3],@a[3],@b[5]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[5],@b[4]
	umulh	@acc[1],@a[5],@b[4]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#64]
___
push(@comba,shift(@comba));
# column 9
$code.=<<___;
	mul	@acc[2],@a[4],@b[5]
	umulh	@acc[3],@a[4],@b[5]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[5],@b[5]
	umulh	@acc[1],@a[5],@b[5]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#72]
___
push(@comba,shift(@comba));
# columns 10 and 11: a[5]*b[5] plus the remaining carries
$code.=<<___;
	adds	@comba[0],@comba[0],@acc[0]
	adc	@comba[1],@comba[1],@acc[1]
	stp	@comba[0],@comba[1],[$r_ptr,#80]

	ret
.size	__mul_384_comba,.-__mul_384_comba
___
}

# Emit the accumulated assembly template. STDOUT was redirected at the top
# of the script, either into a pipe feeding arm-xlate.pl or into the output
# file.
print $code;

# Check the close: for a pipe this is the only place where a failure of the
# translator process (or a buffered write error) becomes visible. Without
# the check a truncated output file would be left behind with exit status 0.
close STDOUT or die "error closing STDOUT: $!";