410 lines
9.3 KiB
Raku
Executable file
410 lines
9.3 KiB
Raku
Executable file
#!/usr/bin/env perl
|
|
#
|
|
# Copyright Supranational LLC
|
|
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# As for "sparse" in subroutine names, see commentary in the
|
|
# asm/mulx_mont_256-x86_64.pl module.
|
|
|
|
$flavour = shift;
|
|
$output = shift;
|
|
|
|
if ($flavour && $flavour ne "void") {
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
|
die "can't locate arm-xlate.pl";
|
|
|
|
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
|
} else {
|
|
open STDOUT,">$output";
|
|
}
|
|
|
|
($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);
|
|
|
|
@mod=map("x$_",(5..8));
|
|
$bi="x9";
|
|
@a=map("x$_",(10..13));
|
|
@tmp=map("x$_",(14..17));
|
|
@acc=map("x$_",(19..24));
|
|
$m0=$n_ptr;
|
|
|
|
$code.=<<___;
|
|
.text
|
|
|
|
.globl mul_mont_sparse_256
|
|
.hidden mul_mont_sparse_256
|
|
.type mul_mont_sparse_256,%function
|
|
.align 5
|
|
mul_mont_sparse_256:
|
|
stp x29,x30,[sp,#-64]!
|
|
add x29,sp,#0
|
|
stp x19,x20,[sp,#16]
|
|
stp x21,x22,[sp,#32]
|
|
stp x23,x24,[sp,#48]
|
|
|
|
ldp @a[0],@a[1],[$a_ptr]
|
|
ldr $bi, [$b_ptr]
|
|
ldp @a[2],@a[3],[$a_ptr,#16]
|
|
|
|
mul @acc[0],@a[0],$bi
|
|
ldp @mod[0],@mod[1],[$n_ptr]
|
|
mul @acc[1],@a[1],$bi
|
|
ldp @mod[2],@mod[3],[$n_ptr,#16]
|
|
mul @acc[2],@a[2],$bi
|
|
mul @acc[3],@a[3],$bi
|
|
|
|
umulh @tmp[0],@a[0],$bi
|
|
umulh @tmp[1],@a[1],$bi
|
|
mul $m0,$n0,@acc[0]
|
|
umulh @tmp[2],@a[2],$bi
|
|
umulh @tmp[3],@a[3],$bi
|
|
adds @acc[1],@acc[1],@tmp[0]
|
|
//mul @tmp[0],@mod[0],$m0
|
|
adcs @acc[2],@acc[2],@tmp[1]
|
|
mul @tmp[1],@mod[1],$m0
|
|
adcs @acc[3],@acc[3],@tmp[2]
|
|
mul @tmp[2],@mod[2],$m0
|
|
adc @acc[4],xzr, @tmp[3]
|
|
mul @tmp[3],@mod[3],$m0
|
|
___
|
|
for ($i=1;$i<4;$i++) {
|
|
$code.=<<___;
|
|
ldr $bi,[$b_ptr,8*$i]
|
|
subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0]
|
|
umulh @tmp[0],@mod[0],$m0
|
|
adcs @acc[1],@acc[1],@tmp[1]
|
|
umulh @tmp[1],@mod[1],$m0
|
|
adcs @acc[2],@acc[2],@tmp[2]
|
|
umulh @tmp[2],@mod[2],$m0
|
|
adcs @acc[3],@acc[3],@tmp[3]
|
|
umulh @tmp[3],@mod[3],$m0
|
|
adc @acc[4],@acc[4],xzr
|
|
|
|
adds @acc[0],@acc[1],@tmp[0]
|
|
mul @tmp[0],@a[0],$bi
|
|
adcs @acc[1],@acc[2],@tmp[1]
|
|
mul @tmp[1],@a[1],$bi
|
|
adcs @acc[2],@acc[3],@tmp[2]
|
|
mul @tmp[2],@a[2],$bi
|
|
adcs @acc[3],@acc[4],@tmp[3]
|
|
mul @tmp[3],@a[3],$bi
|
|
adc @acc[4],xzr,xzr
|
|
|
|
adds @acc[0],@acc[0],@tmp[0]
|
|
umulh @tmp[0],@a[0],$bi
|
|
adcs @acc[1],@acc[1],@tmp[1]
|
|
umulh @tmp[1],@a[1],$bi
|
|
adcs @acc[2],@acc[2],@tmp[2]
|
|
mul $m0,$n0,@acc[0]
|
|
umulh @tmp[2],@a[2],$bi
|
|
adcs @acc[3],@acc[3],@tmp[3]
|
|
umulh @tmp[3],@a[3],$bi
|
|
adc @acc[4],@acc[4],xzr
|
|
|
|
adds @acc[1],@acc[1],@tmp[0]
|
|
//mul @tmp[0],@mod[0],$m0
|
|
adcs @acc[2],@acc[2],@tmp[1]
|
|
mul @tmp[1],@mod[1],$m0
|
|
adcs @acc[3],@acc[3],@tmp[2]
|
|
mul @tmp[2],@mod[2],$m0
|
|
adc @acc[4],@acc[4],@tmp[3]
|
|
mul @tmp[3],@mod[3],$m0
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0]
|
|
umulh @tmp[0],@mod[0],$m0
|
|
adcs @acc[1],@acc[1],@tmp[1]
|
|
umulh @tmp[1],@mod[1],$m0
|
|
adcs @acc[2],@acc[2],@tmp[2]
|
|
umulh @tmp[2],@mod[2],$m0
|
|
adcs @acc[3],@acc[3],@tmp[3]
|
|
umulh @tmp[3],@mod[3],$m0
|
|
adc @acc[4],@acc[4],xzr
|
|
|
|
adds @acc[0],@acc[1],@tmp[0]
|
|
adcs @acc[1],@acc[2],@tmp[1]
|
|
adcs @acc[2],@acc[3],@tmp[2]
|
|
adcs @acc[3],@acc[4],@tmp[3]
|
|
adc @acc[4],xzr,xzr
|
|
|
|
subs @tmp[0],@acc[0],@mod[0]
|
|
sbcs @tmp[1],@acc[1],@mod[1]
|
|
sbcs @tmp[2],@acc[2],@mod[2]
|
|
sbcs @tmp[3],@acc[3],@mod[3]
|
|
sbcs xzr, @acc[4],xzr
|
|
|
|
csel @acc[0],@acc[0],@tmp[0],lo
|
|
csel @acc[1],@acc[1],@tmp[1],lo
|
|
csel @acc[2],@acc[2],@tmp[2],lo
|
|
csel @acc[3],@acc[3],@tmp[3],lo
|
|
|
|
stp @acc[0],@acc[1],[$r_ptr]
|
|
stp @acc[2],@acc[3],[$r_ptr,#16]
|
|
|
|
ldp x19,x20,[x29,#16]
|
|
ldp x21,x22,[x29,#32]
|
|
ldp x23,x24,[x29,#48]
|
|
ldr x29,[sp],#64
|
|
ret
|
|
.size mul_mont_sparse_256,.-mul_mont_sparse_256
|
|
___
|
|
{
|
|
my @acc = (@a,@acc[0..3]);
|
|
my @a = @mod;
|
|
|
|
$code.=<<___;
|
|
.globl sqr_mont_sparse_256
|
|
.hidden sqr_mont_sparse_256
|
|
.type sqr_mont_sparse_256,%function
|
|
.align 5
|
|
sqr_mont_sparse_256:
|
|
paciasp
|
|
stp x29,x30,[sp,#-48]!
|
|
add x29,sp,#0
|
|
stp x19,x20,[sp,#16]
|
|
stp x21,x22,[sp,#32]
|
|
|
|
ldp @a[0],@a[1],[$a_ptr]
|
|
ldp @a[2],@a[3],[$a_ptr,#16]
|
|
mov $n0,$n_ptr
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
// | | | | | |a1*a0| |
|
|
// | | | | |a2*a0| | |
|
|
// | |a3*a2|a3*a0| | | |
|
|
// | | | |a2*a1| | | |
|
|
// | | |a3*a1| | | | |
|
|
// *| | | | | | | | 2|
|
|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
|
|
// |--+--+--+--+--+--+--+--|
|
|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x]
|
|
//
|
|
// "can't overflow" below mark carrying into high part of
|
|
// multiplication result, which can't overflow, because it
|
|
// can never be all ones.
|
|
|
|
mul @acc[1],@a[1],@a[0] // a[1]*a[0]
|
|
umulh @tmp[1],@a[1],@a[0]
|
|
mul @acc[2],@a[2],@a[0] // a[2]*a[0]
|
|
umulh @tmp[2],@a[2],@a[0]
|
|
mul @acc[3],@a[3],@a[0] // a[3]*a[0]
|
|
umulh @acc[4],@a[3],@a[0]
|
|
|
|
adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication
|
|
mul @tmp[0],@a[2],@a[1] // a[2]*a[1]
|
|
umulh @tmp[1],@a[2],@a[1]
|
|
adcs @acc[3],@acc[3],@tmp[2]
|
|
mul @tmp[2],@a[3],@a[1] // a[3]*a[1]
|
|
umulh @tmp[3],@a[3],@a[1]
|
|
adc @acc[4],@acc[4],xzr // can't overflow
|
|
|
|
mul @acc[5],@a[3],@a[2] // a[3]*a[2]
|
|
umulh @acc[6],@a[3],@a[2]
|
|
|
|
adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication
|
|
mul @acc[0],@a[0],@a[0] // a[0]*a[0]
|
|
adc @tmp[2],@tmp[3],xzr // can't overflow
|
|
|
|
adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication
|
|
umulh @a[0],@a[0],@a[0]
|
|
adcs @acc[4],@acc[4],@tmp[1]
|
|
mul @tmp[1],@a[1],@a[1] // a[1]*a[1]
|
|
adcs @acc[5],@acc[5],@tmp[2]
|
|
umulh @a[1],@a[1],@a[1]
|
|
adc @acc[6],@acc[6],xzr // can't overflow
|
|
|
|
adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2
|
|
mul @tmp[2],@a[2],@a[2] // a[2]*a[2]
|
|
adcs @acc[2],@acc[2],@acc[2]
|
|
umulh @a[2],@a[2],@a[2]
|
|
adcs @acc[3],@acc[3],@acc[3]
|
|
mul @tmp[3],@a[3],@a[3] // a[3]*a[3]
|
|
adcs @acc[4],@acc[4],@acc[4]
|
|
umulh @a[3],@a[3],@a[3]
|
|
adcs @acc[5],@acc[5],@acc[5]
|
|
adcs @acc[6],@acc[6],@acc[6]
|
|
adc @acc[7],xzr,xzr
|
|
|
|
adds @acc[1],@acc[1],@a[0] // +a[i]*a[i]
|
|
adcs @acc[2],@acc[2],@tmp[1]
|
|
adcs @acc[3],@acc[3],@a[1]
|
|
adcs @acc[4],@acc[4],@tmp[2]
|
|
adcs @acc[5],@acc[5],@a[2]
|
|
adcs @acc[6],@acc[6],@tmp[3]
|
|
adc @acc[7],@acc[7],@a[3]
|
|
|
|
bl __mul_by_1_mont_256
|
|
ldr x30,[x29,#8]
|
|
|
|
adds @acc[0],@acc[0],@acc[4] // accumulate upper half
|
|
adcs @acc[1],@acc[1],@acc[5]
|
|
adcs @acc[2],@acc[2],@acc[6]
|
|
adcs @acc[3],@acc[3],@acc[7]
|
|
adc @acc[4],xzr,xzr
|
|
|
|
subs @tmp[0],@acc[0],@mod[0]
|
|
sbcs @tmp[1],@acc[1],@mod[1]
|
|
sbcs @tmp[2],@acc[2],@mod[2]
|
|
sbcs @tmp[3],@acc[3],@mod[3]
|
|
sbcs xzr, @acc[4],xzr
|
|
|
|
csel @acc[0],@acc[0],@tmp[0],lo
|
|
csel @acc[1],@acc[1],@tmp[1],lo
|
|
csel @acc[2],@acc[2],@tmp[2],lo
|
|
csel @acc[3],@acc[3],@tmp[3],lo
|
|
|
|
stp @acc[0],@acc[1],[$r_ptr]
|
|
stp @acc[2],@acc[3],[$r_ptr,#16]
|
|
|
|
ldp x19,x20,[x29,#16]
|
|
ldp x21,x22,[x29,#32]
|
|
ldr x29,[sp],#48
|
|
autiasp
|
|
ret
|
|
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
|
|
___
|
|
}
|
|
{
|
|
my @a = (@a, $bi);
|
|
|
|
$code.=<<___;
|
|
.globl from_mont_256
|
|
.hidden from_mont_256
|
|
.type from_mont_256,%function
|
|
.align 5
|
|
from_mont_256:
|
|
paciasp
|
|
stp x29,x30,[sp,#-16]!
|
|
add x29,sp,#0
|
|
|
|
mov $n0,$n_ptr
|
|
ldp @a[0],@a[1],[$a_ptr]
|
|
ldp @a[2],@a[3],[$a_ptr,#16]
|
|
|
|
bl __mul_by_1_mont_256
|
|
ldr x30,[x29,#8]
|
|
|
|
subs @tmp[0],@a[0],@mod[0]
|
|
sbcs @tmp[1],@a[1],@mod[1]
|
|
sbcs @tmp[2],@a[2],@mod[2]
|
|
sbcs @tmp[3],@a[3],@mod[3]
|
|
|
|
csel @a[0],@a[0],@tmp[0],lo
|
|
csel @a[1],@a[1],@tmp[1],lo
|
|
csel @a[2],@a[2],@tmp[2],lo
|
|
csel @a[3],@a[3],@tmp[3],lo
|
|
|
|
stp @a[0],@a[1],[$r_ptr]
|
|
stp @a[2],@a[3],[$r_ptr,#16]
|
|
|
|
ldr x29,[sp],#16
|
|
autiasp
|
|
ret
|
|
.size from_mont_256,.-from_mont_256
|
|
|
|
.globl redc_mont_256
|
|
.hidden redc_mont_256
|
|
.type redc_mont_256,%function
|
|
.align 5
|
|
redc_mont_256:
|
|
paciasp
|
|
stp x29,x30,[sp,#-16]!
|
|
add x29,sp,#0
|
|
|
|
mov $n0,$n_ptr
|
|
ldp @a[0],@a[1],[$a_ptr]
|
|
ldp @a[2],@a[3],[$a_ptr,#16]
|
|
|
|
bl __mul_by_1_mont_256
|
|
ldr x30,[x29,#8]
|
|
|
|
ldp @tmp[0],@tmp[1],[$a_ptr,#32]
|
|
ldp @tmp[2],@tmp[3],[$a_ptr,#48]
|
|
|
|
adds @a[0],@a[0],@tmp[0]
|
|
adcs @a[1],@a[1],@tmp[1]
|
|
adcs @a[2],@a[2],@tmp[2]
|
|
adcs @a[3],@a[3],@tmp[3]
|
|
adc @a[4],xzr,xzr
|
|
|
|
subs @tmp[0],@a[0],@mod[0]
|
|
sbcs @tmp[1],@a[1],@mod[1]
|
|
sbcs @tmp[2],@a[2],@mod[2]
|
|
sbcs @tmp[3],@a[3],@mod[3]
|
|
sbcs xzr, @a[4],xzr
|
|
|
|
csel @a[0],@a[0],@tmp[0],lo
|
|
csel @a[1],@a[1],@tmp[1],lo
|
|
csel @a[2],@a[2],@tmp[2],lo
|
|
csel @a[3],@a[3],@tmp[3],lo
|
|
|
|
stp @a[0],@a[1],[$r_ptr]
|
|
stp @a[2],@a[3],[$r_ptr,#16]
|
|
|
|
ldr x29,[sp],#16
|
|
autiasp
|
|
ret
|
|
.size redc_mont_256,.-redc_mont_256
|
|
|
|
.type __mul_by_1_mont_256,%function
|
|
.align 5
|
|
__mul_by_1_mont_256:
|
|
mul $m0,$n0,@a[0]
|
|
ldp @mod[0],@mod[1],[$b_ptr]
|
|
ldp @mod[2],@mod[3],[$b_ptr,#16]
|
|
___
|
|
for ($i=1;$i<4;$i++) {
|
|
$code.=<<___;
|
|
//mul @tmp[0],@mod[0],$m0
|
|
mul @tmp[1],@mod[1],$m0
|
|
mul @tmp[2],@mod[2],$m0
|
|
mul @tmp[3],@mod[3],$m0
|
|
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
|
|
umulh @tmp[0],@mod[0],$m0
|
|
adcs @a[1],@a[1],@tmp[1]
|
|
umulh @tmp[1],@mod[1],$m0
|
|
adcs @a[2],@a[2],@tmp[2]
|
|
umulh @tmp[2],@mod[2],$m0
|
|
adcs @a[3],@a[3],@tmp[3]
|
|
umulh @tmp[3],@mod[3],$m0
|
|
adc @a[4],xzr,xzr
|
|
|
|
adds @a[0],@a[1],@tmp[0]
|
|
adcs @a[1],@a[2],@tmp[1]
|
|
adcs @a[2],@a[3],@tmp[2]
|
|
mul $m0,$n0,@a[0]
|
|
adc @a[3],@a[4],@tmp[3]
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
//mul @tmp[0],@mod[0],$m0
|
|
mul @tmp[1],@mod[1],$m0
|
|
mul @tmp[2],@mod[2],$m0
|
|
mul @tmp[3],@mod[3],$m0
|
|
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
|
|
umulh @tmp[0],@mod[0],$m0
|
|
adcs @a[1],@a[1],@tmp[1]
|
|
umulh @tmp[1],@mod[1],$m0
|
|
adcs @a[2],@a[2],@tmp[2]
|
|
umulh @tmp[2],@mod[2],$m0
|
|
adcs @a[3],@a[3],@tmp[3]
|
|
umulh @tmp[3],@mod[3],$m0
|
|
adc @a[4],xzr,xzr
|
|
|
|
adds @a[0],@a[1],@tmp[0]
|
|
adcs @a[1],@a[2],@tmp[1]
|
|
adcs @a[2],@a[3],@tmp[2]
|
|
adc @a[3],@a[4],@tmp[3]
|
|
|
|
ret
|
|
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256
|
|
___
|
|
}
|
|
|
|
print $code;
|
|
|
|
close STDOUT;
|