commit 943c07066e ("initial stuff")
99 changed files with 58786 additions and 0 deletions
412	blst/asm/add_mod_256-armv8.pl	(new executable file)
@@ -0,0 +1,412 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

$flavour = shift;
$output = shift;

if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}

($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);

@mod=map("x$_",(4..7));
@a=map("x$_",(8..11));
@b=map("x$_",(12..15));
@t=map("x$_",(16,17,1..3));

$code.=<<___;
.text

.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,%function
.align 5
add_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]

ldp @a[2],@a[3],[$a_ptr,#16]
adds @a[0],@a[0],@b[0]
ldp @b[2],@b[3],[$b_ptr,#16]
adcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
adcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr

subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr

csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@t[3],lo
stp @a[2],@a[3],[$r_ptr,#16]

ret
.size add_mod_256,.-add_mod_256

.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,%function
.align 5
mul_by_3_mod_256:
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]

adds @a[0],@b[0],@b[0]
ldp @mod[0],@mod[1],[$b_ptr]
adcs @a[1],@b[1],@b[1]
ldp @mod[2],@mod[3],[$b_ptr,#16]
adcs @a[2],@b[2],@b[2]
adcs @a[3],@b[3],@b[3]
adc @t[4],xzr,xzr

subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr

csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
csel @a[3],@a[3],@t[3],lo

adds @a[0],@a[0],@b[0]
adcs @a[1],@a[1],@b[1]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr

subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr

csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@t[3],lo
stp @a[2],@a[3],[$r_ptr,#16]

ret
.size mul_by_3_mod_256,.-mul_by_3_mod_256

.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,%function
.align 5
lshift_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]

.Loop_lshift_mod_256:
adds @a[0],@a[0],@a[0]
sub $b_ptr,$b_ptr,#1
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adc @t[4],xzr,xzr

subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr

csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo

cbnz $b_ptr,.Loop_lshift_mod_256

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]

ret
.size lshift_mod_256,.-lshift_mod_256

.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,%function
.align 5
rshift_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]

.Loop_rshift:
adds @b[0],@a[0],@mod[0]
sub $b_ptr,$b_ptr,#1
adcs @b[1],@a[1],@mod[1]
adcs @b[2],@a[2],@mod[2]
adcs @b[3],@a[3],@mod[3]
adc @t[4],xzr,xzr
tst @a[0],#1

csel @b[0],@b[0],@a[0],ne
csel @b[1],@b[1],@a[1],ne
csel @b[2],@b[2],@a[2],ne
csel @b[3],@b[3],@a[3],ne
csel @t[4],@t[4],xzr,ne

extr @a[0],@b[1],@b[0],#1
extr @a[1],@b[2],@b[1],#1
extr @a[2],@b[3],@b[2],#1
extr @a[3],@t[4],@b[3],#1

cbnz $b_ptr,.Loop_rshift

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]

ret
.size rshift_mod_256,.-rshift_mod_256

.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,%function
.align 5
cneg_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @mod[0],@mod[1],[$n_ptr]

ldp @a[2],@a[3],[$a_ptr,#16]
subs @b[0],@mod[0],@a[0]
ldp @mod[2],@mod[3],[$n_ptr,#16]
orr @mod[0],@a[0],@a[1]
sbcs @b[1],@mod[1],@a[1]
orr @mod[1],@a[2],@a[3]
sbcs @b[2],@mod[2],@a[2]
orr @t[4],@mod[0],@mod[1]
sbc @b[3],@mod[3],@a[3]

cmp @t[4],#0
csetm @t[4],ne
ands $b_ptr,$b_ptr,@t[4]

csel @a[0],@a[0],@b[0],eq
csel @a[1],@a[1],@b[1],eq
csel @a[2],@a[2],@b[2],eq
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@b[3],eq
stp @a[2],@a[3],[$r_ptr,#16]

ret
.size cneg_mod_256,.-cneg_mod_256

.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,%function
.align 5
sub_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]

ldp @a[2],@a[3],[$a_ptr,#16]
subs @a[0],@a[0],@b[0]
ldp @b[2],@b[3],[$b_ptr,#16]
sbcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
sbcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
sbcs @a[3],@a[3],@b[3]
sbc @t[4],xzr,xzr

and @mod[0],@mod[0],@t[4]
and @mod[1],@mod[1],@t[4]
adds @a[0],@a[0],@mod[0]
and @mod[2],@mod[2],@t[4]
adcs @a[1],@a[1],@mod[1]
and @mod[3],@mod[3],@t[4]
adcs @a[2],@a[2],@mod[2]
stp @a[0],@a[1],[$r_ptr]
adc @a[3],@a[3],@mod[3]
stp @a[2],@a[3],[$r_ptr,#16]

ret
.size sub_mod_256,.-sub_mod_256

.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,%function
.align 5
check_mod_256:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]

#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif

subs xzr,@a[0],@mod[0]
sbcs xzr,@a[1],@mod[1]
orr @a[0],@a[0],@a[1]
sbcs xzr,@a[2],@mod[2]
orr @a[0],@a[0],@a[2]
sbcs xzr,@a[3],@mod[3]
orr @a[0],@a[0],@a[3]
sbc $a_ptr,xzr,xzr

cmp @a[0],#0
mov x0,#1
csel x0,x0,xzr,ne
and x0,x0,$a_ptr

ret
.size check_mod_256,.-check_mod_256

.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,%function
.align 5
add_n_check_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]

#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @b[0],@b[0]
rev @a[1],@a[1]
rev @b[1],@b[1]
rev @a[2],@a[2]
rev @b[2],@b[2]
rev @a[3],@a[3]
rev @b[3],@b[3]
#endif

adds @a[0],@a[0],@b[0]
ldp @mod[0],@mod[1],[$n_ptr]
adcs @a[1],@a[1],@b[1]
ldp @mod[2],@mod[3],[$n_ptr,#16]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr

subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr

csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
csel @a[3],@a[3],@t[3],lo

orr @t[0], @a[0], @a[1]
orr @t[1], @a[2], @a[3]
orr @t[0], @t[0], @t[1]

#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]

mov @t[1], #1
cmp @t[0], #0
csel x0, @t[1], xzr, ne

ret
.size add_n_check_mod_256,.-add_n_check_mod_256

.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,%function
.align 5
sub_n_check_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]

#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @b[0],@b[0]
rev @a[1],@a[1]
rev @b[1],@b[1]
rev @a[2],@a[2]
rev @b[2],@b[2]
rev @a[3],@a[3]
rev @b[3],@b[3]
#endif

subs @a[0],@a[0],@b[0]
sbcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
sbcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
sbcs @a[3],@a[3],@b[3]
sbc @t[4],xzr,xzr

and @mod[0],@mod[0],@t[4]
and @mod[1],@mod[1],@t[4]
adds @a[0],@a[0],@mod[0]
and @mod[2],@mod[2],@t[4]
adcs @a[1],@a[1],@mod[1]
and @mod[3],@mod[3],@t[4]
adcs @a[2],@a[2],@mod[2]
adc @a[3],@a[3],@mod[3]

orr @t[0], @a[0], @a[1]
orr @t[1], @a[2], @a[3]
orr @t[0], @t[0], @t[1]

#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]

mov @t[1], #1
cmp @t[0], #0
csel x0, @t[1], xzr, ne

ret
.size sub_n_check_mod_256,.-sub_n_check_mod_256
___

print $code;

close STDOUT;
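
Every reduction in the file above follows one branchless pattern: accumulate with an adds/adcs chain, catch the carry out of the top limb in a spare register (@t[4]), speculatively subtract the modulus, and let the final borrow drive csel so that either the reduced or the unreduced value is stored without a data-dependent branch. The sketch below is a reference model of add_mod_256's semantics in plain Perl, not part of the commit; Math::BigInt stands in for the limb arithmetic, the helper name is illustrative, and the modulus is the BLS12-381 scalar-field order, used here purely as a test value.

#!/usr/bin/env perl
use strict;
use Math::BigInt;

# add_mod_256 semantics: r = a + b mod n for fully reduced 256-bit inputs.
# The assembly keeps the 257th bit of the sum in @t[4] and performs the
# "subtract n, keep whichever value did not underflow" step with csel.
sub add_mod_256_ref {
    my ($a, $b, $n) = @_;               # Math::BigInt, 0 <= $a,$b < $n
    my $r = $a->copy->badd($b);         # up to 257 bits, like the adc chain
    $r->bsub($n) if $r->bcmp($n) >= 0;  # the conditional-select reduction
    return $r;
}

# Smoke test: (n-1) + 2 mod n == 1, with the BLS12-381 scalar modulus.
my $n = Math::BigInt->new(
    "0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001");
print add_mod_256_ref($n->copy->bsub(1), Math::BigInt->new(2), $n), "\n";
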
547	blst/asm/add_mod_256-x86_64.pl	(new executable file)
@@ -0,0 +1,547 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";

# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx");
$b_ptr = "%rbx";

{ ############################################################## 256 bits add
my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12));

$code.=<<___;
.text

.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,\@function,4,"unwind"
.align 32
add_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

.Loaded_a_add_mod_256:
add 8*0($b_org), @acc[0]
adc 8*1($b_org), @acc[1]
mov @acc[0], @acc[4]
adc 8*2($b_org), @acc[2]
mov @acc[1], @acc[5]
adc 8*3($b_org), @acc[3]
sbb $b_org, $b_org

mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, $b_org

cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
cmovc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
cmovc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size add_mod_256,.-add_mod_256

########################################################################
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,\@function,3,"unwind"
.align 32
mul_by_3_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue

mov $b_org,$n_ptr
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov $a_ptr,$b_org
mov 8*3($a_ptr), @acc[3]

call __lshift_mod_256
mov 0(%rsp),%r12
.cfi_restore %r12
jmp .Loaded_a_add_mod_256

mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size mul_by_3_mod_256,.-mul_by_3_mod_256

.type __lshift_mod_256,\@abi-omnipotent
.align 32
__lshift_mod_256:
add @acc[0], @acc[0]
adc @acc[1], @acc[1]
mov @acc[0], @acc[4]
adc @acc[2], @acc[2]
mov @acc[1], @acc[5]
adc @acc[3], @acc[3]
sbb @acc[8], @acc[8]

mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, @acc[8]

cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
cmovc @acc[6], @acc[2]
cmovc @acc[7], @acc[3]

ret
.size __lshift_mod_256,.-__lshift_mod_256

########################################################################
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,\@function,4,"unwind"
.align 32
lshift_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue

mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

.Loop_lshift_mod_256:
call __lshift_mod_256
dec %edx
jnz .Loop_lshift_mod_256

mov @acc[0], 8*0($r_ptr)
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 0(%rsp),%r12
.cfi_restore %r12
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size lshift_mod_256,.-lshift_mod_256

########################################################################
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,\@function,4,"unwind"
.align 32
rshift_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($a_ptr), @acc[7]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

.Loop_rshift_mod_256:
mov @acc[7], @acc[0]
and \$1, @acc[7]
mov 8*0($n_ptr), @acc[4]
neg @acc[7]
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]

and @acc[7], @acc[4]
and @acc[7], @acc[5]
and @acc[7], @acc[6]
and 8*3($n_ptr), @acc[7]

add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
sbb @acc[4], @acc[4]

shr \$1, @acc[0]
mov @acc[1], @acc[7]
shr \$1, @acc[1]
mov @acc[2], @acc[6]
shr \$1, @acc[2]
mov @acc[3], @acc[5]
shr \$1, @acc[3]

shl \$63, @acc[7]
shl \$63, @acc[6]
or @acc[0], @acc[7]
shl \$63, @acc[5]
or @acc[6], @acc[1]
shl \$63, @acc[4]
or @acc[5], @acc[2]
or @acc[4], @acc[3]

dec %edx
jnz .Loop_rshift_mod_256

mov @acc[7], 8*0($r_ptr)
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size rshift_mod_256,.-rshift_mod_256

########################################################################
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,\@function,4,"unwind"
.align 32
cneg_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue

mov 8*0($a_ptr), @acc[8] # load a[0:3]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov @acc[8], @acc[0]
mov 8*3($a_ptr), @acc[3]
or @acc[1], @acc[8]
or @acc[2], @acc[8]
or @acc[3], @acc[8]
mov \$-1, @acc[7]

mov 8*0($n_ptr), @acc[4] # load n[0:3]
cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
and @acc[8], @acc[4] # n[0:3] &= mask
mov 8*3($n_ptr), @acc[7]
and @acc[8], @acc[5]
and @acc[8], @acc[6]
and @acc[8], @acc[7]

sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0
sbb @acc[1], @acc[5]
sbb @acc[2], @acc[6]
sbb @acc[3], @acc[7]

or $b_org, $b_org # check condition flag

cmovz @acc[0], @acc[4] # flag ? n[0:3]-a[0:3] : a[0:3]
cmovz @acc[1], @acc[5]
mov @acc[4], 8*0($r_ptr)
cmovz @acc[2], @acc[6]
mov @acc[5], 8*1($r_ptr)
cmovz @acc[3], @acc[7]
mov @acc[6], 8*2($r_ptr)
mov @acc[7], 8*3($r_ptr)

mov 0(%rsp),%r12
.cfi_restore %r12
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size cneg_mod_256,.-cneg_mod_256

########################################################################
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,\@function,4,"unwind"
.align 32
sub_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

sub 8*0($b_org), @acc[0]
mov 8*0($n_ptr), @acc[4]
sbb 8*1($b_org), @acc[1]
mov 8*1($n_ptr), @acc[5]
sbb 8*2($b_org), @acc[2]
mov 8*2($n_ptr), @acc[6]
sbb 8*3($b_org), @acc[3]
mov 8*3($n_ptr), @acc[7]
sbb $b_org, $b_org

and $b_org, @acc[4]
and $b_org, @acc[5]
and $b_org, @acc[6]
and $b_org, @acc[7]

add @acc[4], @acc[0]
adc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
adc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
adc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size sub_mod_256,.-sub_mod_256

########################################################################
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,\@function,2,"unwind"
.align 32
check_mod_256:
.cfi_startproc
mov 8*0($r_ptr), %rax
mov 8*1($r_ptr), @acc[1]
mov 8*2($r_ptr), @acc[2]
mov 8*3($r_ptr), @acc[3]

mov %rax, @acc[0] # see if it's zero
or @acc[1], %rax
or @acc[2], %rax
or @acc[3], %rax

sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow?
sbb 8*1($a_ptr), @acc[1]
sbb 8*2($a_ptr), @acc[2]
sbb 8*3($a_ptr), @acc[3]
sbb $a_ptr, $a_ptr

mov \$1, %rdx
cmp \$0, %rax
cmovne %rdx, %rax
and $a_ptr, %rax
.cfi_epilogue
ret
.cfi_endproc
.size check_mod_256,.-check_mod_256

########################################################################
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,\@function,4,"unwind"
.align 32
add_n_check_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

add 8*0($b_org), @acc[0]
adc 8*1($b_org), @acc[1]
mov @acc[0], @acc[4]
adc 8*2($b_org), @acc[2]
mov @acc[1], @acc[5]
adc 8*3($b_org), @acc[3]
sbb $b_org, $b_org

mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, $b_org

cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
cmovc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
cmovc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

or @acc[1], @acc[0]
or @acc[3], @acc[2]
or @acc[2], @acc[0]
mov \$1, %rax
cmovz @acc[0], %rax

mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size add_n_check_mod_256,.-add_n_check_mod_256

########################################################################
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,\@function,4,"unwind"
.align 32
sub_n_check_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

sub 8*0($b_org), @acc[0]
mov 8*0($n_ptr), @acc[4]
sbb 8*1($b_org), @acc[1]
mov 8*1($n_ptr), @acc[5]
sbb 8*2($b_org), @acc[2]
mov 8*2($n_ptr), @acc[6]
sbb 8*3($b_org), @acc[3]
mov 8*3($n_ptr), @acc[7]
sbb $b_org, $b_org

and $b_org, @acc[4]
and $b_org, @acc[5]
and $b_org, @acc[6]
and $b_org, @acc[7]

add @acc[4], @acc[0]
adc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
adc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
adc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

or @acc[1], @acc[0]
or @acc[3], @acc[2]
or @acc[2], @acc[0]
mov \$1, %rax
cmovz @acc[0], %rax

mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size sub_n_check_mod_256,.-sub_n_check_mod_256
___
}

print $code;
close STDOUT;
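
A note on cneg_mod_256 above: its inline comments spell out the trick. The routine must return flag ? n - a : a, but negating zero has to give zero rather than n, so the code first ORs the limbs of a into a "non-zero" mask and ANDs that mask into the computation (cmovnz/and here, csetm/ands in the ARMv8 file). The Perl fragment below models only the semantics under those assumptions; the function name is illustrative and the snippet is not part of the commit.

use strict;
use Math::BigInt;

# cneg_mod_256 semantics: conditionally negate a modulo n.
sub cneg_mod_256_ref {
    my ($a, $flag, $n) = @_;                     # 0 <= $a < $n
    return $a->copy if !$flag or $a->is_zero();  # masked-out, or a == 0
    return $n->copy->bsub($a);                   # n - a, the additive inverse
}
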
872	blst/asm/add_mod_384-armv8.pl	(new executable file)
@@ -0,0 +1,872 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

$flavour = shift;
$output = shift;

if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}

($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);

@mod=map("x$_",(4..9));
@a=map("x$_",(10..15));
@b=map("x$_",(16,17,19..22));
$carry=$n_ptr;

$code.=<<___;
.text

.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
add_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]

bl __add_mod_384
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size add_mod_384,.-add_mod_384

.type __add_mod_384,%function
.align 5
__add_mod_384:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @b[4],@b[5],[$b_ptr,#32]

__add_mod_384_ab_are_loaded:
adds @a[0],@a[0],@b[0]
adcs @a[1],@a[1],@b[1]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adcs @a[4],@a[4],@b[4]
adcs @a[5],@a[5],@b[5]
adc $carry,xzr,xzr

subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs @b[4],@a[4],@mod[4]
sbcs @b[5],@a[5],@mod[5]
sbcs xzr,$carry,xzr

csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
csel @a[4],@a[4],@b[4],lo
csel @a[5],@a[5],@b[5],lo

ret
.size __add_mod_384,.-__add_mod_384

.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
add_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]

bl __add_mod_384

stp @a[0],@a[1],[$r_ptr]
add $a_ptr,$a_ptr,#48
stp @a[2],@a[3],[$r_ptr,#16]
add $b_ptr,$b_ptr,#48
stp @a[4],@a[5],[$r_ptr,#32]

bl __add_mod_384
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size add_mod_384x,.-add_mod_384x

.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
rshift_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]

.Loop_rshift_mod_384:
sub $b_ptr,$b_ptr,#1
bl __rshift_mod_384
cbnz $b_ptr,.Loop_rshift_mod_384

ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size rshift_mod_384,.-rshift_mod_384

.type __rshift_mod_384,%function
.align 5
__rshift_mod_384:
sbfx @b[5],@a[0],#0,#1
and @b[0],@b[5],@mod[0]
and @b[1],@b[5],@mod[1]
adds @a[0],@a[0],@b[0]
and @b[2],@b[5],@mod[2]
adcs @a[1],@a[1],@b[1]
and @b[3],@b[5],@mod[3]
adcs @a[2],@a[2],@b[2]
and @b[4],@b[5],@mod[4]
adcs @a[3],@a[3],@b[3]
and @b[5],@b[5],@mod[5]
adcs @a[4],@a[4],@b[4]
extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1
adcs @a[5],@a[5],@b[5]
extr @a[1],@a[2],@a[1],#1
adc @b[5],xzr,xzr
extr @a[2],@a[3],@a[2],#1
extr @a[3],@a[4],@a[3],#1
extr @a[4],@a[5],@a[4],#1
extr @a[5],@b[5],@a[5],#1
ret
.size __rshift_mod_384,.-__rshift_mod_384

.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
div_by_2_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]

bl __rshift_mod_384

ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size div_by_2_mod_384,.-div_by_2_mod_384

.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
lshift_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]

.Loop_lshift_mod_384:
sub $b_ptr,$b_ptr,#1
bl __lshift_mod_384
cbnz $b_ptr,.Loop_lshift_mod_384

ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size lshift_mod_384,.-lshift_mod_384

.type __lshift_mod_384,%function
.align 5
__lshift_mod_384:
adds @a[0],@a[0],@a[0]
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc $carry,xzr,xzr

subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs @b[4],@a[4],@mod[4]
sbcs @b[5],@a[5],@mod[5]
sbcs xzr,$carry,xzr

csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
csel @a[4],@a[4],@b[4],lo
csel @a[5],@a[5],@b[5],lo

ret
.size __lshift_mod_384,.-__lshift_mod_384

.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
mul_by_3_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]

bl __lshift_mod_384

ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]

bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384

.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
mul_by_8_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]

bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384

.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
mul_by_3_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]

bl __lshift_mod_384

ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]

bl __add_mod_384_ab_are_loaded

stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]

bl __lshift_mod_384

ldp @b[0],@b[1],[$a_ptr,#48]
ldp @b[2],@b[3],[$a_ptr,#64]
ldp @b[4],@b[5],[$a_ptr,#80]

bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x

.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
mul_by_8_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]

ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]

bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384

stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]

bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x

.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
cneg_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @a[0],@a[1],[$a_ptr]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[2],@mod[3],[$n_ptr,#16]

subs @b[0],@mod[0],@a[0]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[4],@mod[5],[$n_ptr,#32]
orr $carry,@a[0],@a[1]
sbcs @b[1],@mod[1],@a[1]
orr $carry,$carry,@a[2]
sbcs @b[2],@mod[2],@a[2]
orr $carry,$carry,@a[3]
sbcs @b[3],@mod[3],@a[3]
orr $carry,$carry,@a[4]
sbcs @b[4],@mod[4],@a[4]
orr $carry,$carry,@a[5]
sbc @b[5],@mod[5],@a[5]

cmp $carry,#0
csetm $carry,ne
ands $b_ptr,$b_ptr,$carry

csel @a[0],@a[0],@b[0],eq
csel @a[1],@a[1],@b[1],eq
csel @a[2],@a[2],@b[2],eq
csel @a[3],@a[3],@b[3],eq
stp @a[0],@a[1],[$r_ptr]
csel @a[4],@a[4],@b[4],eq
stp @a[2],@a[3],[$r_ptr,#16]
csel @a[5],@a[5],@b[5],eq
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size cneg_mod_384,.-cneg_mod_384

.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
sub_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]

bl __sub_mod_384
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sub_mod_384,.-sub_mod_384

.type __sub_mod_384,%function
.align 5
__sub_mod_384:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @b[4],@b[5],[$b_ptr,#32]

subs @a[0],@a[0],@b[0]
sbcs @a[1],@a[1],@b[1]
sbcs @a[2],@a[2],@b[2]
sbcs @a[3],@a[3],@b[3]
sbcs @a[4],@a[4],@b[4]
sbcs @a[5],@a[5],@b[5]
sbc $carry,xzr,xzr

and @b[0],@mod[0],$carry
and @b[1],@mod[1],$carry
adds @a[0],@a[0],@b[0]
and @b[2],@mod[2],$carry
adcs @a[1],@a[1],@b[1]
and @b[3],@mod[3],$carry
adcs @a[2],@a[2],@b[2]
and @b[4],@mod[4],$carry
adcs @a[3],@a[3],@b[3]
and @b[5],@mod[5],$carry
adcs @a[4],@a[4],@b[4]
adc @a[5],@a[5],@b[5]

ret
.size __sub_mod_384,.-__sub_mod_384

.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
sub_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]

bl __sub_mod_384

stp @a[0],@a[1],[$r_ptr]
add $a_ptr,$a_ptr,#48
stp @a[2],@a[3],[$r_ptr,#16]
add $b_ptr,$b_ptr,#48
stp @a[4],@a[5],[$r_ptr,#32]

bl __sub_mod_384
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sub_mod_384x,.-sub_mod_384x

.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
mul_by_1_plus_i_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]

ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
add $b_ptr,$a_ptr,#48

bl __sub_mod_384 // a->re - a->im

ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]

bl __add_mod_384_ab_are_loaded // a->re + a->im
ldr x30,[sp,#8]

stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x

.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
sgn0_pty_mod_384:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @a[4],@a[5],[$r_ptr,#32]

ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
ldp @mod[4],@mod[5],[$a_ptr,#32]

and $r_ptr,@a[0],#1
adds @a[0],@a[0],@a[0]
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc $carry,xzr,xzr

subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc $carry,$carry,xzr

mvn $carry,$carry
and $carry,$carry,#2
orr $r_ptr,$r_ptr,$carry

ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384

.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
sgn0_pty_mod_384x:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @a[4],@a[5],[$r_ptr,#32]

ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
ldp @mod[4],@mod[5],[$a_ptr,#32]

and $b_ptr,@a[0],#1
orr $n_ptr,@a[0],@a[1]
adds @a[0],@a[0],@a[0]
orr $n_ptr,$n_ptr,@a[2]
adcs @a[1],@a[1],@a[1]
orr $n_ptr,$n_ptr,@a[3]
adcs @a[2],@a[2],@a[2]
orr $n_ptr,$n_ptr,@a[4]
adcs @a[3],@a[3],@a[3]
orr $n_ptr,$n_ptr,@a[5]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc @b[0],xzr,xzr

subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc @b[0],@b[0],xzr

ldp @a[0],@a[1],[$r_ptr,#48]
ldp @a[2],@a[3],[$r_ptr,#64]
ldp @a[4],@a[5],[$r_ptr,#80]

mvn @b[0],@b[0]
and @b[0],@b[0],#2
orr $b_ptr,$b_ptr,@b[0]

and $r_ptr,@a[0],#1
orr $a_ptr,@a[0],@a[1]
adds @a[0],@a[0],@a[0]
orr $a_ptr,$a_ptr,@a[2]
adcs @a[1],@a[1],@a[1]
orr $a_ptr,$a_ptr,@a[3]
adcs @a[2],@a[2],@a[2]
orr $a_ptr,$a_ptr,@a[4]
adcs @a[3],@a[3],@a[3]
orr $a_ptr,$a_ptr,@a[5]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc @b[0],xzr,xzr

subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc @b[0],@b[0],xzr

mvn @b[0],@b[0]
and @b[0],@b[0],#2
orr $r_ptr,$r_ptr,@b[0]

cmp $n_ptr,#0
csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re)

cmp $a_ptr,#0
csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)

and $n_ptr,$n_ptr,#1
and $a_ptr,$a_ptr,#2
orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity

ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
___
if (1) {
sub vec_select {
my $sz = shift;
my @v=map("v$_",(0..5,16..21));

$code.=<<___;
.globl vec_select_$sz
.hidden vec_select_$sz
.type vec_select_$sz,%function
.align 5
vec_select_$sz:
dup v6.2d, $n_ptr
ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
cmeq v6.2d, v6.2d, #0
ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
___
for($i=0; $i<$sz-48; $i+=48) {
$code.=<<___;
bit @v[0].16b, @v[3].16b, v6.16b
ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
bit @v[1].16b, @v[4].16b, v6.16b
ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
bit @v[2].16b, @v[5].16b, v6.16b
st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
___
@v = @v[6..11,0..5];
}
$code.=<<___;
bit @v[0].16b, @v[3].16b, v6.16b
bit @v[1].16b, @v[4].16b, v6.16b
bit @v[2].16b, @v[5].16b, v6.16b
st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
ret
.size vec_select_$sz,.-vec_select_$sz
___
}
vec_select(48);
vec_select(96);
vec_select(192);
vec_select(144);
vec_select(288);
}

{
my ($inp, $end, $step) = map("x$_", (0..2));

$code.=<<___;
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
vec_prefetch:
add $end, $end, $inp
sub $end, $end, #1
mov $step, #64
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
prfm pldl1keep, [$inp]
ret
.size vec_prefetch,.-vec_prefetch
___
}

print $code;

close STDOUT;
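
The halving helper __rshift_mod_384 above is worth restating: to divide by two modulo an odd n, an odd value is first made even by adding n (sbfx turns bit 0 of a into an all-ones or all-zeros mask that gates the modulus), and the 385-bit result, carry included, is shifted right by one with the extr chain. Since a < n implies a + n < 2n, the shifted result stays below n. A reference model in Perl, assuming Math::BigInt and an illustrative helper name (not part of the commit):

use strict;
use Math::BigInt;

# One __rshift_mod_384 step: r = a / 2 mod n, for odd n and 0 <= a < n.
sub halve_mod_ref {
    my ($a, $n) = @_;
    my $t = $a->copy;
    $t->badd($n) if $t->is_odd();   # odd a: a + n is even and below 2n
    return $t->brsft(1);            # exact division by two
}
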
1430	blst/asm/add_mod_384-x86_64.pl	(new executable file)
(diff suppressed because it is too large)
260	blst/asm/add_mod_384x384-x86_64.pl	(new executable file)
@@ -0,0 +1,260 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";

# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";

# common accumulator layout
@acc=map("%r$_",(8..15));

############################################################ 384x384 add/sub
# Double-width addition/subtraction modulo n<<384, as opposite to
# naively expected modulo n*n. It works because n<<384 is the actual
# input boundary condition for Montgomery reduction, not n*n.
# Just in case, this is duplicated, but only one module is
# supposed to be linked...
{
my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected
# except for $n_ptr and $r_ptr
$code.=<<___;
.text

.type __add_mod_384x384,\@abi-omnipotent
.align 32
__add_mod_384x384:
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov 8*4($a_ptr), @acc[4]
mov 8*5($a_ptr), @acc[5]
mov 8*6($a_ptr), @acc[6]

add 8*0($b_org), @acc[0]
mov 8*7($a_ptr), @acc[7]
adc 8*1($b_org), @acc[1]
mov 8*8($a_ptr), @acc[8]
adc 8*2($b_org), @acc[2]
mov 8*9($a_ptr), @acc[9]
adc 8*3($b_org), @acc[3]
mov 8*10($a_ptr), @acc[10]
adc 8*4($b_org), @acc[4]
mov 8*11($a_ptr), @acc[11]
adc 8*5($b_org), @acc[5]
mov @acc[0], 8*0($r_ptr)
adc 8*6($b_org), @acc[6]
mov @acc[1], 8*1($r_ptr)
adc 8*7($b_org), @acc[7]
mov @acc[2], 8*2($r_ptr)
adc 8*8($b_org), @acc[8]
mov @acc[4], 8*4($r_ptr)
mov @acc[6], @acc[0]
adc 8*9($b_org), @acc[9]
mov @acc[3], 8*3($r_ptr)
mov @acc[7], @acc[1]
adc 8*10($b_org), @acc[10]
mov @acc[5], 8*5($r_ptr)
mov @acc[8], @acc[2]
adc 8*11($b_org), @acc[11]
mov @acc[9], @acc[3]
sbb $b_org, $b_org

sub 8*0($n_ptr), @acc[6]
sbb 8*1($n_ptr), @acc[7]
mov @acc[10], @acc[4]
sbb 8*2($n_ptr), @acc[8]
sbb 8*3($n_ptr), @acc[9]
sbb 8*4($n_ptr), @acc[10]
mov @acc[11], @acc[5]
sbb 8*5($n_ptr), @acc[11]
sbb \$0, $b_org

cmovc @acc[0], @acc[6]
cmovc @acc[1], @acc[7]
cmovc @acc[2], @acc[8]
mov @acc[6], 8*6($r_ptr)
cmovc @acc[3], @acc[9]
mov @acc[7], 8*7($r_ptr)
cmovc @acc[4], @acc[10]
mov @acc[8], 8*8($r_ptr)
cmovc @acc[5], @acc[11]
mov @acc[9], 8*9($r_ptr)
mov @acc[10], 8*10($r_ptr)
mov @acc[11], 8*11($r_ptr)

ret
.size __add_mod_384x384,.-__add_mod_384x384

.type __sub_mod_384x384,\@abi-omnipotent
.align 32
__sub_mod_384x384:
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov 8*4($a_ptr), @acc[4]
mov 8*5($a_ptr), @acc[5]
mov 8*6($a_ptr), @acc[6]

sub 8*0($b_org), @acc[0]
mov 8*7($a_ptr), @acc[7]
sbb 8*1($b_org), @acc[1]
mov 8*8($a_ptr), @acc[8]
sbb 8*2($b_org), @acc[2]
mov 8*9($a_ptr), @acc[9]
sbb 8*3($b_org), @acc[3]
mov 8*10($a_ptr), @acc[10]
sbb 8*4($b_org), @acc[4]
mov 8*11($a_ptr), @acc[11]
sbb 8*5($b_org), @acc[5]
mov @acc[0], 8*0($r_ptr)
sbb 8*6($b_org), @acc[6]
mov 8*0($n_ptr), @acc[0]
mov @acc[1], 8*1($r_ptr)
sbb 8*7($b_org), @acc[7]
mov 8*1($n_ptr), @acc[1]
mov @acc[2], 8*2($r_ptr)
sbb 8*8($b_org), @acc[8]
mov 8*2($n_ptr), @acc[2]
mov @acc[3], 8*3($r_ptr)
sbb 8*9($b_org), @acc[9]
mov 8*3($n_ptr), @acc[3]
mov @acc[4], 8*4($r_ptr)
sbb 8*10($b_org), @acc[10]
mov 8*4($n_ptr), @acc[4]
mov @acc[5], 8*5($r_ptr)
sbb 8*11($b_org), @acc[11]
mov 8*5($n_ptr), @acc[5]
sbb $b_org, $b_org

and $b_org, @acc[0]
and $b_org, @acc[1]
and $b_org, @acc[2]
and $b_org, @acc[3]
and $b_org, @acc[4]
and $b_org, @acc[5]

add @acc[0], @acc[6]
adc @acc[1], @acc[7]
mov @acc[6], 8*6($r_ptr)
adc @acc[2], @acc[8]
mov @acc[7], 8*7($r_ptr)
adc @acc[3], @acc[9]
mov @acc[8], 8*8($r_ptr)
adc @acc[4], @acc[10]
mov @acc[9], 8*9($r_ptr)
adc @acc[5], @acc[11]
mov @acc[10], 8*10($r_ptr)
mov @acc[11], 8*11($r_ptr)

ret
.size __sub_mod_384x384,.-__sub_mod_384x384

.globl add_mod_384x384
.hidden add_mod_384x384
.type add_mod_384x384,\@function,4,"unwind"
.align 32
add_mod_384x384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

call __add_mod_384x384

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size add_mod_384x384,.-add_mod_384x384

.globl sub_mod_384x384
.hidden sub_mod_384x384
.type sub_mod_384x384,\@function,4,"unwind"
.align 32
sub_mod_384x384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

call __sub_mod_384x384

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sub_mod_384x384,.-sub_mod_384x384
___
}

print $code;
close STDOUT;
381
blst/asm/arm-xlate.pl
Executable file
@@ -0,0 +1,381 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ARM assembler distiller/adapter by \@dot-asm.

use strict;

################################################################
# Recognized "flavour"-s are:
#
# linux[32|64] GNU assembler, effectively pass-through
# ios[32|64] global symbols' decorations, PIC tweaks, etc.
# win[32|64] Visual Studio armasm-specific directives
# coff[32|64] e.g. clang --target=arm-windows ...
#
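# Usage sketch (editorial; the output file name is illustrative): the
# *-armv8.pl generators elsewhere in this commit pipe their output
# through this script, which is equivalent to e.g.
#
#   perl ct_inverse_mod_256-armv8.pl linux64 ct_inverse_mod_256-armv8.S
#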
my $flavour = shift;
$flavour = "linux" if (!$flavour or $flavour eq "void");

my $output = shift;
open STDOUT,">$output" or die "can't open $output: $!";

my %GLOBALS;
my $dotinlocallabels = ($flavour !~ /ios/) ? 1 : 0;
my $in_proc; # used with 'windows' flavour

################################################################
# directives which need special treatment on different platforms
################################################################
my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch
my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu

my $rodata = sub {
SWITCH: for ($flavour) {
/linux/ && return ".section\t.rodata";
/ios/ && return ".section\t__TEXT,__const";
/coff/ && return ".section\t.rdata,\"dr\"";
/win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8";
last;
}
};

my $hidden = sub {
if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); }
} if ($flavour !~ /linux/);

my $comm = sub {
my @args = split(/,\s*/,shift);
my $name = @args[0];
my $global = \$GLOBALS{$name};
my $ret;

if ($flavour =~ /ios32/) {
$ret = ".comm\t_$name,@args[1]\n";
$ret .= ".non_lazy_symbol_pointer\n";
$ret .= "$name:\n";
$ret .= ".indirect_symbol\t_$name\n";
$ret .= ".long\t0\n";
$ret .= ".previous";
$name = "_$name";
} elsif ($flavour =~ /win/) {
$ret = "\tCOMMON\t|$name|,@args[1]";
} elsif ($flavour =~ /coff/) {
$ret = ".comm\t$name,@args[1]";
} else {
$ret = ".comm\t".join(',',@args);
}

$$global = $name;
$ret;
};

my $globl = sub {
my $name = shift;
my $global = \$GLOBALS{$name};
my $ret;

SWITCH: for ($flavour) {
/ios/ && do { $name = "_$name"; last; };
/win/ && do { $ret = ""; last; };
}

$ret = ".globl $name" if (!defined($ret));
$$global = $name;
$ret;
};
my $global = $globl;

my $extern = sub {
&$globl(@_);
if ($flavour =~ /win/) {
return "\tEXTERN\t@_";
}
return; # return nothing
};

my $type = sub {
my $arg = join(',',@_);
my $ret;

SWITCH: for ($flavour) {
/ios32/ && do { if ($arg =~ /(\w+),\s*%function/) {
$ret = "#ifdef __thumb2__\n" .
".thumb_func $1\n" .
"#endif";
}
last;
};
/win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) {
my $type = "[DATA]";
if ($2 eq "function") {
$in_proc = $1;
$type = "[FUNC]";
}
$ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type"
: "";
}
last;
};
/coff/ && do { if ($arg =~ /(\w+),\s*%function/) {
$ret = ".def $1;\n".
".type 32;\n".
".endef";
}
last;
};
}
return $ret;
} if ($flavour !~ /linux/);

my $size = sub {
if ($in_proc && $flavour =~ /win/) {
$in_proc = undef;
return "\tENDP";
}
} if ($flavour !~ /linux/);

my $inst = sub {
if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); }
else { ".long\t".join(',',@_); }
} if ($flavour !~ /linux/);

my $asciz = sub {
my $line = join(",",@_);
if ($line =~ /^"(.*)"$/)
{ if ($flavour =~ /win/) {
"\tDCB\t$line,0\n\tALIGN\t4";
} else {
".byte " . join(",",unpack("C*",$1),0) . "\n.align 2";
}
} else { ""; }
};

my $align = sub {
"\tALIGN\t".2**@_[0];
} if ($flavour =~ /win/);
$align = sub {
".p2align\t".@_[0];
} if ($flavour =~ /coff/);

my $byte = sub {
"\tDCB\t".join(',',@_);
} if ($flavour =~ /win/);

my $short = sub {
"\tDCWU\t".join(',',@_);
} if ($flavour =~ /win/);

my $word = sub {
"\tDCDU\t".join(',',@_);
} if ($flavour =~ /win/);

my $long = $word if ($flavour =~ /win/);

my $quad = sub {
"\tDCQU\t".join(',',@_);
} if ($flavour =~ /win/);

my $skip = sub {
"\tSPACE\t".shift;
} if ($flavour =~ /win/);

my $code = sub {
"\tCODE@_[0]";
} if ($flavour =~ /win/);

my $thumb = sub { # .thumb should appear prior .text in source
"# define ARM THUMB\n" .
"\tTHUMB";
} if ($flavour =~ /win/);

my $text = sub {
"\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM");
} if ($flavour =~ /win/);

my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax

my $rva = sub {
# .rva directive comes in handy only on 32-bit Windows, i.e. it can
# be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections.
# However! Corresponding compilers don't seem to bet on PIC, which
# raises the question why would assembler programmer have to jump
# through the hoops? But just in case, it would go as following:
#
# ldr r1,.LOPENSSL_armcap
# ldr r2,.LOPENSSL_armcap+4
# adr r0,.LOPENSSL_armcap
# bic r1,r1,#1 ; de-thumb-ify link.exe's ideas
# sub r0,r0,r1 ; r0 is image base now
# ldr r0,[r0,r2]
# ...
#.LOPENSSL_armcap:
# .rva .LOPENSSL_armcap ; self-reference
# .rva OPENSSL_armcap_P ; real target
#
# Non-position-independent [and ISA-neutral] alternative is so much
# simpler:
#
# ldr r0,.LOPENSSL_armcap
# ldr r0,[r0]
# ...
#.LOPENSSL_armcap:
# .long OPENSSL_armcap_P
#
"\tDCDU\t@_[0]\n\tRELOC\t2"
} if ($flavour =~ /win(?!64)/);

################################################################
# some broken instructions in Visual Studio armasm[64]...

my $it = sub {} if ($flavour =~ /win32/); # omit 'it'

my $ext = sub {
"\text8\t".join(',',@_);
} if ($flavour =~ /win64/);

my $csel = sub {
my ($args,$comment) = split(m|\s*//|,shift);
my @regs = split(m|,\s*|,$args);
my $cond = pop(@regs);

"\tcsel$cond\t".join(',',@regs);
} if ($flavour =~ /win64/);

my $csetm = sub {
my ($args,$comment) = split(m|\s*//|,shift);
my @regs = split(m|,\s*|,$args);
my $cond = pop(@regs);

"\tcsetm$cond\t".join(',',@regs);
} if ($flavour =~ /win64/);

# ... then conditional branch instructions are also broken, but
# maintaining all the variants is tedious, so I kludge-fix it
# elsewhere...
################################################################
my $adrp = sub {
my ($args,$comment) = split(m|\s*//|,shift);
"\tadrp\t$args\@PAGE";
} if ($flavour =~ /ios64/);

my $paciasp = sub {
($flavour =~ /linux/) ? "\t.inst\t0xd503233f"
: &$inst(0xd503233f);
};

my $autiasp = sub {
($flavour =~ /linux/) ? "\t.inst\t0xd50323bf"
: &$inst(0xd50323bf);
};

sub range {
my ($r,$sfx,$start,$end) = @_;

join(",",map("$r$_$sfx",($start..$end)));
}
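
# For example (editorial note): expand_line() further down relies on
# range() to turn a register-list shorthand such as "{v0.16b-v3.16b}"
# into "{v0.16b,v1.16b,v2.16b,v3.16b}".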

sub expand_line {
my $line = shift;
my @ret = ();

pos($line)=0;

while ($line =~ m/\G[^@\/\{\"]*/g) {
if ($line =~ m/\G(@|\/\/|$)/gc) {
last;
}
elsif ($line =~ m/\G\{/gc) {
my $saved_pos = pos($line);
$line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
pos($line) = $saved_pos;
$line =~ m/\G[^\}]*\}/g;
}
elsif ($line =~ m/\G\"/gc) {
$line =~ m/\G[^\"]*\"/g;
}
}

$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;

if ($flavour =~ /win/) {
# adjust alignment hints, "[rN,:32]" -> "[rN@32]"
$line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/;
# adjust local labels, ".Lwhatever" -> "|$Lwhatever|"
$line =~ s/\.(L\w{2,})/|\$$1|/g;
# omit "#:lo12:" on win64
$line =~ s/#:lo12://;
} elsif ($flavour =~ /coff(?!64)/) {
$line =~ s/\.L(\w{2,})/(\$ML$1)/g;
} elsif ($flavour =~ /ios64/) {
$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
}

return $line;
}

while(my $line=<>) {

# fix up assembler-specific commentary delimiter
$line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/);

if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; }

$line =~ s|/\*.*\*/||; # get rid of C-style comments...
$line =~ s|^\s+||; # ... and skip white spaces in beginning...
$line =~ s|\s+$||; # ... and at the end

{
$line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel
$line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels);
}

{
$line =~ s|(^[\.\w]+)\:\s*||;
my $label = $1;
if ($label) {
$label = ($GLOBALS{$label} or $label);
if ($flavour =~ /win/) {
$label =~ s|^\.L(?=\w)|\$L|;
printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : "");
} else {
$label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/);
printf "%s:", $label;
}
}
}

if ($line !~ m/^[#@;]/) {
$line =~ s|^\s*(\.?)(\S+)\s*||;
my $c = $1; $c = "\t" if ($c eq "");
my $mnemonic = $2;
my $opcode;
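# (editorial note: a mnemonic or directive resolves to one of the
# handler closures defined above, e.g. ".globl" -> $globl and
# "paciasp" -> $paciasp; anything unhandled falls through as-is)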
if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
$opcode = eval("\$$1_$2");
} else {
$opcode = eval("\$$mnemonic");
}

my $arg=expand_line($line);

if (ref($opcode) eq 'CODE') {
$line = &$opcode($arg);
} elsif ($mnemonic) {
if ($flavour =~ /win64/) {
# "b.cond" -> "bcond", kludge-fix:-(
$mnemonic =~ s/^b\.([a-z]{2}$)/b$1/;
}
$line = $c.$mnemonic;
$line.= "\t$arg" if ($arg ne "");
}
}

print $line if ($line);
print "\n";
}

print "\tEND\n" if ($flavour =~ /win/);

close STDOUT;
586
blst/asm/ct_inverse_mod_256-armv8.pl
Executable file
@@ -0,0 +1,586 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 -
# on Cortex-A57.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 31
    mask = (1 << k) - 1

    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)

        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smul_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smul_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    mod <<= 512 - mod.bit_length() # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1<<512:
        v -= mod

    return v & (2**512 - 1) # to be reduced % mod
___
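#
# Editorial sanity-check sketch (hypothetical, not part of the original
# file): since the reference returns a value "to be reduced % mod", one
# would verify it in Python along the lines of
#
#   v = ct_inverse_mod_256(a, m) % m
#   assert a * v % m == 1
#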

$flavour = shift;
$output = shift;

if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}

my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(4..11));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17));
my $cnt = $n_ptr;
my @t = map("x$_",(19..26));
my ($a_lo, $b_lo) = @acc[3,7];

$frame = 16+2*512;

$code.=<<___;
.text

.globl ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
paciasp
stp x29, x30, [sp,#-80]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
sub sp, sp, #$frame

ldp @acc[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]

add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
str $out_ptr, [sp]

ldp @acc[4], @acc[5], [$n_ptr,#8*0]
ldp @acc[6], @acc[7], [$n_ptr,#8*2]

stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b|
stp @acc[6], @acc[7], [$in_ptr,#8*6]

////////////////////////////////////////// first iteration
bl .Lab_approximation_31_256_loaded

eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
str $f0,[$out_ptr,#8*8] // initialize |u| with |f0|

mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to dst |b|
bl __smul_256_n_shift_by_31
str $f0, [$out_ptr,#8*9] // initialize |v| with |f1|

////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256

eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|

mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31

ldr @acc[4], [$in_ptr,#8*8] // |u|
ldr @acc[5], [$in_ptr,#8*13] // |v|
madd @acc[0], $f_, @acc[4], xzr // |u|*|f0|
madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0|
str @acc[0], [$out_ptr,#8*4]
asr @acc[1], @acc[0], #63 // sign extension
stp @acc[1], @acc[1], [$out_ptr,#8*5]
stp @acc[1], @acc[1], [$out_ptr,#8*7]

madd @acc[0], $f0, @acc[4], xzr // |u|*|f1|
madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1|
str @acc[0], [$out_ptr,#8*9]
asr @acc[1], @acc[0], #63 // sign extension
stp @acc[1], @acc[1], [$out_ptr,#8*10]
stp @acc[1], @acc[1], [$out_ptr,#8*12]
___
for($i=2; $i<15; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256

eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|

mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31

add $out_ptr, $out_ptr, #8*4 // pointer to destination |u|
bl __smul_256x63
adc @t[3], @t[3], @t[4]
str @t[3], [$out_ptr,#8*4]

mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*5 // pointer to destination |v|
bl __smul_256x63
___
$code.=<<___ if ($i>7);
bl __smul_512x63_tail
___
$code.=<<___ if ($i<=7);
adc @t[3], @t[3], @t[4]
stp @t[3], @t[3], [$out_ptr,#8*4]
stp @t[3], @t[3], [$out_ptr,#8*6]
___
}
$code.=<<___;
////////////////////////////////////////// two[!] last iterations
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #47 // 31 + 512 % 31
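// (editorial note: 15 iterations of 31 bits plus this 47-bit tail
// account for all 512 bits, since 15*31 + 47 = 512)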
//bl __ab_approximation_62_256 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
ldr $b_lo, [$in_ptr,#8*4]
bl __inner_loop_62_256

mov $f_, $f1
mov $g_, $g1
ldr $out_ptr, [sp] // original out_ptr
bl __smul_256x63
bl __smul_512x63_tail
ldr x30, [x29,#8]

smulh @t[1], @acc[3], $g_ // figure out top-most limb
ldp @acc[4], @acc[5], [$nx_ptr,#8*0]
adc @t[4], @t[4], @t[6]
ldp @acc[6], @acc[7], [$nx_ptr,#8*2]

add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1
asr @t[0], @t[1], #63 // sign as mask

and @t[4], @acc[4], @t[0] // add mod<<256 conditionally
and @t[5], @acc[5], @t[0]
adds @acc[0], @acc[0], @t[4]
and @t[6], @acc[6], @t[0]
adcs @acc[1], @acc[1], @t[5]
and @t[7], @acc[7], @t[0]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @t[3], @t[7]
adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1

neg @t[0], @t[1]
orr @t[1], @t[1], @t[0] // excess bit or sign as mask
asr @t[0], @t[0], #63 // excess bit as mask

and @acc[4], @acc[4], @t[1] // mask |mod|
and @acc[5], @acc[5], @t[1]
and @acc[6], @acc[6], @t[1]
and @acc[7], @acc[7], @t[1]

eor @acc[4], @acc[4], @t[0] // conditionally negate |mod|
eor @acc[5], @acc[5], @t[0]
adds @acc[4], @acc[4], @t[0], lsr#63
eor @acc[6], @acc[6], @t[0]
adcs @acc[5], @acc[5], xzr
eor @acc[7], @acc[7], @t[0]
adcs @acc[6], @acc[6], xzr
adc @acc[7], @acc[7], xzr

adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
stp @acc[0], @acc[1], [$out_ptr,#8*4]
adc @acc[3], @acc[3], @acc[7]
stp @acc[2], @acc[3], [$out_ptr,#8*6]

add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldr x29, [sp],#80
autiasp
ret
.size ct_inverse_mod_256,.-ct_inverse_mod_256

////////////////////////////////////////////////////////////////////////
.type __smul_256x63, %function
.align 5
__smul_256x63:
___
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[4..7] if ($j);
my $k = 8*8+8*5*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldr @t[3+$j], [$in_ptr,#8*4+$k]

eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @t[3+$j], @t[3+$j], $f1
umulh @t[0], @acc[0], $f_
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $f_
adcs @t[3+$j], @t[3+$j], xzr
umulh @t[2], @acc[2], $f_
___
$code.=<<___ if ($j!=0);
adc $g1, xzr, xzr // used in __smul_512x63_tail
___
$code.=<<___;
mul @acc[0], @acc[0], $f_
cmp $f_, #0
mul @acc[1], @acc[1], $f_
csel @t[3+$j], @t[3+$j], xzr, ne
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @t[5+$j], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
adcs @t[5+$j], @t[5+$j], @t[2]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr

adds @acc[0], @acc[0], @acc[4]
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @t[5], @t[5], @t[6]
stp @acc[2], @t[5], [$out_ptr,#8*2]

ret
.size __smul_256x63,.-__smul_256x63

.type __smul_512x63_tail, %function
.align 5
__smul_512x63_tail:
umulh @t[5], @acc[3], $f_
ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v|
adc @t[7], @t[7], xzr
ldr @acc[3], [$in_ptr,#8*20]
and @t[3], @t[3], $f_

umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain

sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain
asr @t[6], @t[5], #63

eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v|
eor @acc[2], @acc[2], $f1
adds @acc[1], @acc[1], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
umulh @t[0], @t[4], $g_
adc @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $g_
add @acc[7], @acc[7], @t[7]
umulh @t[2], @acc[2], $g_

mul @acc[0], @t[4], $g_
mul @acc[1], @acc[1], $g_
adds @acc[0], @acc[0], @acc[7]
mul @acc[2], @acc[2], $g_
adcs @acc[1], @acc[1], @t[0]
mul @t[3], @acc[3], $g_
adcs @acc[2], @acc[2], @t[1]
adcs @t[3], @t[3], @t[2]
adc @t[4], xzr, xzr // used in the final step

adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*4]
adcs @t[3], @t[3], @t[6] // carry is used in the final step
stp @acc[2], @t[3], [$out_ptr,#8*6]

ret
.size __smul_512x63_tail,.-__smul_512x63_tail

.type __smul_256_n_shift_by_31, %function
.align 5
__smul_256_n_shift_by_31:
___
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[4..7] if ($j);
my $k = 8*4*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|)

eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|)
sub @t[6], @t[6], @t[5]
eor @acc[1], @acc[1], @t[5]
adds @acc[0], @acc[0], @t[5], lsr#63
eor @acc[2], @acc[2], @t[5]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[5]
umulh @t[0], @acc[0], @t[6]
adcs @acc[2], @acc[2], xzr
umulh @t[1], @acc[1], @t[6]
adc @acc[3], @acc[3], xzr
umulh @t[2], @acc[2], @t[6]
and @t[5], @t[5], @t[6]
umulh @t[3+$j], @acc[3], @t[6]
neg @t[5], @t[5]

mul @acc[0], @acc[0], @t[6]
mul @acc[1], @acc[1], @t[6]
mul @acc[2], @acc[2], @t[6]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], @t[1]
adcs @acc[3], @acc[3], @t[2]
adc @t[3+$j], @t[3+$j], @t[5]
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[4]
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
adcs @acc[3], @acc[3], @acc[7]
adc @acc[4], @t[3], @t[4]

extr @acc[0], @acc[1], @acc[0], #31
extr @acc[1], @acc[2], @acc[1], #31
extr @acc[2], @acc[3], @acc[2], #31
asr @t[4], @acc[4], #63 // result's sign as mask
extr @acc[3], @acc[4], @acc[3], #31

eor @acc[0], @acc[0], @t[4] // ensure the result is positive
eor @acc[1], @acc[1], @t[4]
adds @acc[0], @acc[0], @t[4], lsr#63
eor @acc[2], @acc[2], @t[4]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[4]
adcs @acc[2], @acc[2], xzr
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adc @acc[3], @acc[3], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]

eor $f0, $f0, @t[4] // adjust |f/g| accordingly
eor $g0, $g0, @t[4]
sub $f0, $f0, @t[4]
sub $g0, $g0, @t[4]

ret
.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
___

{
my @a = @acc[0..3];
my @b = @acc[4..7];
my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]);

$code.=<<___;
.type __ab_approximation_31_256, %function
.align 4
__ab_approximation_31_256:
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*6]
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*4]

.Lab_approximation_31_256_loaded:
orr @t[0], @a[3], @b[3] // check top-most limbs, ...
cmp @t[0], #0
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
csel @a[2], @a[2], @a[1], ne
orr @t[0], @a[3], @b[3] // and ones before top-most, ...
csel @b[2], @b[2], @b[1], ne

cmp @t[0], #0
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
csel @a[2], @a[2], @a[0], ne
orr @t[0], @a[3], @b[3] // and one more, ...
csel @b[2], @b[2], @b[0], ne

clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
neg @t[1], @t[0]

lslv @a[3], @a[3], @t[0] // align high limbs to the left
lslv @b[3], @b[3], @t[0]
lsrv @a[2], @a[2], @t[1]
lsrv @b[2], @b[2], @t[1]
and @a[2], @a[2], @t[1], asr#6
and @b[2], @b[2], @t[1], asr#6
orr $a_lo, @a[3], @a[2]
orr $b_lo, @b[3], @b[2]

bfxil $a_lo, @a[0], #0, #31
bfxil $b_lo, @b[0], #0, #31

b __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256

.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
mov $cnt, #31
mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov $bias,#0x7FFFFFFF7FFFFFFF
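// (editorial note: |f| lives in the lower and |g| in the upper 32 bits
// of $fg0/$fg1, each biased by 0x7FFFFFFF so the halves stay
// non-negative and a single add/sub updates both without the lower
// half borrowing into the upper; the ubfx/sub sequence past the loop
// strips the bias again)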

.Loop_31_256:
sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
and @t[0], $b_lo, @t[3]
sub @t[1], $b_lo, $a_lo // |b_|-|a_|
subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $fg1
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1|
csel $fg0, $fg0, @t[0], hs
lsr $a_lo, $a_lo, #1
and @t[0], $fg1, @t[3]
and @t[1], $bias, @t[3]
sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add $fg1, $fg1, $fg1 // |f1|<<=1
add $fg0, $fg0, @t[1]
sub $fg1, $fg1, $bias
cbnz $cnt, .Loop_31_256

mov $bias, #0x7FFFFFFF
ubfx $f0, $fg0, #0, #32
ubfx $g0, $fg0, #32, #32
ubfx $f1, $fg1, #0, #32
ubfx $g1, $fg1, #32, #32
sub $f0, $f0, $bias // remove bias
sub $g0, $g0, $bias
sub $f1, $f1, $bias
sub $g1, $g1, $bias

ret
.size __inner_loop_31_256,.-__inner_loop_31_256

.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1

.Loop_62_256:
sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
and @t[0], $b_lo, @t[3]
sub @t[1], $b_lo, $a_lo // |b_|-|a_|
subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
mov @t[1], $g0
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
lsr $a_lo, $a_lo, #1
and @t[0], $f1, @t[3]
and @t[1], $g1, @t[3]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62_256

ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}

foreach(split("\n",$code)) {
s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
print $_,"\n";
}
close STDOUT;
837
blst/asm/ct_inverse_mod_256-x86_64.pl
Executable file
@@ -0,0 +1,837 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 31
    mask = (1 << k) - 1

    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)

        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smulq_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smulq_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    mod <<= 512 - mod.bit_length() # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1<<512:
        v -= mod

    return v & (2**512 - 1) # to be reduced % mod
___

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";

my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15));
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edx";

$frame = 8*6+2*512;

$code.=<<___;
.text

.globl ct_inverse_mod_256
.type ct_inverse_mod_256,\@function,4,"unwind"
.align 32
ct_inverse_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue

lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)

mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]

mov 8*0($n_ptr), @acc[4] # load modulus
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
mov 8*3($n_ptr), @acc[7]

mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)

mov @acc[4], 8*4(%rax) # copy modulus to |b|
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax)
mov @acc[7], 8*7(%rax)
mov %rax, $in_ptr

################################# first iteration
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)

mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
#mov $f0, 8*0(%rsp) # corrected |f0|
#mov $g0, 8*1(%rsp) # corrected |g0|
mov $f0, 8*8($out_ptr) # initialize |u| with |f0|

mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov $f0, 8*9($out_ptr) # initialize |v| with |f1|

################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)

mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|

mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|

mov 8*8($in_ptr), @acc[0] # |u|
mov 8*13($in_ptr), @acc[4] # |v|
mov @acc[0], @acc[1]
imulq 8*0(%rsp), @acc[0] # |u|*|f0|
mov @acc[4], @acc[5]
imulq 8*1(%rsp), @acc[4] # |v|*|g0|
add @acc[4], @acc[0]
mov @acc[0], 8*4($out_ptr) # destination |u|
sar \$63, @acc[0] # sign extension
mov @acc[0], 8*5($out_ptr)
mov @acc[0], 8*6($out_ptr)
mov @acc[0], 8*7($out_ptr)
mov @acc[0], 8*8($out_ptr)
lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor

imulq $f0, @acc[1] # |u|*|f1|
imulq $g0, @acc[5] # |v|*|g1|
add @acc[5], @acc[1]
mov @acc[1], 8*9($out_ptr) # destination |v|
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
mov @acc[1], 8*12($out_ptr)
mov @acc[1], 8*13($out_ptr)
___
for($i=2; $i<15; $i++) {
my $smul_512x63 = $i>8 ? "__smulq_512x63"
: "__smulq_256x63";
$code.=<<___;
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)

mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|

mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
mov $f0, 8*2(%rsp) # corrected |f1|
mov $g0, 8*3(%rsp) # corrected |g1|

mov 8*0(%rsp), $f0 # |f0|
mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*4($out_ptr), $out_ptr # pointer to destination |u|
call __smulq_256x63

mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*5($out_ptr),$out_ptr # pointer to destination |v|
call $smul_512x63
___
$code.=<<___ if ($i==8);
sar \$63, %rbp # sign extension
mov %rbp, 8*5($out_ptr)
mov %rbp, 8*6($out_ptr)
mov %rbp, 8*7($out_ptr)
___
}
$code.=<<___;
################################# two[!] last iterations in one go
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$47, $cnt # 31 + 512 % 31
#call __ab_approximation_31 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
#xor @acc[1], @acc[1] # |a_hi|
mov 8*4($in_ptr), @acc[2] # |b_lo|
#xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
#mov $f1, 8*2(%rsp)
#mov $g1, 8*3(%rsp)

#mov 8*0(%rsp), $f0 # |f0|
#mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulq_256x63

#mov 8*2(%rsp), $f0 # |f1|
#mov 8*3(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original |out_ptr|
call __smulq_512x63
adc %rbp, %rdx # the excess limb of the result

mov 8*5(%rsp), $in_ptr # original |nx_ptr|
mov %rdx, %rax
sar \$63, %rdx # result's sign as mask

mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx

add @acc[0], @acc[4] # conditionally add |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
adc \$0, %rax

mov %rax, %rdx
neg %rax
or %rax, %rdx # excess bit or sign as mask
sar \$63, %rax # excess bit as mask

mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx

xor %rax, @acc[0] # conditionally negate |modulus|
xor %rcx, %rcx
xor %rax, @acc[1]
sub %rax, %rcx
xor %rax, @acc[2]
xor %rax, %rdx
add %rcx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, %rdx

add @acc[0], @acc[4] # final adjustment for |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]

mov @acc[4], 8*4($out_ptr) # store absolute value
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)

lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_inverse_mod_256,.-ct_inverse_mod_256
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
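#
# The conditional-negation idiom these subroutines rely on can be
# modeled in a few lines of Python (an editorial sketch, not part of
# the original file):
#
#   mask = -1 if f < 0 else 0    # in asm: sar/asr by 63
#   f = (f ^ mask) - mask        # |f|
#   u = (u ^ mask) - mask        # flip |u|'s sign too, so u*f is unchanged
#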
$code.=<<___;
.type __smulq_512x63,\@abi-omnipotent
.align 32
__smulq_512x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), %rbp # sign limb

mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit

xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx

xor $f0, @acc[0] # conditionally negate |u|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, %rbp
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, %rbp

mulq %rbx # |u|*|f0|
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov @acc[$i], 8*$i($out_ptr)
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, %rbp
neg %rbp
mulq %rbx
add %rax, @acc[3]
adc %rdx, %rbp
mov @acc[3], 8*3($out_ptr)

mov 8*5($in_ptr), @acc[0] # load |v|
mov 8*6($in_ptr), @acc[1]
mov 8*7($in_ptr), @acc[2]
mov 8*8($in_ptr), @acc[3]
mov 8*9($in_ptr), @acc[4]
mov 8*10($in_ptr), @acc[5]
mov 8*11($in_ptr), @acc[6]
mov 8*12($in_ptr), @acc[7]

mov $g0, $f0
sar \$63, $f0 # |g0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |g0|'s sign as bit

xor $f0, $g0 # conditionally negate |g0|
add %rax, $g0

xor $f0, @acc[0] # conditionally negate |v|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
xor $f0, @acc[6]
xor $f0, @acc[7]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]

mulq $g0
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<7; $i++) {
$code.=<<___;
mulq $g0
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
imulq $g0
add %rax, @acc[7]
adc \$0, %rdx # used in the final step

mov %rbp, %rbx
sar \$63, %rbp # sign extension

add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc %rbx, @acc[4]
adc %rbp, @acc[5]
adc %rbp, @acc[6]
adc %rbp, @acc[7]

mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)

ret
.size __smulq_512x63,.-__smulq_512x63

.type __smulq_256x63,\@abi-omnipotent
.align 32
__smulq_256x63:
___
for($j=0; $j<2; $j++) {
my $k = 8*5*$j;
my @acc=@acc; @acc=@acc[4..7] if($j);
my $top="%rbp"; $top=$g0 if($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), $top # sign/excess limb

mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)

xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx

xor $f0, @acc[0] # conditionally negate |u| (or |v|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, $top
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, $top

mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, $top
neg $top
mulq %rbx
add %rax, @acc[3]
adc %rdx, $top
___
$code.=<<___ if ($j==0);
mov $g0, $f0
___
}
$code.=<<___;
add @acc[4], @acc[0] # accumulate |u|*|f0|
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc %rcx, %rbp

mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov %rbp, 8*4($out_ptr)

ret
.size __smulq_256x63,.-__smulq_256x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
{
$code.=<<___;
.type __smulq_256_n_shift_by_31,\@abi-omnipotent
.align 32
__smulq_256_n_shift_by_31:
mov $f0, 8*0($out_ptr) # offload |f0|
mov $g0, 8*1($out_ptr) # offload |g0|
mov $f0, %rbp
___
for($j=0; $j<2; $j++) {
my $k = 8*4*$j;
my @acc=@acc; @acc=@acc[4..7] if ($j);
my $f0="%rbp"; $f0=$g0 if ($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]

mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)

xor $f0, %rbx # conditionally negate |f0| (or |g0|)
add %rax, %rbx

xor $f0, @acc[0] # conditionally negate |a| (or |b|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]

mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
and %rbx, $f0
neg $f0
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
mulq %rbx
add %rax, @acc[3]
adc %rdx, $f0
___
}
$code.=<<___;
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc $g0, %rbp

mov 8*0($out_ptr), $f0 # restore original |f0|
mov 8*1($out_ptr), $g0 # restore original |g0|

shrd \$31, @acc[1], @acc[0]
shrd \$31, @acc[2], @acc[1]
shrd \$31, @acc[3], @acc[2]
shrd \$31, %rbp, @acc[3]

sar \$63, %rbp # sign as mask
xor %rax, %rax
sub %rbp, %rax # sign as bit

xor %rbp, @acc[0] # conditionally negate the result
xor %rbp, @acc[1]
xor %rbp, @acc[2]
xor %rbp, @acc[3]
add %rax, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]

mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)

xor %rbp, $f0 # conditionally negate |f0|
xor %rbp, $g0 # conditionally negate |g0|
add %rax, $f0
add %rax, $g0

ret
.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31
___
}

{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);

$code.=<<___;
.type __ab_approximation_31_256,\@abi-omnipotent
.align 32
__ab_approximation_31_256:
mov 8*3($in_ptr), @a[2] # load |a| in reverse order
mov 8*7($in_ptr), @b[2] # load |b| in reverse order
mov 8*2($in_ptr), @a[1]
mov 8*6($in_ptr), @b[1]
mov 8*1($in_ptr), @a[0]
mov 8*5($in_ptr), @b[0]

mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*0($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*4($in_ptr), @b[0]

mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]

mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[2]
cmovz @b[0], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact

shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]

mov \$0x7FFFFFFF, %eax
and %rax, @a[0]
and %rax, @b[0]
not %rax
and %rax, @a[2]
and %rax, @b[2]
or @a[2], @a[0]
or @b[2], @b[0]

jmp __inner_loop_31_256

ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
___
}
$code.=<<___;
.type __inner_loop_31_256,\@abi-omnipotent
.align 32 # removing this alignment punishes Coffee Lake by up to 40%
__inner_loop_31_256: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
mov \$0x7FFFFFFF7FFFFFFF, $bias
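# (editorial note: same packed representation as on other platforms,
# |f| in the low and |g| in the high 32 bits, each 0x7FFFFFFF-biased;
# the add/sub below therefore updates a whole |f|,|g| pair at once)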

.Loop_31_256:
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
mov $fg0, $t2
mov $fg1, $t3
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1

sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0

test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1

shr \$1, $a_ # |a_|>>=1
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_31_256

shr \$32, $bias
mov %ecx, %edx # $fg0, $f0
mov ${fg1}d, ${f1}d
shr \$32, $g0
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1

ret
.size __inner_loop_31_256,.-__inner_loop_31_256

.type __inner_loop_62_256,\@abi-omnipotent
.align 32
__inner_loop_62_256:
mov $cnt, %r15d
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov $f0, $g1 # |g1|=1
mov $f0, %r14

.Loop_62_256:
xor $t0, $t0
test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t1
cmovnz $b_lo, $t0
sub $a_lo, $t1 # |b_|-|a_|
mov $a_lo, $t2
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t2, $b_lo # |b_| = |a_|
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shr \$1, $a_lo
test %r14, $t2 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, %r15d
jnz .Loop_62_256

ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}

print $code;
close STDOUT;
610
blst/asm/ct_inverse_mod_384-armv8.pl
Executable file
@@ -0,0 +1,610 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 62
    w = 64
    mask = (1 << w) - 1

    for i in range(0, 766 // k):
        # __ab_approximation_62
        n = max(a.bit_length(), b.bit_length())
        if n < 128:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_62
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smul_383_n_shift_by_62
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smul_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    if v < 0:
        v += mod << (768 - mod.bit_length())    # left aligned

    return v & (2**768 - 1) # to be reduced % mod
___
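The reference above never shifts |u| or |v| down, so after all 766 divsteps the result carries a factor of 2**766; assuming an odd modulus and gcd(inp, mod) == 1, v*inp should be congruent to 2**766 mod mod. That invariant is inferred from the algorithm, not stated in the source, so treat the following as a self-check sketch, runnable once the reference above is pasted into Python; the 127-bit Mersenne prime is a stand-in, the routine itself targets 383-bit moduli:

import random

def check_ct_inverse_mod_383(trials=100):
    mod = 2**127 - 1                  # stand-in prime modulus, so gcd(inp, mod) == 1
    for _ in range(trials):
        inp = random.randrange(1, mod)
        v = ct_inverse_mod_383(inp, mod)
        # inferred invariant: 766 divsteps, none of which shift |u|/|v|,
        # leave the inverse scaled by 2**766
        assert v * inp % mod == pow(2, 766, mod)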
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
|
||||
my @acc=map("x$_",(3..14));
|
||||
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21));
|
||||
my $cnt = $n_ptr;
|
||||
my @t = map("x$_",(22..28,2));
|
||||
my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11];
|
||||
|
||||
$frame = 16+2*512;
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.globl ct_inverse_mod_383
|
||||
.type ct_inverse_mod_383, %function
|
||||
.align 5
|
||||
ct_inverse_mod_383:
|
||||
paciasp
|
||||
stp x29, x30, [sp,#-128]!
|
||||
add x29, sp, #0
|
||||
stp x19, x20, [sp,#16]
|
||||
stp x21, x22, [sp,#32]
|
||||
stp x23, x24, [sp,#48]
|
||||
stp x25, x26, [sp,#64]
|
||||
stp x27, x28, [sp,#80]
|
||||
sub sp, sp, #$frame
|
||||
|
||||
ldp @t[0], @acc[1], [$in_ptr,#8*0]
|
||||
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
|
||||
ldp @acc[4], @acc[5], [$in_ptr,#8*4]
|
||||
|
||||
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
|
||||
and $in_ptr, $in_ptr, #-512 // in the frame...
|
||||
stp $out_ptr, $nx_ptr, [sp]
|
||||
|
||||
ldp @acc[6], @acc[7], [$n_ptr,#8*0]
|
||||
ldp @acc[8], @acc[9], [$n_ptr,#8*2]
|
||||
ldp @acc[10], @acc[11], [$n_ptr,#8*4]
|
||||
|
||||
stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
|
||||
stp @acc[2], @acc[3], [$in_ptr,#8*2]
|
||||
stp @acc[4], @acc[5], [$in_ptr,#8*4]
|
||||
stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b|
|
||||
stp @acc[8], @acc[9], [$in_ptr,#8*8]
|
||||
stp @acc[10], @acc[11], [$in_ptr,#8*10]
|
||||
|
||||
////////////////////////////////////////// first iteration
|
||||
mov $cnt, #62
|
||||
bl .Lab_approximation_62_loaded
|
||||
|
||||
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
|
||||
bl __smul_383_n_shift_by_62
|
||||
str $f0,[$out_ptr,#8*12] // initialize |u| with |f0|
|
||||
|
||||
mov $f0, $f1 // |f1|
|
||||
mov $g0, $g1 // |g1|
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to dst |b|
|
||||
bl __smul_383_n_shift_by_62
|
||||
str $f0, [$out_ptr,#8*12] // initialize |v| with |f1|
|
||||
|
||||
////////////////////////////////////////// second iteration
|
||||
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
|
||||
mov $cnt, #62
|
||||
bl __ab_approximation_62
|
||||
|
||||
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
|
||||
bl __smul_383_n_shift_by_62
|
||||
mov $f_, $f0 // corrected |f0|
|
||||
mov $g_, $g0 // corrected |g0|
|
||||
|
||||
mov $f0, $f1 // |f1|
|
||||
mov $g0, $g1 // |g1|
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
|
||||
bl __smul_383_n_shift_by_62
|
||||
|
||||
ldr @acc[4], [$in_ptr,#8*12] // |u|
|
||||
ldr @acc[5], [$in_ptr,#8*18] // |v|
|
||||
mul @acc[0], $f_, @acc[4] // |u|*|f0|
|
||||
smulh @acc[1], $f_, @acc[4]
|
||||
mul @acc[2], $g_, @acc[5] // |v|*|g0|
|
||||
smulh @acc[3], $g_, @acc[5]
|
||||
adds @acc[0], @acc[0], @acc[2]
|
||||
adc @acc[1], @acc[1], @acc[3]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*6]
|
||||
asr @acc[2], @acc[1], #63 // sign extension
|
||||
stp @acc[2], @acc[2], [$out_ptr,#8*8]
|
||||
stp @acc[2], @acc[2], [$out_ptr,#8*10]
|
||||
|
||||
mul @acc[0], $f0, @acc[4] // |u|*|f1|
|
||||
smulh @acc[1], $f0, @acc[4]
|
||||
mul @acc[2], $g0, @acc[5] // |v|*|g1|
|
||||
smulh @acc[3], $g0, @acc[5]
|
||||
adds @acc[0], @acc[0], @acc[2]
|
||||
adc @acc[1], @acc[1], @acc[3]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*12]
|
||||
asr @acc[2], @acc[1], #63 // sign extension
|
||||
stp @acc[2], @acc[2], [$out_ptr,#8*14]
|
||||
stp @acc[2], @acc[2], [$out_ptr,#8*16]
|
||||
___
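The eor ...#256 lines above ping-pong between the two 256-byte halves of the 512-byte-aligned scratch area: each pass reads |a|b|u|v| from one half and writes the updated state into the other, so nothing is ever copied back. A sketch of the addressing trick, with a hypothetical base address:

base = 0x10000                 # hypothetical 512-byte-aligned scratch address
in_ptr = base
for i in range(12):
    out_ptr = in_ptr ^ 256     # the other half: eor $out_ptr, $in_ptr, #256
    # ... read |a|b|u|v| at in_ptr, write the updated state at out_ptr ...
    in_ptr ^= 256              # next pass consumes what was just written
assert in_ptr == base          # an even number of flips lands back at the start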
|
||||
for($i=2; $i<11; $i++) {
|
||||
$code.=<<___;
|
||||
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
|
||||
mov $cnt, #62
|
||||
bl __ab_approximation_62
|
||||
|
||||
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
|
||||
bl __smul_383_n_shift_by_62
|
||||
mov $f_, $f0 // corrected |f0|
|
||||
mov $g_, $g0 // corrected |g0|
|
||||
|
||||
mov $f0, $f1 // |f1|
|
||||
mov $g0, $g1 // |g1|
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
|
||||
bl __smul_383_n_shift_by_62
|
||||
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to destination |u|
|
||||
bl __smul_383x63
|
||||
|
||||
mov $f_, $f0 // corrected |f1|
|
||||
mov $g_, $g0 // corrected |g1|
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to destination |v|
|
||||
bl __smul_383x63
|
||||
___
|
||||
$code.=<<___ if ($i>5);
|
||||
bl __smul_767x63_tail
|
||||
___
|
||||
$code.=<<___ if ($i==5);
|
||||
asr @t[5], @t[5], #63 // sign extension
|
||||
stp @t[5], @t[5], [$out_ptr,#8*6]
|
||||
stp @t[5], @t[5], [$out_ptr,#8*8]
|
||||
stp @t[5], @t[5], [$out_ptr,#8*10]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
////////////////////////////////////////// iteration before last
|
||||
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
|
||||
mov $cnt, #62
|
||||
//bl __ab_approximation_62 // |a| and |b| are exact,
|
||||
ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load
|
||||
ldp $b_lo, $b_hi, [$in_ptr,#8*6]
|
||||
bl __inner_loop_62
|
||||
|
||||
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
|
||||
str $a_lo, [$out_ptr,#8*0]
|
||||
str $b_lo, [$out_ptr,#8*6]
|
||||
|
||||
mov $f_, $f0 // exact |f0|
|
||||
mov $g_, $g0 // exact |g0|
|
||||
mov $f0, $f1
|
||||
mov $g0, $g1
|
||||
add $out_ptr, $out_ptr, #8*12 // pointer to dst |u|
|
||||
bl __smul_383x63
|
||||
|
||||
mov $f_, $f0 // exact |f1|
|
||||
mov $g_, $g0 // exact |g1|
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to dst |v|
|
||||
bl __smul_383x63
|
||||
bl __smul_767x63_tail
|
||||
|
||||
////////////////////////////////////////// last iteration
|
||||
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
|
||||
mov $cnt, #22 // 766 % 62
|
||||
//bl __ab_approximation_62 // |a| and |b| are exact,
|
||||
ldr $a_lo, [$in_ptr,#8*0] // just load
|
||||
eor $a_hi, $a_hi, $a_hi
|
||||
ldr $b_lo, [$in_ptr,#8*6]
|
||||
eor $b_hi, $b_hi, $b_hi
|
||||
bl __inner_loop_62
|
||||
|
||||
mov $f_, $f1
|
||||
mov $g_, $g1
|
||||
ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr
|
||||
bl __smul_383x63
|
||||
bl __smul_767x63_tail
|
||||
ldr x30, [x29,#8]
|
||||
|
||||
asr @t[0], @acc[5], #63 // sign as mask
|
||||
ldp @acc[6], @acc[7], [$f0,#8*0]
|
||||
ldp @acc[8], @acc[9], [$f0,#8*2]
|
||||
ldp @acc[10], @acc[11], [$f0,#8*4]
|
||||
|
||||
and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally
|
||||
and @acc[7], @acc[7], @t[0]
|
||||
adds @acc[0], @acc[0], @acc[6]
|
||||
and @acc[8], @acc[8], @t[0]
|
||||
adcs @acc[1], @acc[1], @acc[7]
|
||||
and @acc[9], @acc[9], @t[0]
|
||||
adcs @acc[2], @acc[2], @acc[8]
|
||||
and @acc[10], @acc[10], @t[0]
|
||||
adcs @acc[3], @acc[3], @acc[9]
|
||||
and @acc[11], @acc[11], @t[0]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*6]
|
||||
adcs @acc[4], @acc[4], @acc[10]
|
||||
stp @acc[2], @acc[3], [$out_ptr,#8*8]
|
||||
adc @acc[5], @acc[5], @acc[11]
|
||||
stp @acc[4], @acc[5], [$out_ptr,#8*10]
|
||||
|
||||
add sp, sp, #$frame
|
||||
ldp x19, x20, [x29,#16]
|
||||
ldp x21, x22, [x29,#32]
|
||||
ldp x23, x24, [x29,#48]
|
||||
ldp x25, x26, [x29,#64]
|
||||
ldp x27, x28, [x29,#80]
|
||||
ldr x29, [sp],#128
|
||||
autiasp
|
||||
ret
|
||||
.size ct_inverse_mod_383,.-ct_inverse_mod_383
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
|
||||
.type __smul_383x63, %function
|
||||
.align 5
|
||||
__smul_383x63:
|
||||
___
|
||||
for($j=0; $j<2; $j++) {
|
||||
my $f_ = $f_; $f_ = $g_ if ($j);
|
||||
my @acc = @acc; @acc = @acc[6..11] if ($j);
|
||||
my $k = 8*12+8*6*$j;
|
||||
$code.=<<___;
|
||||
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
|
||||
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
|
||||
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
|
||||
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
|
||||
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
|
||||
|
||||
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
|
||||
sub $f_, $f_, $f1
|
||||
eor @acc[1], @acc[1], $f1
|
||||
adds @acc[0], @acc[0], $f1, lsr#63
|
||||
eor @acc[2], @acc[2], $f1
|
||||
adcs @acc[1], @acc[1], xzr
|
||||
eor @acc[3], @acc[3], $f1
|
||||
adcs @acc[2], @acc[2], xzr
|
||||
eor @acc[4], @acc[4], $f1
|
||||
adcs @acc[3], @acc[3], xzr
|
||||
umulh @t[0], @acc[0], $f_
|
||||
eor @acc[5], @acc[5], $f1
|
||||
umulh @t[1], @acc[1], $f_
|
||||
adcs @acc[4], @acc[4], xzr
|
||||
umulh @t[2], @acc[2], $f_
|
||||
adcs @acc[5], @acc[5], xzr
|
||||
umulh @t[3], @acc[3], $f_
|
||||
___
|
||||
$code.=<<___ if ($j);
|
||||
adc $g1, xzr, xzr // used in __smul_767x63_tail
|
||||
___
|
||||
$code.=<<___;
|
||||
umulh @t[4], @acc[4], $f_
|
||||
mul @acc[0], @acc[0], $f_
|
||||
mul @acc[1], @acc[1], $f_
|
||||
mul @acc[2], @acc[2], $f_
|
||||
adds @acc[1], @acc[1], @t[0]
|
||||
mul @acc[3], @acc[3], $f_
|
||||
adcs @acc[2], @acc[2], @t[1]
|
||||
mul @acc[4], @acc[4], $f_
|
||||
adcs @acc[3], @acc[3], @t[2]
|
||||
mul @t[5+$j],@acc[5], $f_
|
||||
adcs @acc[4], @acc[4], @t[3]
|
||||
adcs @t[5+$j],@t[5+$j],@t[4]
|
||||
___
|
||||
$code.=<<___ if ($j==0);
|
||||
adc @t[7], xzr, xzr
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
adc @t[7], @t[7], xzr
|
||||
|
||||
adds @acc[0], @acc[0], @acc[6]
|
||||
adcs @acc[1], @acc[1], @acc[7]
|
||||
adcs @acc[2], @acc[2], @acc[8]
|
||||
adcs @acc[3], @acc[3], @acc[9]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*0]
|
||||
adcs @acc[4], @acc[4], @acc[10]
|
||||
stp @acc[2], @acc[3], [$out_ptr,#8*2]
|
||||
adcs @t[5], @t[5], @t[6]
|
||||
stp @acc[4], @t[5], [$out_ptr,#8*4]
|
||||
adc @t[6], @t[7], xzr // used in __smul_767x63_tail
|
||||
|
||||
ret
|
||||
.size __smul_383x63,.-__smul_383x63
|
||||
|
||||
.type __smul_767x63_tail, %function
|
||||
.align 5
|
||||
__smul_767x63_tail:
|
||||
smulh @t[5], @acc[5], $f_
|
||||
ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v|
|
||||
umulh @acc[11],@acc[11], $g_
|
||||
ldp @acc[2], @acc[3], [$in_ptr,#8*26]
|
||||
ldp @acc[4], @acc[5], [$in_ptr,#8*28]
|
||||
|
||||
eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v|
|
||||
eor @acc[1], @acc[1], $f1
|
||||
eor @acc[2], @acc[2], $f1
|
||||
adds @acc[0], @acc[0], $g1
|
||||
eor @acc[3], @acc[3], $f1
|
||||
adcs @acc[1], @acc[1], xzr
|
||||
eor @acc[4], @acc[4], $f1
|
||||
adcs @acc[2], @acc[2], xzr
|
||||
eor @acc[5], @acc[5], $f1
|
||||
adcs @acc[3], @acc[3], xzr
|
||||
umulh @t[0], @acc[0], $g_
|
||||
adcs @acc[4], @acc[4], xzr
|
||||
umulh @t[1], @acc[1], $g_
|
||||
adc @acc[5], @acc[5], xzr
|
||||
|
||||
umulh @t[2], @acc[2], $g_
|
||||
add @acc[11], @acc[11], @t[6]
|
||||
umulh @t[3], @acc[3], $g_
|
||||
asr @t[6], @t[5], #63
|
||||
umulh @t[4], @acc[4], $g_
|
||||
mul @acc[0], @acc[0], $g_
|
||||
mul @acc[1], @acc[1], $g_
|
||||
mul @acc[2], @acc[2], $g_
|
||||
adds @acc[0], @acc[0], @acc[11]
|
||||
mul @acc[3], @acc[3], $g_
|
||||
adcs @acc[1], @acc[1], @t[0]
|
||||
mul @acc[4], @acc[4], $g_
|
||||
adcs @acc[2], @acc[2], @t[1]
|
||||
mul @acc[5], @acc[5], $g_
|
||||
adcs @acc[3], @acc[3], @t[2]
|
||||
adcs @acc[4], @acc[4], @t[3]
|
||||
adc @acc[5], @acc[5], @t[4]
|
||||
|
||||
adds @acc[0], @acc[0], @t[5]
|
||||
adcs @acc[1], @acc[1], @t[6]
|
||||
adcs @acc[2], @acc[2], @t[6]
|
||||
adcs @acc[3], @acc[3], @t[6]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*6]
|
||||
adcs @acc[4], @acc[4], @t[6]
|
||||
stp @acc[2], @acc[3], [$out_ptr,#8*8]
|
||||
adc @acc[5], @acc[5], @t[6]
|
||||
stp @acc[4], @acc[5], [$out_ptr,#8*10]
|
||||
|
||||
ret
|
||||
.size __smul_767x63_tail,.-__smul_767x63_tail
|
||||
|
||||
.type __smul_383_n_shift_by_62, %function
|
||||
.align 5
|
||||
__smul_383_n_shift_by_62:
|
||||
___
|
||||
for($j=0; $j<2; $j++) {
|
||||
my $f0 = $f0; $f0 = $g0 if ($j);
|
||||
my @acc = @acc; @acc = @acc[6..11] if ($j);
|
||||
my $k = 8*6*$j;
|
||||
$code.=<<___;
|
||||
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
|
||||
asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
|
||||
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
|
||||
eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|)
|
||||
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
|
||||
|
||||
eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|)
|
||||
sub @t[7], @t[7], @t[6]
|
||||
eor @acc[1], @acc[1], @t[6]
|
||||
adds @acc[0], @acc[0], @t[6], lsr#63
|
||||
eor @acc[2], @acc[2], @t[6]
|
||||
adcs @acc[1], @acc[1], xzr
|
||||
eor @acc[3], @acc[3], @t[6]
|
||||
adcs @acc[2], @acc[2], xzr
|
||||
eor @acc[4], @acc[4], @t[6]
|
||||
umulh @t[0], @acc[0], @t[7]
|
||||
adcs @acc[3], @acc[3], xzr
|
||||
umulh @t[1], @acc[1], @t[7]
|
||||
eor @acc[5], @acc[5], @t[6]
|
||||
umulh @t[2], @acc[2], @t[7]
|
||||
adcs @acc[4], @acc[4], xzr
|
||||
umulh @t[3], @acc[3], @t[7]
|
||||
adc @acc[5], @acc[5], xzr
|
||||
|
||||
umulh @t[4], @acc[4], @t[7]
|
||||
smulh @t[5+$j], @acc[5], @t[7]
|
||||
mul @acc[0], @acc[0], @t[7]
|
||||
mul @acc[1], @acc[1], @t[7]
|
||||
mul @acc[2], @acc[2], @t[7]
|
||||
adds @acc[1], @acc[1], @t[0]
|
||||
mul @acc[3], @acc[3], @t[7]
|
||||
adcs @acc[2], @acc[2], @t[1]
|
||||
mul @acc[4], @acc[4], @t[7]
|
||||
adcs @acc[3], @acc[3], @t[2]
|
||||
mul @acc[5], @acc[5], @t[7]
|
||||
adcs @acc[4], @acc[4], @t[3]
|
||||
adcs @acc[5], @acc[5] ,@t[4]
|
||||
adc @t[5+$j], @t[5+$j], xzr
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
adds @acc[0], @acc[0], @acc[6]
|
||||
adcs @acc[1], @acc[1], @acc[7]
|
||||
adcs @acc[2], @acc[2], @acc[8]
|
||||
adcs @acc[3], @acc[3], @acc[9]
|
||||
adcs @acc[4], @acc[4], @acc[10]
|
||||
adcs @acc[5], @acc[5], @acc[11]
|
||||
adc @acc[6], @t[5], @t[6]
|
||||
|
||||
extr @acc[0], @acc[1], @acc[0], #62
|
||||
extr @acc[1], @acc[2], @acc[1], #62
|
||||
extr @acc[2], @acc[3], @acc[2], #62
|
||||
asr @t[6], @acc[6], #63
|
||||
extr @acc[3], @acc[4], @acc[3], #62
|
||||
extr @acc[4], @acc[5], @acc[4], #62
|
||||
extr @acc[5], @acc[6], @acc[5], #62
|
||||
|
||||
eor @acc[0], @acc[0], @t[6]
|
||||
eor @acc[1], @acc[1], @t[6]
|
||||
adds @acc[0], @acc[0], @t[6], lsr#63
|
||||
eor @acc[2], @acc[2], @t[6]
|
||||
adcs @acc[1], @acc[1], xzr
|
||||
eor @acc[3], @acc[3], @t[6]
|
||||
adcs @acc[2], @acc[2], xzr
|
||||
eor @acc[4], @acc[4], @t[6]
|
||||
adcs @acc[3], @acc[3], xzr
|
||||
eor @acc[5], @acc[5], @t[6]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*0]
|
||||
adcs @acc[4], @acc[4], xzr
|
||||
stp @acc[2], @acc[3], [$out_ptr,#8*2]
|
||||
adc @acc[5], @acc[5], xzr
|
||||
stp @acc[4], @acc[5], [$out_ptr,#8*4]
|
||||
|
||||
eor $f0, $f0, @t[6]
|
||||
eor $g0, $g0, @t[6]
|
||||
sub $f0, $f0, @t[6]
|
||||
sub $g0, $g0, @t[6]
|
||||
|
||||
ret
|
||||
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
|
||||
___
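__smul_383_n_shift_by_62 conditionally negates its operands with the usual two's-complement identity: with m a mask that is all-ones when the multiplier is negative and zero otherwise, (x ^ m) - m negates x exactly when m is all-ones. The asm folds the "+1" in through adds/adcs using the mask's top bit (the lsr#63 above). A multi-limb sketch of the same identity; the helper name is illustrative:

def cond_negate(limbs, m, width=64):
    # negate the little-endian multi-limb value iff m is the all-ones mask
    mask = (1 << width) - 1
    carry = m & 1                  # 1 iff m is all-ones: the "+1" of two's complement
    out = []
    for x in limbs:
        x = (x ^ m) + carry        # eor, then adds/adcs in the asm
        carry = x >> width
        out.append(x & mask)
    return out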
|
||||
|
||||
{
|
||||
my @a = @acc[0..5];
|
||||
my @b = @acc[6..11];
|
||||
|
||||
$code.=<<___;
|
||||
.type __ab_approximation_62, %function
|
||||
.align 4
|
||||
__ab_approximation_62:
|
||||
ldp @a[4], @a[5], [$in_ptr,#8*4]
|
||||
ldp @b[4], @b[5], [$in_ptr,#8*10]
|
||||
ldp @a[2], @a[3], [$in_ptr,#8*2]
|
||||
ldp @b[2], @b[3], [$in_ptr,#8*8]
|
||||
|
||||
.Lab_approximation_62_loaded:
|
||||
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[3], ne
|
||||
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
|
||||
csel @b[4], @b[4], @b[3], ne
|
||||
|
||||
ldp @a[0], @a[1], [$in_ptr,#8*0]
|
||||
ldp @b[0], @b[1], [$in_ptr,#8*6]
|
||||
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[2], ne
|
||||
orr @t[0], @a[5], @b[5] // ... and ones before that ...
|
||||
csel @b[4], @b[4], @b[2], ne
|
||||
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[1], ne
|
||||
orr @t[0], @a[5], @b[5]
|
||||
csel @b[4], @b[4], @b[1], ne
|
||||
|
||||
clz @t[0], @t[0]
|
||||
cmp @t[0], #64
|
||||
csel @t[0], @t[0], xzr, ne
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
neg @t[1], @t[0]
|
||||
|
||||
lslv @a[5], @a[5], @t[0] // align high limbs to the left
|
||||
lslv @b[5], @b[5], @t[0]
|
||||
lsrv @a[4], @a[4], @t[1]
|
||||
lsrv @b[4], @b[4], @t[1]
|
||||
and @a[4], @a[4], @t[1], asr#6
|
||||
and @b[4], @b[4], @t[1], asr#6
|
||||
orr @a[5], @a[5], @a[4]
|
||||
orr @b[5], @b[5], @b[4]
|
||||
|
||||
b __inner_loop_62
|
||||
ret
|
||||
.size __ab_approximation_62,.-__ab_approximation_62
|
||||
___
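In plain terms, __ab_approximation_62 keeps the exact low 64 bits of each operand and glues on the jointly most-significant 64 bits, located with the csel cascade and clz above. That is exactly what the a_/b_ lines of the embedded Python reference express; a sketch that also folds in the reference's "if n < 128" early-out (with n clamped to 2*w, the small case comes out exact):

def ab_approximation_62(a, b, w=64):
    n = max(a.bit_length(), b.bit_length(), 2 * w)
    mask = (1 << w) - 1
    # exact low w bits, plus the top w bits shared by both operands
    a_ = (a & mask) | ((a >> (n - w)) << w)
    b_ = (b & mask) | ((b >> (n - w)) << w)
    return a_, b_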
|
||||
}
|
||||
$code.=<<___;
|
||||
.type __inner_loop_62, %function
|
||||
.align 4
|
||||
__inner_loop_62:
|
||||
mov $f0, #1 // |f0|=1
|
||||
mov $g0, #0 // |g0|=0
|
||||
mov $f1, #0 // |f1|=0
|
||||
mov $g1, #1 // |g1|=1
|
||||
|
||||
.Loop_62:
|
||||
sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
|
||||
sub $cnt, $cnt, #1
|
||||
subs @t[2], $b_lo, $a_lo // |b_|-|a_|
|
||||
and @t[0], $b_lo, @t[6]
|
||||
sbc @t[3], $b_hi, $a_hi
|
||||
and @t[1], $b_hi, @t[6]
|
||||
subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
|
||||
mov @t[0], $f0
|
||||
sbcs @t[5], $a_hi, @t[1]
|
||||
mov @t[1], $g0
|
||||
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
|
||||
csel $b_hi, $b_hi, $a_hi, hs
|
||||
csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
|
||||
csel $a_hi, @t[5], @t[3], hs
|
||||
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
|
||||
csel $f1, $f1, @t[0], hs
|
||||
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
|
||||
csel $g1, $g1, @t[1], hs
|
||||
extr $a_lo, $a_hi, $a_lo, #1
|
||||
lsr $a_hi, $a_hi, #1
|
||||
and @t[0], $f1, @t[6]
|
||||
and @t[1], $g1, @t[6]
|
||||
add $f1, $f1, $f1 // |f1|<<=1
|
||||
add $g1, $g1, $g1 // |g1|<<=1
|
||||
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
|
||||
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
|
||||
cbnz $cnt, .Loop_62
|
||||
|
||||
ret
|
||||
.size __inner_loop_62,.-__inner_loop_62
|
||||
___
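Unlike the x86_64 variant, which computes both outcomes and rolls back with cmovz, this loop derives an all-ones mask from the low bit of |a_| (the sbfx above) and masks the subtrahends directly. One divstep of the reference, rewritten in that mask style; a sketch, not the shipped reference:

def divstep(a, b, f0, g0, f1, g1):
    odd = -(a & 1)                       # all-ones iff |a_| is odd (the sbfx)
    t = b & odd                          # |b_| or 0
    if a - t < 0:                        # borrow: a is odd and |a_| < |b_|
        a, b = b - a, a                  # replace |a_|-|b_| with |b_|-|a_|
        f0, g0, f1, g1 = f1, g1, f0, g0  # exchange both coefficient pairs
    else:
        a = a - t
    a >>= 1
    f0, g0 = f0 - (f1 & odd), g0 - (g1 & odd)
    f1, g1 = f1 << 1, g1 << 1
    return a, b, f0, g0, f1, g1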
|
||||
|
||||
print $code;
|
||||
close STDOUT;
398
blst/asm/ct_is_square_mod_384-armv8.pl
Executable file
@ -0,0 +1,398 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast quadratic residue test as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change

    k = 30
    w = 32
    mask = (1 << w) - 1

    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
                                    # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2      # if |b|%8 is 3 or 5 [out of 1,3,5,7]

        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1   # |b| is always odd, the second bit
                                # tells the whole story

    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                L += (a & b) >> 1   # |a| and |b| are both odd, second bits
                                    # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2       # if |b|%8 is 3 or 5 [out of 1,3,5,7]

    return (L & 1) ^ 1
___
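The two |L| updates encode quadratic reciprocity for the binary GCD: swapping two odd values flips the symbol exactly when both are 3 mod 4, which is what the second bits in (a_ & b_) >> 1 detect, and halving |a| flips it exactly when |b| mod 8 is 3 or 5, which is the low bit of (b_ + 2) >> 2. Assuming a prime modulus, the result should agree with Euler's criterion; a self-check sketch, runnable once the reference above is pasted into Python, using a stand-in 127-bit prime (the routine itself targets 384-bit moduli):

import random

def check_ct_is_square_mod_384(trials=200):
    p = 2**127 - 1                              # stand-in test prime
    for _ in range(trials):
        x = random.randrange(1, p)
        euler = pow(x, (p - 1) // 2, p) == 1    # Euler's criterion
        assert ct_is_square_mod_384(x, p) == euler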
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2));
|
||||
my @acc=map("x$_",(3..14));
|
||||
my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20));
|
||||
my @t = map("x$_",(21..28));
|
||||
my ($a_, $b_) = @acc[5,11];
|
||||
|
||||
$frame = 2*256;
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.globl ct_is_square_mod_384
|
||||
.type ct_is_square_mod_384, %function
|
||||
.align 5
|
||||
ct_is_square_mod_384:
|
||||
paciasp
|
||||
stp x29, x30, [sp,#-128]!
|
||||
add x29, sp, #0
|
||||
stp x19, x20, [sp,#16]
|
||||
stp x21, x22, [sp,#32]
|
||||
stp x23, x24, [sp,#48]
|
||||
stp x25, x26, [sp,#64]
|
||||
stp x27, x28, [sp,#80]
|
||||
sub sp, sp, #$frame
|
||||
|
||||
ldp @acc[0], @acc[1], [x0,#8*0] // load input
|
||||
ldp @acc[2], @acc[3], [x0,#8*2]
|
||||
ldp @acc[4], @acc[5], [x0,#8*4]
|
||||
|
||||
add $in_ptr, sp, #255 // find closest 256-byte-aligned spot
|
||||
and $in_ptr, $in_ptr, #-256 // in the frame...
|
||||
|
||||
ldp @acc[6], @acc[7], [x1,#8*0] // load modulus
|
||||
ldp @acc[8], @acc[9], [x1,#8*2]
|
||||
ldp @acc[10], @acc[11], [x1,#8*4]
|
||||
|
||||
stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a|
|
||||
stp @acc[2], @acc[3], [$in_ptr,#8*8]
|
||||
stp @acc[4], @acc[5], [$in_ptr,#8*10]
|
||||
stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b|
|
||||
stp @acc[8], @acc[9], [$in_ptr,#8*2]
|
||||
stp @acc[10], @acc[11], [$in_ptr,#8*4]
|
||||
|
||||
eor $L, $L, $L // init the Legendre symbol
|
||||
mov $cnt, #24 // 24 is 768/30-1
|
||||
b .Loop_is_square
|
||||
|
||||
.align 4
|
||||
.Loop_is_square:
|
||||
bl __ab_approximation_30
|
||||
sub $cnt, $cnt, #1
|
||||
|
||||
eor $out_ptr, $in_ptr, #128 // pointer to dst |b|
|
||||
bl __smul_384_n_shift_by_30
|
||||
|
||||
mov $f1, $f0 // |f0|
|
||||
mov $g1, $g0 // |g0|
|
||||
add $out_ptr, $out_ptr, #8*6 // pointer to dst |a|
|
||||
bl __smul_384_n_shift_by_30
|
||||
|
||||
ldp @acc[6], @acc[7], [$out_ptr,#-8*6]
|
||||
eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b|
|
||||
and @t[6], @t[6], @acc[6] // if |a| was negative,
|
||||
add $L, $L, @t[6], lsr#1 // adjust |L|
|
||||
|
||||
cbnz $cnt, .Loop_is_square
|
||||
|
||||
////////////////////////////////////////// last iteration
|
||||
//bl __ab_approximation_30 // |a| and |b| are exact,
|
||||
//ldr $a_, [$in_ptr,#8*6] // just load
|
||||
mov $b_, @acc[6] // ldr $b_, [$in_ptr,#8*0]
|
||||
mov $cnt, #48 // 48 is 768%30 + 30
|
||||
bl __inner_loop_48
|
||||
ldr x30, [x29,#8]
|
||||
|
||||
and x0, $L, #1
|
||||
eor x0, x0, #1
|
||||
|
||||
add sp, sp, #$frame
|
||||
ldp x19, x20, [x29,#16]
|
||||
ldp x21, x22, [x29,#32]
|
||||
ldp x23, x24, [x29,#48]
|
||||
ldp x25, x26, [x29,#64]
|
||||
ldp x27, x28, [x29,#80]
|
||||
ldr x29, [sp],#128
|
||||
autiasp
|
||||
ret
|
||||
.size ct_is_square_mod_384,.-ct_is_square_mod_384
|
||||
|
||||
.type __smul_384_n_shift_by_30, %function
|
||||
.align 5
|
||||
__smul_384_n_shift_by_30:
|
||||
___
|
||||
for($j=0; $j<2; $j++) {
|
||||
my $fx = $g1; $fx = $f1 if ($j);
|
||||
my @acc = @acc; @acc = @acc[6..11] if ($j);
|
||||
my $k = 8*6*$j;
|
||||
$code.=<<___;
|
||||
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|)
|
||||
asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s)
|
||||
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
|
||||
eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|)
|
||||
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
|
||||
|
||||
eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|)
|
||||
sub $fx, $fx, @t[6]
|
||||
eor @acc[1], @acc[1], @t[6]
|
||||
adds @acc[0], @acc[0], @t[6], lsr#63
|
||||
eor @acc[2], @acc[2], @t[6]
|
||||
adcs @acc[1], @acc[1], xzr
|
||||
eor @acc[3], @acc[3], @t[6]
|
||||
adcs @acc[2], @acc[2], xzr
|
||||
eor @acc[4], @acc[4], @t[6]
|
||||
umulh @t[0], @acc[0], $fx
|
||||
adcs @acc[3], @acc[3], xzr
|
||||
umulh @t[1], @acc[1], $fx
|
||||
eor @acc[5], @acc[5], @t[6]
|
||||
umulh @t[2], @acc[2], $fx
|
||||
adcs @acc[4], @acc[4], xzr
|
||||
umulh @t[3], @acc[3], $fx
|
||||
adc @acc[5], @acc[5], xzr
|
||||
|
||||
umulh @t[4], @acc[4], $fx
|
||||
and @t[7], $fx, @t[6]
|
||||
umulh @t[5+$j], @acc[5], $fx
|
||||
neg @t[7], @t[7]
|
||||
mul @acc[0], @acc[0], $fx
|
||||
mul @acc[1], @acc[1], $fx
|
||||
mul @acc[2], @acc[2], $fx
|
||||
adds @acc[1], @acc[1], @t[0]
|
||||
mul @acc[3], @acc[3], $fx
|
||||
adcs @acc[2], @acc[2], @t[1]
|
||||
mul @acc[4], @acc[4], $fx
|
||||
adcs @acc[3], @acc[3], @t[2]
|
||||
mul @acc[5], @acc[5], $fx
|
||||
adcs @acc[4], @acc[4], @t[3]
|
||||
adcs @acc[5], @acc[5] ,@t[4]
|
||||
adc @t[5+$j], @t[5+$j], @t[7]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
adds @acc[0], @acc[0], @acc[6]
|
||||
adcs @acc[1], @acc[1], @acc[7]
|
||||
adcs @acc[2], @acc[2], @acc[8]
|
||||
adcs @acc[3], @acc[3], @acc[9]
|
||||
adcs @acc[4], @acc[4], @acc[10]
|
||||
adcs @acc[5], @acc[5], @acc[11]
|
||||
adc @acc[6], @t[5], @t[6]
|
||||
|
||||
extr @acc[0], @acc[1], @acc[0], #30
|
||||
extr @acc[1], @acc[2], @acc[1], #30
|
||||
extr @acc[2], @acc[3], @acc[2], #30
|
||||
asr @t[6], @acc[6], #63
|
||||
extr @acc[3], @acc[4], @acc[3], #30
|
||||
extr @acc[4], @acc[5], @acc[4], #30
|
||||
extr @acc[5], @acc[6], @acc[5], #30
|
||||
|
||||
eor @acc[0], @acc[0], @t[6]
|
||||
eor @acc[1], @acc[1], @t[6]
|
||||
adds @acc[0], @acc[0], @t[6], lsr#63
|
||||
eor @acc[2], @acc[2], @t[6]
|
||||
adcs @acc[1], @acc[1], xzr
|
||||
eor @acc[3], @acc[3], @t[6]
|
||||
adcs @acc[2], @acc[2], xzr
|
||||
eor @acc[4], @acc[4], @t[6]
|
||||
adcs @acc[3], @acc[3], xzr
|
||||
eor @acc[5], @acc[5], @t[6]
|
||||
stp @acc[0], @acc[1], [$out_ptr,#8*0]
|
||||
adcs @acc[4], @acc[4], xzr
|
||||
stp @acc[2], @acc[3], [$out_ptr,#8*2]
|
||||
adc @acc[5], @acc[5], xzr
|
||||
stp @acc[4], @acc[5], [$out_ptr,#8*4]
|
||||
|
||||
ret
|
||||
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
|
||||
___
|
||||
|
||||
{
|
||||
my @a = @acc[0..5];
|
||||
my @b = @acc[6..11];
|
||||
my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]);
|
||||
|
||||
$code.=<<___;
|
||||
.type __ab_approximation_30, %function
|
||||
.align 4
|
||||
__ab_approximation_30:
|
||||
ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers
|
||||
ldp @b[2], @b[3], [$in_ptr,#8*2]
|
||||
|
||||
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[3], ne
|
||||
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
|
||||
csel @b[4], @b[4], @b[3], ne
|
||||
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[2], ne
|
||||
orr @t[0], @a[5], @b[5] // ... and ones before that ...
|
||||
csel @b[4], @b[4], @b[2], ne
|
||||
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[1], ne
|
||||
orr @t[0], @a[5], @b[5] // and one more, ...
|
||||
csel @b[4], @b[4], @b[1], ne
|
||||
|
||||
cmp @t[0], #0
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
csel @a[4], @a[4], @a[0], ne
|
||||
orr @t[0], @a[5], @b[5]
|
||||
csel @b[4], @b[4], @b[0], ne
|
||||
|
||||
clz @t[0], @t[0]
|
||||
cmp @t[0], #64
|
||||
csel @t[0], @t[0], xzr, ne
|
||||
csel @a[5], @a[5], @a[4], ne
|
||||
csel @b[5], @b[5], @b[4], ne
|
||||
neg @t[1], @t[0]
|
||||
|
||||
lslv @a[5], @a[5], @t[0] // align high limbs to the left
|
||||
lslv @b[5], @b[5], @t[0]
|
||||
lsrv @a[4], @a[4], @t[1]
|
||||
lsrv @b[4], @b[4], @t[1]
|
||||
and @a[4], @a[4], @t[1], asr#6
|
||||
and @b[4], @b[4], @t[1], asr#6
|
||||
orr $a_, @a[5], @a[4]
|
||||
orr $b_, @b[5], @b[4]
|
||||
|
||||
bfxil $a_, @a[0], #0, #32
|
||||
bfxil $b_, @b[0], #0, #32
|
||||
|
||||
b __inner_loop_30
|
||||
ret
|
||||
.size __ab_approximation_30,.-__ab_approximation_30
|
||||
|
||||
.type __inner_loop_30, %function
|
||||
.align 4
|
||||
__inner_loop_30:
|
||||
mov $cnt, #30
|
||||
mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
|
||||
mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1
|
||||
mov $bias,#0x7FFFFFFF7FFFFFFF
|
||||
|
||||
.Loop_30:
|
||||
sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting
|
||||
and @t[4], $a_, $b_
|
||||
sub $cnt, $cnt, #1
|
||||
and @t[0], $b_, @t[3]
|
||||
|
||||
sub @t[1], $b_, $a_ // |b_|-|a_|
|
||||
subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
|
||||
add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1
|
||||
mov @t[0], $fg1
|
||||
csel $b_, $b_, $a_, hs // |b_| = |a_|
|
||||
csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
|
||||
csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1|
|
||||
csel $fg0, $fg0, @t[0], hs
|
||||
csel $L, $L, @t[4], hs
|
||||
lsr $a_, $a_, #1
|
||||
and @t[0], $fg1, @t[3]
|
||||
and @t[1], $bias, @t[3]
|
||||
add $t[2], $b_, #2
|
||||
sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
|
||||
add $fg1, $fg1, $fg1 // |f1|<<=1
|
||||
add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5
|
||||
add $fg0, $fg0, @t[1]
|
||||
sub $fg1, $fg1, $bias
|
||||
|
||||
cbnz $cnt, .Loop_30
|
||||
|
||||
mov $bias, #0x7FFFFFFF
|
||||
ubfx $f0, $fg0, #0, #32
|
||||
ubfx $g0, $fg0, #32, #32
|
||||
ubfx $f1, $fg1, #0, #32
|
||||
ubfx $g1, $fg1, #32, #32
|
||||
sub $f0, $f0, $bias // remove the bias
|
||||
sub $g0, $g0, $bias
|
||||
sub $f1, $f1, $bias
|
||||
sub $g1, $g1, $bias
|
||||
|
||||
ret
|
||||
.size __inner_loop_30,.-__inner_loop_30
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
.type __inner_loop_48, %function
|
||||
.align 4
|
||||
__inner_loop_48:
|
||||
.Loop_48:
|
||||
sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting
|
||||
and @t[4], $a_, $b_
|
||||
sub $cnt, $cnt, #1
|
||||
and @t[0], $b_, @t[3]
|
||||
sub @t[1], $b_, $a_ // |b_|-|a_|
|
||||
subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
|
||||
add @t[4], $L, @t[4], lsr#1
|
||||
csel $b_, $b_, $a_, hs // |b_| = |a_|
|
||||
csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
|
||||
csel $L, $L, @t[4], hs
|
||||
add $t[2], $b_, #2
|
||||
lsr $a_, $a_, #1
|
||||
add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5
|
||||
|
||||
cbnz $cnt, .Loop_48
|
||||
|
||||
ret
|
||||
.size __inner_loop_48,.-__inner_loop_48
|
||||
___
|
||||
|
||||
print $code;
|
||||
close STDOUT;
494
blst/asm/ct_is_square_mod_384-x86_64.pl
Executable file
@ -0,0 +1,494 @@
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright Supranational LLC
|
||||
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Both constant-time and fast quadratic residue test as suggested in
|
||||
# https://eprint.iacr.org/2020/972. Performance is >5x better than
|
||||
# modulus-specific Legendre symbol addition chain...
|
||||
#
|
||||
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
|
||||
#
|
||||
$python_ref.=<<'___';
|
||||
def ct_is_square_mod_384(inp, mod):
|
||||
a = inp
|
||||
b = mod
|
||||
L = 0 # only least significant bit, adding 1 makes up for sign change
|
||||
|
||||
k = 30
|
||||
w = 32
|
||||
mask = (1 << w) - 1
|
||||
|
||||
for i in range(0, 768 // k - 1):
|
||||
# __ab_approximation_30
|
||||
n = max(a.bit_length(), b.bit_length())
|
||||
if n < 64:
|
||||
a_, b_ = a, b
|
||||
else:
|
||||
a_ = (a & mask) | ((a >> (n-w)) << w)
|
||||
b_ = (b & mask) | ((b >> (n-w)) << w)
|
||||
|
||||
# __inner_loop_30
|
||||
f0, g0, f1, g1 = 1, 0, 0, 1
|
||||
for j in range(0, k):
|
||||
if a_ & 1:
|
||||
if a_ < b_:
|
||||
a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
|
||||
L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
|
||||
# tell the whole story
|
||||
a_, f0, g0 = a_-b_, f0-f1, g0-g1
|
||||
a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
|
||||
L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7]
|
||||
|
||||
# __smulq_384_n_shift_by_30
|
||||
a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
|
||||
if b < 0:
|
||||
b = -b
|
||||
if a < 0:
|
||||
a = -a
|
||||
L += (b % 4) >> 1 # |b| is always odd, the second bit
|
||||
# tells the whole story
|
||||
|
||||
if True:
|
||||
for j in range(0, 768 % k + k):
|
||||
if a & 1:
|
||||
if a < b:
|
||||
a, b = b, a
|
||||
L += (a & b) >> 1 # |a| and |b| are both odd, second bits
|
||||
# tell the whole story
|
||||
a = a-b
|
||||
a = a >> 1
|
||||
L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7]
|
||||
|
||||
return (L & 1) ^ 1
|
||||
___
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
|
||||
or die "can't call $xlate: $!";
|
||||
|
||||
my ($out_ptr, $in_ptr) = ("%rdi", "%rsi");
|
||||
my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx");
|
||||
my @acc=map("%r$_",(8..15));
|
||||
my $L = "%rbp";
|
||||
|
||||
$frame = 8*3+2*256;
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.globl ct_is_square_mod_384
|
||||
.type ct_is_square_mod_384,\@function,2,"unwind"
|
||||
.align 32
|
||||
ct_is_square_mod_384:
|
||||
.cfi_startproc
|
||||
push %rbp
|
||||
.cfi_push %rbp
|
||||
push %rbx
|
||||
.cfi_push %rbx
|
||||
push %r12
|
||||
.cfi_push %r12
|
||||
push %r13
|
||||
.cfi_push %r13
|
||||
push %r14
|
||||
.cfi_push %r14
|
||||
push %r15
|
||||
.cfi_push %r15
|
||||
sub \$$frame, %rsp
|
||||
.cfi_adjust_cfa_offset $frame
|
||||
.cfi_end_prologue
|
||||
|
||||
lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot
|
||||
and \$-256, %rax # in the frame...
|
||||
|
||||
mov 8*0(%rdi), @acc[0] # load input
|
||||
mov 8*1(%rdi), @acc[1]
|
||||
mov 8*2(%rdi), @acc[2]
|
||||
mov 8*3(%rdi), @acc[3]
|
||||
mov 8*4(%rdi), @acc[4]
|
||||
mov 8*5(%rdi), @acc[5]
|
||||
|
||||
mov 8*0(%rsi), @acc[6] # load modulus
|
||||
mov 8*1(%rsi), @acc[7]
|
||||
mov 8*2(%rsi), %rbx
|
||||
mov 8*3(%rsi), %rcx
|
||||
mov 8*4(%rsi), %rdx
|
||||
mov 8*5(%rsi), %rdi
|
||||
mov %rax, $in_ptr # pointer to source |a|b|
|
||||
|
||||
mov @acc[0], 8*0(%rax) # copy input to |a|
|
||||
mov @acc[1], 8*1(%rax)
|
||||
mov @acc[2], 8*2(%rax)
|
||||
mov @acc[3], 8*3(%rax)
|
||||
mov @acc[4], 8*4(%rax)
|
||||
mov @acc[5], 8*5(%rax)
|
||||
|
||||
mov @acc[6], 8*6(%rax) # copy modulus to |b|
|
||||
mov @acc[7], 8*7(%rax)
|
||||
mov %rbx, 8*8(%rax)
|
||||
mov %rcx, 8*9(%rax)
|
||||
mov %rdx, 8*10(%rax)
|
||||
mov %rdi, 8*11(%rax)
|
||||
|
||||
xor $L, $L # initialize the Legendre symbol
|
||||
mov \$24, %ecx # 24 is 768/30-1
|
||||
jmp .Loop_is_square
|
||||
|
||||
.align 32
|
||||
.Loop_is_square:
|
||||
mov %ecx, 8*2(%rsp) # offload loop counter
|
||||
|
||||
call __ab_approximation_30
|
||||
mov $f0, 8*0(%rsp) # offload |f0| and |g0|
|
||||
mov $g0, 8*1(%rsp)
|
||||
|
||||
mov \$128+8*6, $out_ptr
|
||||
xor $in_ptr, $out_ptr # pointer to destination |b|
|
||||
call __smulq_384_n_shift_by_30
|
||||
|
||||
mov 8*0(%rsp), $f1 # pop |f0| and |g0|
|
||||
mov 8*1(%rsp), $g1
|
||||
lea -8*6($out_ptr),$out_ptr # pointer to destination |a|
|
||||
call __smulq_384_n_shift_by_30
|
||||
|
||||
mov 8*2(%rsp), %ecx # re-load loop counter
|
||||
xor \$128, $in_ptr # flip-flop pointer to source |a|b|
|
||||
|
||||
and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L|
|
||||
shr \$1, @acc[6]
|
||||
add @acc[6], $L
|
||||
|
||||
sub \$1, %ecx
|
||||
jnz .Loop_is_square
|
||||
|
||||
################################# last iteration
|
||||
#call __ab_approximation_30 # |a| and |b| are exact, just load
|
||||
#mov 8*0($in_ptr), @acc[0] # |a_|
|
||||
mov 8*6($in_ptr), @acc[1] # |b_|
|
||||
call __inner_loop_48 # 48 is 768%30+30
|
||||
|
||||
mov \$1, %rax
|
||||
and $L, %rax
|
||||
xor \$1, %rax # return value
|
||||
|
||||
lea $frame(%rsp), %r8 # size optimization
|
||||
mov 8*0(%r8),%r15
|
||||
.cfi_restore %r15
|
||||
mov 8*1(%r8),%r14
|
||||
.cfi_restore %r14
|
||||
mov 8*2(%r8),%r13
|
||||
.cfi_restore %r13
|
||||
mov 8*3(%r8),%r12
|
||||
.cfi_restore %r12
|
||||
mov 8*4(%r8),%rbx
|
||||
.cfi_restore %rbx
|
||||
mov 8*5(%r8),%rbp
|
||||
.cfi_restore %rbp
|
||||
lea 8*6(%r8),%rsp
|
||||
.cfi_adjust_cfa_offset -$frame-8*6
|
||||
.cfi_epilogue
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size ct_is_square_mod_384,.-ct_is_square_mod_384
|
||||
|
||||
.type __smulq_384_n_shift_by_30,\@abi-omnipotent
|
||||
.align 32
|
||||
__smulq_384_n_shift_by_30:
|
||||
___
|
||||
for($j=0; $j<2; $j++) {
|
||||
$code.=<<___;
|
||||
mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
|
||||
mov 8*1($in_ptr), @acc[1]
|
||||
mov 8*2($in_ptr), @acc[2]
|
||||
mov 8*3($in_ptr), @acc[3]
|
||||
mov 8*4($in_ptr), @acc[4]
|
||||
mov 8*5($in_ptr), @acc[5]
|
||||
|
||||
mov %rdx, %rbx # |f1| (or |g1|)
|
||||
sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s)
|
||||
xor %rax, %rax
|
||||
sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s)
|
||||
|
||||
xor %rdx, %rbx # conditionally negate |f1| (or |g1|)
|
||||
add %rax, %rbx
|
||||
|
||||
xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
|
||||
xor %rdx, @acc[1]
|
||||
xor %rdx, @acc[2]
|
||||
xor %rdx, @acc[3]
|
||||
xor %rdx, @acc[4]
|
||||
xor %rdx, @acc[5]
|
||||
add @acc[0], %rax
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
|
||||
mov %rdx, @acc[6+$j]
|
||||
and %rbx, @acc[6+$j]
|
||||
mulq %rbx # |a|*|f1| (or |b|*|g1|)
|
||||
mov %rax, @acc[0]
|
||||
mov @acc[1], %rax
|
||||
mov %rdx, @acc[1]
|
||||
___
|
||||
for($i=1; $i<5; $i++) {
|
||||
$code.=<<___;
|
||||
mulq %rbx
|
||||
add %rax, @acc[$i]
|
||||
mov @acc[$i+1], %rax
|
||||
adc \$0, %rdx
|
||||
mov %rdx, @acc[$i+1]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
neg @acc[6+$j]
|
||||
mulq %rbx
|
||||
add %rax, @acc[5]
|
||||
adc %rdx, @acc[6+$j]
|
||||
___
|
||||
$code.=<<___ if ($j==0);
|
||||
lea 8*6($in_ptr), $in_ptr # pointer to |b|
|
||||
mov $g1, %rdx
|
||||
|
||||
mov @acc[0], 8*0($out_ptr)
|
||||
mov @acc[1], 8*1($out_ptr)
|
||||
mov @acc[2], 8*2($out_ptr)
|
||||
mov @acc[3], 8*3($out_ptr)
|
||||
mov @acc[4], 8*4($out_ptr)
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
|
||||
|
||||
add 8*0($out_ptr), @acc[0]
|
||||
adc 8*1($out_ptr), @acc[1]
|
||||
adc 8*2($out_ptr), @acc[2]
|
||||
adc 8*3($out_ptr), @acc[3]
|
||||
adc 8*4($out_ptr), @acc[4]
|
||||
adc 8*5($out_ptr), @acc[5]
|
||||
adc @acc[7], @acc[6]
|
||||
|
||||
shrd \$30, @acc[1], @acc[0]
|
||||
shrd \$30, @acc[2], @acc[1]
|
||||
shrd \$30, @acc[3], @acc[2]
|
||||
shrd \$30, @acc[4], @acc[3]
|
||||
shrd \$30, @acc[5], @acc[4]
|
||||
shrd \$30, @acc[6], @acc[5]
|
||||
|
||||
sar \$63, @acc[6] # sign as mask
|
||||
xor %rbx, %rbx
|
||||
sub @acc[6], %rbx # sign as bit
|
||||
|
||||
xor @acc[6], @acc[0] # conditionally negate the result
|
||||
xor @acc[6], @acc[1]
|
||||
xor @acc[6], @acc[2]
|
||||
xor @acc[6], @acc[3]
|
||||
xor @acc[6], @acc[4]
|
||||
xor @acc[6], @acc[5]
|
||||
add %rbx, @acc[0]
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
|
||||
mov @acc[0], 8*0($out_ptr)
|
||||
mov @acc[1], 8*1($out_ptr)
|
||||
mov @acc[2], 8*2($out_ptr)
|
||||
mov @acc[3], 8*3($out_ptr)
|
||||
mov @acc[4], 8*4($out_ptr)
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
|
||||
ret
|
||||
.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
|
||||
___
|
||||
{
|
||||
my ($a_, $b_) = @acc[0..1];
|
||||
my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15));
|
||||
my ($fg0, $fg1, $bias) = ($g0, $g1, $t5);
|
||||
my $cnt = "%edi";
|
||||
{
|
||||
my @a = @acc[0..5];
|
||||
my @b = (@a[1..3], $t4, $t5, $g0);
|
||||
|
||||
$code.=<<___;
|
||||
.type __ab_approximation_30,\@abi-omnipotent
|
||||
.align 32
|
||||
__ab_approximation_30:
|
||||
mov 8*11($in_ptr), @b[5] # load |b| in reverse order
|
||||
mov 8*10($in_ptr), @b[4]
|
||||
mov 8*9($in_ptr), @b[3]
|
||||
|
||||
mov @a[5], %rax
|
||||
or @b[5], %rax # check top-most limbs, ...
|
||||
cmovz @a[4], @a[5]
|
||||
cmovz @b[4], @b[5]
|
||||
cmovz @a[3], @a[4]
|
||||
mov 8*8($in_ptr), @b[2]
|
||||
cmovz @b[3], @b[4]
|
||||
|
||||
mov @a[5], %rax
|
||||
or @b[5], %rax # ... ones before top-most, ...
|
||||
cmovz @a[4], @a[5]
|
||||
cmovz @b[4], @b[5]
|
||||
cmovz @a[2], @a[4]
|
||||
mov 8*7($in_ptr), @b[1]
|
||||
cmovz @b[2], @b[4]
|
||||
|
||||
mov @a[5], %rax
|
||||
or @b[5], %rax # ... and ones before that ...
|
||||
cmovz @a[4], @a[5]
|
||||
cmovz @b[4], @b[5]
|
||||
cmovz @a[1], @a[4]
|
||||
mov 8*6($in_ptr), @b[0]
|
||||
cmovz @b[1], @b[4]
|
||||
|
||||
mov @a[5], %rax
|
||||
or @b[5], %rax # ... and ones before that ...
|
||||
cmovz @a[4], @a[5]
|
||||
cmovz @b[4], @b[5]
|
||||
cmovz @a[0], @a[4]
|
||||
cmovz @b[0], @b[4]
|
||||
|
||||
mov @a[5], %rax
|
||||
or @b[5], %rax
|
||||
bsr %rax, %rcx
|
||||
lea 1(%rcx), %rcx
|
||||
cmovz @a[0], @a[5]
|
||||
cmovz @b[0], @b[5]
|
||||
cmovz %rax, %rcx
|
||||
neg %rcx
|
||||
#and \$63, %rcx # debugging artefact
|
||||
|
||||
shldq %cl, @a[4], @a[5] # align second limb to the left
|
||||
shldq %cl, @b[4], @b[5]
|
||||
|
||||
mov \$0xFFFFFFFF00000000, %rax
|
||||
mov @a[0]d, ${a_}d
|
||||
mov @b[0]d, ${b_}d
|
||||
and %rax, @a[5]
|
||||
and %rax, @b[5]
|
||||
or @a[5], ${a_}
|
||||
or @b[5], ${b_}
|
||||
|
||||
jmp __inner_loop_30
|
||||
|
||||
ret
|
||||
.size __ab_approximation_30,.-__ab_approximation_30
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.type __inner_loop_30,\@abi-omnipotent
|
||||
.align 32
|
||||
__inner_loop_30: ################# by Thomas Pornin
|
||||
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
|
||||
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
|
||||
lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF
|
||||
mov \$30, $cnt
|
||||
|
||||
.Loop_30:
|
||||
mov $a_, %rax
|
||||
and $b_, %rax
|
||||
shr \$1, %rax # (a_ & b_) >> 1
|
||||
|
||||
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
|
||||
mov $a_, $t0
|
||||
mov $b_, $t1
|
||||
lea (%rax,$L), %rax # pre-"negate" |L|
|
||||
mov $fg0, $t2
|
||||
mov $fg1, $t3
|
||||
mov $L, $t4
|
||||
cmovb $b_, $a_
|
||||
cmovb $t0, $b_
|
||||
cmovb $fg1, $fg0
|
||||
cmovb $t2, $fg1
|
||||
cmovb %rax, $L
|
||||
|
||||
sub $b_, $a_ # |a_|-|b_|
|
||||
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
|
||||
add $bias, $fg0
|
||||
|
||||
test \$1, $t0 # if |a_| was even, roll back
|
||||
cmovz $t0, $a_
|
||||
cmovz $t1, $b_
|
||||
cmovz $t2, $fg0
|
||||
cmovz $t3, $fg1
|
||||
cmovz $t4, $L
|
||||
|
||||
lea 2($b_), %rax
|
||||
shr \$1, $a_ # |a_|>>=1
|
||||
shr \$2, %rax
|
||||
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
|
||||
lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5
|
||||
sub $bias, $fg1
|
||||
|
||||
sub \$1, $cnt
|
||||
jnz .Loop_30
|
||||
|
||||
shr \$32, $bias
|
||||
mov %ebx, %eax # $fg0 -> $f0
|
||||
shr \$32, $g0
|
||||
mov %ecx, %edx # $fg1 -> $f1
|
||||
shr \$32, $g1
|
||||
sub $bias, $f0 # remove the bias
|
||||
sub $bias, $g0
|
||||
sub $bias, $f1
|
||||
sub $bias, $g1
|
||||
|
||||
ret
|
||||
.size __inner_loop_30,.-__inner_loop_30
|
||||
|
||||
.type __inner_loop_48,\@abi-omnipotent
|
||||
.align 32
|
||||
__inner_loop_48:
|
||||
mov \$48, $cnt # 48 is 768%30+30
|
||||
|
||||
.Loop_48:
|
||||
mov $a_, %rax
|
||||
and $b_, %rax
|
||||
shr \$1, %rax # (a_ & b_) >> 1
|
||||
|
||||
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
|
||||
mov $a_, $t0
|
||||
mov $b_, $t1
|
||||
lea (%rax,$L), %rax
|
||||
mov $L, $t2
|
||||
cmovb $b_, $a_
|
||||
cmovb $t0, $b_
|
||||
cmovb %rax, $L
|
||||
|
||||
sub $b_, $a_ # |a_|-|b_|
|
||||
|
||||
test \$1, $t0 # if |a_| was even, roll back
|
||||
cmovz $t0, $a_
|
||||
cmovz $t1, $b_
|
||||
cmovz $t2, $L
|
||||
|
||||
lea 2($b_), %rax
|
||||
shr \$1, $a_ # |a_|>>=1
|
||||
shr \$2, %rax
|
||||
add %rax, $L # "negate" |L| if |b|%8 is 3 or 5
|
||||
|
||||
sub \$1, $cnt
|
||||
jnz .Loop_48
|
||||
|
||||
ret
|
||||
.size __inner_loop_48,.-__inner_loop_48
|
||||
___
|
||||
}
|
||||
|
||||
print $code;
|
||||
close STDOUT;
886
blst/asm/ctq_inverse_mod_384-x86_64.pl
Executable file
@ -0,0 +1,886 @@
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright Supranational LLC
|
||||
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Both constant-time and fast Euclidean inversion as suggested in
|
||||
# https://eprint.iacr.org/2020/972. Performance is >5x better than
|
||||
# modulus-specific FLT addition chain...
|
||||
#
|
||||
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
|
||||
#
|
||||
$python_ref.=<<'___';
|
||||
def ct_inverse_mod_383(inp, mod):
|
||||
a, u = inp, 1
|
||||
b, v = mod, 0
|
||||
|
||||
k = 62
|
||||
w = 64
|
||||
mask = (1 << w) - 1
|
||||
|
||||
for i in range(0, 766 // k):
|
||||
# __ab_approximation_62
|
||||
n = max(a.bit_length(), b.bit_length())
|
||||
if n < 128:
|
||||
a_, b_ = a, b
|
||||
else:
|
||||
a_ = (a & mask) | ((a >> (n-w)) << w)
|
||||
b_ = (b & mask) | ((b >> (n-w)) << w)
|
||||
|
||||
# __inner_loop_62
|
||||
f0, g0, f1, g1 = 1, 0, 0, 1
|
||||
for j in range(0, k):
|
||||
if a_ & 1:
|
||||
if a_ < b_:
|
||||
a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
|
||||
a_, f0, g0 = a_-b_, f0-f1, g0-g1
|
||||
a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
|
||||
|
||||
# __smulq_383_n_shift_by_62
|
||||
a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
|
||||
if a < 0:
|
||||
a, f0, g0 = -a, -f0, -g0
|
||||
if b < 0:
|
||||
b, f1, g1 = -b, -f1, -g1
|
||||
|
||||
# __smulq_767x63
|
||||
u, v = u*f0 + v*g0, u*f1 + v*g1
|
||||
|
||||
if 766 % k:
|
||||
f0, g0, f1, g1 = 1, 0, 0, 1
|
||||
for j in range(0, 766 % k):
|
||||
if a & 1:
|
||||
if a < b:
|
||||
a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
|
||||
a, f0, g0 = a-b, f0-f1, g0-g1
|
||||
a, f1, g1 = a >> 1, f1 << 1, g1 << 1
|
||||
|
||||
v = u*f1 + v*g1
|
||||
|
||||
if v < 0:
|
||||
v += mod << (768 - mod.bit_length()) # left aligned
|
||||
|
||||
return v & (2**768 - 1) # to be reduced % mod
|
||||
___
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
|
||||
or die "can't call $xlate: $!";
|
||||
|
||||
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
|
||||
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
|
||||
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
|
||||
my $cnt = "%edi";
|
||||
|
||||
$frame = 8*11+2*512;
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.globl ct_inverse_mod_383
|
||||
.type ct_inverse_mod_383,\@function,4,"unwind"
|
||||
.align 32
|
||||
ct_inverse_mod_383:
|
||||
.cfi_startproc
|
||||
push %rbp
|
||||
.cfi_push %rbp
|
||||
push %rbx
|
||||
.cfi_push %rbx
|
||||
push %r12
|
||||
.cfi_push %r12
|
||||
push %r13
|
||||
.cfi_push %r13
|
||||
push %r14
|
||||
.cfi_push %r14
|
||||
push %r15
|
||||
.cfi_push %r15
|
||||
sub \$$frame, %rsp
|
||||
.cfi_adjust_cfa_offset $frame
|
||||
.cfi_end_prologue
|
||||
|
||||
lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot
|
||||
and \$-512, %rax # in the frame...
|
||||
mov $out_ptr, 8*4(%rsp)
|
||||
mov $nx_ptr, 8*5(%rsp)
|
||||
|
||||
mov 8*0($in_ptr), @acc[0] # load input
|
||||
mov 8*1($in_ptr), @acc[1]
|
||||
mov 8*2($in_ptr), @acc[2]
|
||||
mov 8*3($in_ptr), @acc[3]
|
||||
mov 8*4($in_ptr), @acc[4]
|
||||
mov 8*5($in_ptr), @acc[5]
|
||||
|
||||
mov 8*0($n_ptr), @acc[6] # load modulus
|
||||
mov 8*1($n_ptr), @acc[7]
|
||||
mov 8*2($n_ptr), @acc[8]
|
||||
mov 8*3($n_ptr), @acc[9]
|
||||
mov 8*4($n_ptr), @acc[10]
|
||||
mov 8*5($n_ptr), @acc[11]
|
||||
|
||||
mov @acc[0], 8*0(%rax) # copy input to |a|
|
||||
mov @acc[1], 8*1(%rax)
|
||||
mov @acc[2], 8*2(%rax)
|
||||
mov @acc[3], 8*3(%rax)
|
||||
mov @acc[4], 8*4(%rax)
|
||||
mov @acc[5], 8*5(%rax)
|
||||
|
||||
mov @acc[6], 8*6(%rax) # copy modulus to |b|
|
||||
mov @acc[7], 8*7(%rax)
|
||||
mov @acc[8], 8*8(%rax)
|
||||
mov @acc[9], 8*9(%rax)
|
||||
mov @acc[10], 8*10(%rax)
|
||||
mov %rax, $in_ptr # pointer to source |a|b|1|0|
|
||||
mov @acc[11], 8*11(%rax)
|
||||
|
||||
################################# first iteration
|
||||
mov \$62, $cnt
|
||||
call __ab_approximation_62
|
||||
#mov $f0, 8*7(%rsp)
|
||||
#mov $g0, 8*8(%rsp)
|
||||
mov $f1, 8*9(%rsp)
|
||||
mov $g1, 8*10(%rsp)
|
||||
|
||||
mov \$256, $out_ptr
|
||||
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
|
||||
call __smulq_383_n_shift_by_62
|
||||
#mov $f0, 8*7(%rsp) # corrected |f0|
|
||||
#mov $g0, 8*8(%rsp) # corrected |g0|
|
||||
mov $f0, 8*12($out_ptr) # initialize |u| with |f0|
|
||||
|
||||
mov 8*9(%rsp), $f0 # |f1|
|
||||
mov 8*10(%rsp), $g0 # |g1|
|
||||
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
|
||||
call __smulq_383_n_shift_by_62
|
||||
#mov $f0, 8*9(%rsp) # corrected |f1|
|
||||
#mov $g0, 8*10(%rsp) # corrected |g1|
|
||||
mov $f0, 8*12($out_ptr) # initialize |v| with |f1|
|
||||
|
||||
################################# second iteration
|
||||
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
|
||||
mov \$62, $cnt
|
||||
call __ab_approximation_62
|
||||
#mov $f0, 8*7(%rsp)
|
||||
#mov $g0, 8*8(%rsp)
|
||||
mov $f1, 8*9(%rsp)
|
||||
mov $g1, 8*10(%rsp)
|
||||
|
||||
mov \$256, $out_ptr
|
||||
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
|
||||
call __smulq_383_n_shift_by_62
|
||||
mov $f0, 8*7(%rsp) # corrected |f0|
|
||||
mov $g0, 8*8(%rsp) # corrected |g0|
|
||||
|
||||
mov 8*9(%rsp), $f0 # |f1|
|
||||
mov 8*10(%rsp), $g0 # |g1|
|
||||
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
|
||||
call __smulq_383_n_shift_by_62
|
||||
#mov $f0, 8*9(%rsp) # corrected |f1|
|
||||
#mov $g0, 8*10(%rsp) # corrected |g1|
|
||||
|
||||
mov 8*12($in_ptr), %rax # |u|
|
||||
mov 8*18($in_ptr), @acc[3] # |v|
|
||||
mov $f0, %rbx
|
||||
mov %rax, @acc[2]
|
||||
imulq 8*7(%rsp) # |u|*|f0|
|
||||
mov %rax, @acc[0]
|
||||
mov @acc[3], %rax
|
||||
mov %rdx, @acc[1]
|
||||
imulq 8*8(%rsp) # |v|*|g0|
|
||||
add %rax, @acc[0]
|
||||
adc %rdx, @acc[1]
|
||||
mov @acc[0], 8*6($out_ptr) # destination |u|
|
||||
mov @acc[1], 8*7($out_ptr)
|
||||
sar \$63, @acc[1] # sign extension
|
||||
mov @acc[1], 8*8($out_ptr)
|
||||
mov @acc[1], 8*9($out_ptr)
|
||||
mov @acc[1], 8*10($out_ptr)
|
||||
mov @acc[1], 8*11($out_ptr)
|
||||
lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor
|
||||
|
||||
mov @acc[2], %rax
|
||||
imulq %rbx # |u|*|f1|
|
||||
mov %rax, @acc[0]
|
||||
mov @acc[3], %rax
|
||||
mov %rdx, @acc[1]
|
||||
imulq %rcx # |v|*|g1|
|
||||
add %rax, @acc[0]
|
||||
adc %rdx, @acc[1]
|
||||
mov @acc[0], 8*12($out_ptr) # destination |v|
|
||||
mov @acc[1], 8*13($out_ptr)
|
||||
sar \$63, @acc[1] # sign extension
|
||||
mov @acc[1], 8*14($out_ptr)
|
||||
mov @acc[1], 8*15($out_ptr)
|
||||
mov @acc[1], 8*16($out_ptr)
|
||||
mov @acc[1], 8*17($out_ptr)
|
||||
___
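At this point |u| and |v| are still single limbs, so u*f0 + v*g0 is formed with just two imulq's into a signed 128-bit value, and the sar \$63 broadcast fills the remaining words of the destination slot with the sign. A sketch of that shortcut; the function name is illustrative:

def smul_single_limb(u, f0, v, g0, limbs=6, width=64):
    mask = (1 << width) - 1
    acc = u * f0 + v * g0          # |u|, |v| still one limb each: fits 128 bits
    lo = acc & mask                # low limb of the destination
    hi = (acc >> width) & mask     # high limb of the 128-bit product
    sign = mask if acc < 0 else 0  # sar $63: sign limb, broadcast to the rest
    return [lo, hi] + [sign] * (limbs - 2)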
|
||||
for($i=2; $i<11; $i++) {
|
||||
my $smul_767x63 = $i>5 ? "__smulq_767x63"
|
||||
: "__smulq_383x63";
|
||||
$code.=<<___;
|
||||
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
|
||||
mov \$62, $cnt
|
||||
call __ab_approximation_62
|
||||
#mov $f0, 8*7(%rsp)
|
||||
#mov $g0, 8*8(%rsp)
|
||||
mov $f1, 8*9(%rsp)
|
||||
mov $g1, 8*10(%rsp)
|
||||
|
||||
mov \$256, $out_ptr
|
||||
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
|
||||
call __smulq_383_n_shift_by_62
|
||||
mov $f0, 8*7(%rsp) # corrected |f0|
|
||||
mov $g0, 8*8(%rsp) # corrected |g0|
|
||||
|
||||
mov 8*9(%rsp), $f0 # |f1|
|
||||
mov 8*10(%rsp), $g0 # |g1|
|
||||
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
|
||||
call __smulq_383_n_shift_by_62
|
||||
mov $f0, 8*9(%rsp) # corrected |f1|
|
||||
mov $g0, 8*10(%rsp) # corrected |g1|
|
||||
|
||||
mov 8*7(%rsp), $f0 # |f0|
|
||||
mov 8*8(%rsp), $g0 # |g0|
|
||||
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
|
||||
lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
|
||||
call __smulq_383x63
|
||||
|
||||
mov 8*9(%rsp), $f0 # |f1|
|
||||
mov 8*10(%rsp), $g0 # |g1|
|
||||
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
|
||||
call $smul_767x63
|
||||
___
|
||||
$code.=<<___ if ($i==5);
|
||||
sar \$63, @acc[5] # sign extension
|
||||
mov @acc[5], 8*6($out_ptr)
|
||||
mov @acc[5], 8*7($out_ptr)
|
||||
mov @acc[5], 8*8($out_ptr)
|
||||
mov @acc[5], 8*9($out_ptr)
|
||||
mov @acc[5], 8*10($out_ptr)
|
||||
mov @acc[5], 8*11($out_ptr)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
################################# iteration before last
|
||||
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
|
||||
mov \$62, $cnt
|
||||
#call __ab_approximation_62 # |a| and |b| are exact, just load
|
||||
mov 8*0($in_ptr), @acc[0] # |a_lo|
|
||||
mov 8*1($in_ptr), @acc[1] # |a_hi|
|
||||
mov 8*6($in_ptr), @acc[2] # |b_lo|
|
||||
mov 8*7($in_ptr), @acc[3] # |b_hi|
|
||||
call __inner_loop_62
|
||||
#mov $f0, 8*7(%rsp)
|
||||
#mov $g0, 8*8(%rsp)
|
||||
mov $f1, 8*9(%rsp)
|
||||
mov $g1, 8*10(%rsp)
|
||||
|
||||
mov \$256, $out_ptr
|
||||
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
|
||||
mov @acc[0], 8*0($out_ptr)
|
||||
mov @acc[2], 8*6($out_ptr)
|
||||
|
||||
#mov 8*7(%rsp), $f0 # |f0|
|
||||
#mov 8*8(%rsp), $g0 # |g0|
|
||||
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
|
||||
lea 8*12($out_ptr),$out_ptr # pointer to destination |u|
|
||||
call __smulq_383x63
|
||||
|
||||
mov 8*9(%rsp), $f0 # |f1|
|
||||
mov 8*10(%rsp), $g0 # |g1|
|
||||
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
|
||||
call __smulq_767x63
|
||||
|
||||
################################# last iteration
|
||||
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
|
||||
mov \$22, $cnt # 766 % 62
|
||||
#call __ab_approximation_62 # |a| and |b| are exact, just load
|
||||
mov 8*0($in_ptr), @acc[0] # |a_lo|
|
||||
xor @acc[1], @acc[1] # |a_hi|
|
||||
mov 8*6($in_ptr), @acc[2] # |b_lo|
|
||||
xor @acc[3], @acc[3] # |b_hi|
|
||||
call __inner_loop_62
|
||||
#mov $f0, 8*7(%rsp)
|
||||
#mov $g0, 8*8(%rsp)
|
||||
#mov $f1, 8*9(%rsp)
|
||||
#mov $g1, 8*10(%rsp)
|
||||
|
||||
#mov 8*7(%rsp), $f0 # |f0|
|
||||
#mov 8*8(%rsp), $g0 # |g0|
|
||||
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
|
||||
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
|
||||
#call __smulq_383x63
|
||||
|
||||
#mov 8*9(%rsp), $f0 # |f1|
|
||||
#mov 8*10(%rsp), $g0 # |g1|
|
||||
mov $f1, $f0
|
||||
mov $g1, $g0
|
||||
mov 8*4(%rsp), $out_ptr # original out_ptr
|
||||
call __smulq_767x63
|
||||
|
||||
mov 8*5(%rsp), $in_ptr # original n_ptr
|
||||
mov %rax, %rdx # top limb of the result
|
||||
sar \$63, %rax # result's sign as mask
|
||||
|
||||
mov %rax, @acc[0] # mask |modulus|
|
||||
mov %rax, @acc[1]
|
||||
mov %rax, @acc[2]
|
||||
and 8*0($in_ptr), @acc[0]
|
||||
and 8*1($in_ptr), @acc[1]
|
||||
mov %rax, @acc[3]
|
||||
and 8*2($in_ptr), @acc[2]
|
||||
and 8*3($in_ptr), @acc[3]
|
||||
mov %rax, @acc[4]
|
||||
and 8*4($in_ptr), @acc[4]
|
||||
and 8*5($in_ptr), %rax
|
||||
|
||||
add @acc[0], @acc[6] # conditionally add |modulus|<<384
|
||||
adc @acc[1], @acc[7]
|
||||
adc @acc[2], @acc[8]
|
||||
adc @acc[3], @acc[9]
|
||||
adc @acc[4], %rcx
|
||||
adc %rax, %rdx
|
||||
|
||||
mov @acc[6], 8*6($out_ptr) # store absolute value
|
||||
mov @acc[7], 8*7($out_ptr)
|
||||
mov @acc[8], 8*8($out_ptr)
|
||||
mov @acc[9], 8*9($out_ptr)
|
||||
mov %rcx, 8*10($out_ptr)
|
||||
mov %rdx, 8*11($out_ptr)
|
||||
|
||||
lea $frame(%rsp), %r8 # size optimization
|
||||
mov 8*0(%r8),%r15
|
||||
.cfi_restore %r15
|
||||
mov 8*1(%r8),%r14
|
||||
.cfi_restore %r14
|
||||
mov 8*2(%r8),%r13
|
||||
.cfi_restore %r13
|
||||
mov 8*3(%r8),%r12
|
||||
.cfi_restore %r12
|
||||
mov 8*4(%r8),%rbx
|
||||
.cfi_restore %rbx
|
||||
mov 8*5(%r8),%rbp
|
||||
.cfi_restore %rbp
|
||||
lea 8*6(%r8),%rsp
|
||||
.cfi_adjust_cfa_offset -$frame-8*6
|
||||
.cfi_epilogue
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size ct_inverse_mod_383,.-ct_inverse_mod_383
|
||||
___
|
||||
########################################################################
|
||||
# see corresponding commentary in ctx_inverse_mod_384-x86_64...
|
||||
{
|
||||
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
|
||||
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
|
||||
my $fx = @acc[9];
|
||||
|
||||
$code.=<<___;
|
||||
.type __smulq_767x63,\@abi-omnipotent
|
||||
.align 32
|
||||
__smulq_767x63:
|
||||
mov 8*0($in_ptr), @acc[0] # load |u|
|
||||
mov 8*1($in_ptr), @acc[1]
|
||||
mov 8*2($in_ptr), @acc[2]
|
||||
mov 8*3($in_ptr), @acc[3]
|
||||
mov 8*4($in_ptr), @acc[4]
|
||||
mov 8*5($in_ptr), @acc[5]
|
||||
|
||||
mov $f0, $fx
|
||||
sar \$63, $f0 # |f0|'s sign as mask
|
||||
xor %rax, %rax
|
||||
sub $f0, %rax # |f0|'s sign as bit
|
||||
|
||||
mov $out_ptr, 8*1(%rsp)
|
||||
mov $in_ptr, 8*2(%rsp)
|
||||
lea 8*6($in_ptr), $in_ptr # pointer to |v|
|
||||
|
||||
xor $f0, $fx # conditionally negate |f0|
|
||||
add %rax, $fx
|
||||
|
||||
xor $f0, @acc[0] # conditionally negate |u|
|
||||
xor $f0, @acc[1]
|
||||
xor $f0, @acc[2]
|
||||
xor $f0, @acc[3]
|
||||
xor $f0, @acc[4]
|
||||
xor $f0, @acc[5]
|
||||
add @acc[0], %rax
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
|
||||
mulq $fx # |u|*|f0|
|
||||
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
|
||||
mov @acc[1], %rax
|
||||
mov %rdx, @acc[1]
|
||||
___
|
||||
for($i=1; $i<5; $i++) {
|
||||
$code.=<<___;
|
||||
mulq $fx
|
||||
add %rax, @acc[$i]
|
||||
mov @acc[$i+1], %rax
|
||||
adc \$0, %rdx
|
||||
mov %rdx, @acc[$i+1]
|
||||
mov @acc[$i], 8*$i($out_ptr)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
imulq $fx
|
||||
add %rax, @acc[$i]
|
||||
adc \$0, %rdx
|
||||
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
mov %rdx, 8*6($out_ptr)
|
||||
sar \$63, %rdx # sign extension
|
||||
mov %rdx, 8*7($out_ptr)
|
||||
___
|
||||
{
|
||||
my $fx=$in_ptr;
|
||||
$code.=<<___;
|
||||
mov $g0, $f0 # load |g0|
|
||||
|
||||
mov 8*0($in_ptr), @acc[0] # load |v|
|
||||
mov 8*1($in_ptr), @acc[1]
|
||||
mov 8*2($in_ptr), @acc[2]
|
||||
mov 8*3($in_ptr), @acc[3]
|
||||
mov 8*4($in_ptr), @acc[4]
|
||||
mov 8*5($in_ptr), @acc[5]
|
||||
mov 8*6($in_ptr), @acc[6]
|
||||
mov 8*7($in_ptr), @acc[7]
|
||||
mov 8*8($in_ptr), @acc[8]
|
||||
mov 8*9($in_ptr), @acc[9]
|
||||
mov 8*10($in_ptr), @acc[10]
|
||||
mov 8*11($in_ptr), @acc[11]
|
||||
|
||||
mov $f0, $fx # overrides in_ptr
|
||||
sar \$63, $f0 # |g0|'s sign as mask
|
||||
xor %rax, %rax
|
||||
sub $f0, %rax # |g0|'s sign as bit
|
||||
|
||||
xor $f0, $fx # conditionally negate |g0|
|
||||
add %rax, $fx
|
||||
|
||||
xor $f0, @acc[0] # conditionally negate |v|
|
||||
xor $f0, @acc[1]
|
||||
xor $f0, @acc[2]
|
||||
xor $f0, @acc[3]
|
||||
xor $f0, @acc[4]
|
||||
xor $f0, @acc[5]
|
||||
xor $f0, @acc[6]
|
||||
xor $f0, @acc[7]
|
||||
xor $f0, @acc[8]
|
||||
xor $f0, @acc[9]
|
||||
xor $f0, @acc[10]
|
||||
xor $f0, @acc[11]
|
||||
add @acc[0], %rax
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
adc \$0, @acc[6]
|
||||
adc \$0, @acc[7]
|
||||
adc \$0, @acc[8]
|
||||
adc \$0, @acc[9]
|
||||
adc \$0, @acc[10]
|
||||
adc \$0, @acc[11]
|
||||
|
||||
mulq $fx # |v|*|g0|
|
||||
mov %rax, @acc[0]
|
||||
mov @acc[1], %rax
|
||||
mov %rdx, @acc[1]
|
||||
___
|
||||
for($i=1; $i<11; $i++) {
|
||||
$code.=<<___;
|
||||
mulq $fx
|
||||
add %rax, @acc[$i]
|
||||
mov @acc[$i+1], %rax
|
||||
adc \$0, %rdx
|
||||
mov %rdx, @acc[$i+1]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
mov 8*1(%rsp), %rdx # out_ptr
|
||||
imulq $fx, %rax
|
||||
mov 8*2(%rsp), $in_ptr # restore original in_ptr
|
||||
add @acc[11], %rax
|
||||
|
||||
add 8*0(%rdx), @acc[0] # accumulate |u|*|f0|
|
||||
adc 8*1(%rdx), @acc[1]
|
||||
adc 8*2(%rdx), @acc[2]
|
||||
adc 8*3(%rdx), @acc[3]
|
||||
adc 8*4(%rdx), @acc[4]
|
||||
adc 8*5(%rdx), @acc[5]
|
||||
adc 8*6(%rdx), @acc[6]
|
||||
mov 8*7(%rdx), @acc[11] # sign extension
|
||||
adc @acc[11], @acc[7]
|
||||
adc @acc[11], @acc[8]
|
||||
adc @acc[11], @acc[9]
|
||||
adc @acc[11], @acc[10]
|
||||
adc @acc[11], %rax
|
||||
|
||||
mov %rdx, $out_ptr # restore original out_ptr
|
||||
|
||||
mov @acc[0], 8*0(%rdx)
|
||||
mov @acc[1], 8*1(%rdx)
|
||||
mov @acc[2], 8*2(%rdx)
|
||||
mov @acc[3], 8*3(%rdx)
|
||||
mov @acc[4], 8*4(%rdx)
|
||||
mov @acc[5], 8*5(%rdx)
|
||||
mov @acc[6], 8*6(%rdx)
|
||||
mov @acc[7], 8*7(%rdx)
|
||||
mov @acc[8], 8*8(%rdx)
|
||||
mov @acc[9], 8*9(%rdx)
|
||||
mov @acc[10], 8*10(%rdx)
|
||||
mov %rax, 8*11(%rdx)
|
||||
|
||||
ret
|
||||
.size __smulq_767x63,.-__smulq_767x63
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.type __smulq_383x63,\@abi-omnipotent
|
||||
.align 32
|
||||
__smulq_383x63:
|
||||
___
|
||||
for($j=0; $j<2; $j++) {
|
||||
$code.=<<___;
|
||||
mov 8*0($in_ptr), @acc[0] # load |u| (or |v|)
|
||||
mov 8*1($in_ptr), @acc[1]
|
||||
mov 8*2($in_ptr), @acc[2]
|
||||
mov 8*3($in_ptr), @acc[3]
|
||||
mov 8*4($in_ptr), @acc[4]
|
||||
mov 8*5($in_ptr), @acc[5]
|
||||
|
||||
mov %rdx, $fx
|
||||
sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
|
||||
xor %rax, %rax
|
||||
sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)
|
||||
|
||||
xor %rdx, $fx # conditionally negate |f0|
|
||||
add %rax, $fx
|
||||
|
||||
xor %rdx, @acc[0] # conditionally negate |u| (or |v|)
|
||||
xor %rdx, @acc[1]
|
||||
xor %rdx, @acc[2]
|
||||
xor %rdx, @acc[3]
|
||||
xor %rdx, @acc[4]
|
||||
xor %rdx, @acc[5]
|
||||
add @acc[0], %rax
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
|
||||
mulq $fx # |u|*|f0| (or |v|*|g0|)
|
||||
mov %rax, @acc[0]
|
||||
mov @acc[1], %rax
|
||||
mov %rdx, @acc[1]
|
||||
___
|
||||
for($i=1; $i<5; $i++) {
|
||||
$code.=<<___;
|
||||
mulq $fx
|
||||
add %rax, @acc[$i]
|
||||
mov @acc[$i+1], %rax
|
||||
adc \$0, %rdx
|
||||
mov %rdx, @acc[$i+1]
|
||||
___
|
||||
}
|
||||
$code.=<<___ if ($j==0);
|
||||
imulq $fx, %rax
|
||||
add %rax, @acc[$i]
|
||||
|
||||
lea 8*6($in_ptr), $in_ptr # pointer to |v|
|
||||
mov $g0, %rdx
|
||||
|
||||
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
|
||||
mov @acc[1], 8*1($out_ptr)
|
||||
mov @acc[2], 8*2($out_ptr)
|
||||
mov @acc[3], 8*3($out_ptr)
|
||||
mov @acc[4], 8*4($out_ptr)
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
imulq $fx, %rax
|
||||
add %rax, @acc[$i]
|
||||
|
||||
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
|
||||
|
||||
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
|
||||
adc 8*1($out_ptr), @acc[1]
|
||||
adc 8*2($out_ptr), @acc[2]
|
||||
adc 8*3($out_ptr), @acc[3]
|
||||
adc 8*4($out_ptr), @acc[4]
|
||||
adc 8*5($out_ptr), @acc[5]
|
||||
|
||||
mov @acc[0], 8*0($out_ptr)
|
||||
mov @acc[1], 8*1($out_ptr)
|
||||
mov @acc[2], 8*2($out_ptr)
|
||||
mov @acc[3], 8*3($out_ptr)
|
||||
mov @acc[4], 8*4($out_ptr)
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
|
||||
ret
|
||||
.size __smulq_383x63,.-__smulq_383x63
|
||||
___
|
||||
{
|
||||
$code.=<<___;
|
||||
.type __smulq_383_n_shift_by_62,\@abi-omnipotent
|
||||
.align 32
|
||||
__smulq_383_n_shift_by_62:
|
||||
mov $f0, @acc[8]
|
||||
___
|
||||
my $f0 = @acc[8];
|
||||
for($j=0; $j<2; $j++) {
|
||||
$code.=<<___;
|
||||
mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
|
||||
mov 8*1($in_ptr), @acc[1]
|
||||
mov 8*2($in_ptr), @acc[2]
|
||||
mov 8*3($in_ptr), @acc[3]
|
||||
mov 8*4($in_ptr), @acc[4]
|
||||
mov 8*5($in_ptr), @acc[5]
|
||||
|
||||
mov %rdx, $fx
|
||||
sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
|
||||
xor %rax, %rax
|
||||
sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)
|
||||
|
||||
xor %rdx, $fx # conditionally negate |f0| (or |g0|)
|
||||
add %rax, $fx
|
||||
|
||||
xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
|
||||
xor %rdx, @acc[1]
|
||||
xor %rdx, @acc[2]
|
||||
xor %rdx, @acc[3]
|
||||
xor %rdx, @acc[4]
|
||||
xor %rdx, @acc[5]
|
||||
add @acc[0], %rax
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
|
||||
mulq $fx # |a|*|f0| (or |b|*|g0|)
|
||||
mov %rax, @acc[0]
|
||||
mov @acc[1], %rax
|
||||
mov %rdx, @acc[1]
|
||||
___
|
||||
for($i=1; $i<5; $i++) {
|
||||
$code.=<<___;
|
||||
mulq $fx
|
||||
add %rax, @acc[$i]
|
||||
mov @acc[$i+1], %rax
|
||||
adc \$0, %rdx
|
||||
mov %rdx, @acc[$i+1]
|
||||
___
|
||||
}
|
||||
$code.=<<___ if ($j==0);
|
||||
imulq $fx
|
||||
add %rax, @acc[$i]
|
||||
adc \$0, %rdx
|
||||
|
||||
lea 8*6($in_ptr), $in_ptr # pointer to |b|
|
||||
mov %rdx, @acc[6]
|
||||
mov $g0, %rdx
|
||||
|
||||
mov @acc[0], 8*0($out_ptr)
|
||||
mov @acc[1], 8*1($out_ptr)
|
||||
mov @acc[2], 8*2($out_ptr)
|
||||
mov @acc[3], 8*3($out_ptr)
|
||||
mov @acc[4], 8*4($out_ptr)
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
imulq $fx
|
||||
add %rax, @acc[$i]
|
||||
adc \$0, %rdx
|
||||
|
||||
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
|
||||
|
||||
add 8*0($out_ptr), @acc[0]
|
||||
adc 8*1($out_ptr), @acc[1]
|
||||
adc 8*2($out_ptr), @acc[2]
|
||||
adc 8*3($out_ptr), @acc[3]
|
||||
adc 8*4($out_ptr), @acc[4]
|
||||
adc 8*5($out_ptr), @acc[5]
|
||||
adc %rdx, @acc[6]
|
||||
mov $f0, %rdx
|
||||
|
||||
shrd \$62, @acc[1], @acc[0]
|
||||
shrd \$62, @acc[2], @acc[1]
|
||||
shrd \$62, @acc[3], @acc[2]
|
||||
shrd \$62, @acc[4], @acc[3]
|
||||
shrd \$62, @acc[5], @acc[4]
|
||||
shrd \$62, @acc[6], @acc[5]
|
||||
|
||||
sar \$63, @acc[6] # sign as mask
|
||||
xor $fx, $fx
|
||||
sub @acc[6], $fx # sign as bit
|
||||
|
||||
xor @acc[6], @acc[0] # conditionally negate the result
|
||||
xor @acc[6], @acc[1]
|
||||
xor @acc[6], @acc[2]
|
||||
xor @acc[6], @acc[3]
|
||||
xor @acc[6], @acc[4]
|
||||
xor @acc[6], @acc[5]
|
||||
add $fx, @acc[0]
|
||||
adc \$0, @acc[1]
|
||||
adc \$0, @acc[2]
|
||||
adc \$0, @acc[3]
|
||||
adc \$0, @acc[4]
|
||||
adc \$0, @acc[5]
|
||||
|
||||
mov @acc[0], 8*0($out_ptr)
|
||||
mov @acc[1], 8*1($out_ptr)
|
||||
mov @acc[2], 8*2($out_ptr)
|
||||
mov @acc[3], 8*3($out_ptr)
|
||||
mov @acc[4], 8*4($out_ptr)
|
||||
mov @acc[5], 8*5($out_ptr)
|
||||
|
||||
xor @acc[6], %rdx # conditionally negate |f0|
|
||||
xor @acc[6], $g0 # conditionally negate |g0|
|
||||
add $fx, %rdx
|
||||
add $fx, $g0
|
||||
|
||||
ret
|
||||
.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62
|
||||
___
|
||||
} }
|
||||
|
||||
{
|
||||
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
|
||||
my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi");
|
||||
{
|
||||
my @a = ($a_lo, $t1, $a_hi);
|
||||
my @b = ($b_lo, $t2, $b_hi);
|
||||
|
||||
$code.=<<___;
|
||||
.type __ab_approximation_62,\@abi-omnipotent
|
||||
.align 32
|
||||
__ab_approximation_62:
|
||||
mov 8*5($in_ptr), @a[2] # load |a| in reverse order
|
||||
mov 8*11($in_ptr), @b[2] # load |b| in reverse order
|
||||
mov 8*4($in_ptr), @a[1]
|
||||
mov 8*10($in_ptr), @b[1]
|
||||
mov 8*3($in_ptr), @a[0]
|
||||
mov 8*9($in_ptr), @b[0]
|
||||
|
||||
mov @a[2], $t0
|
||||
or @b[2], $t0 # check top-most limbs, ...
|
||||
cmovz @a[1], @a[2]
|
||||
cmovz @b[1], @b[2]
|
||||
cmovz @a[0], @a[1]
|
||||
cmovz @b[0], @b[1]
|
||||
mov 8*2($in_ptr), @a[0]
|
||||
mov 8*8($in_ptr), @b[0]
|
||||
|
||||
mov @a[2], $t0
|
||||
or @b[2], $t0 # ... ones before top-most, ...
|
||||
cmovz @a[1], @a[2]
|
||||
cmovz @b[1], @b[2]
|
||||
cmovz @a[0], @a[1]
|
||||
cmovz @b[0], @b[1]
|
||||
mov 8*1($in_ptr), @a[0]
|
||||
mov 8*7($in_ptr), @b[0]
|
||||
|
||||
mov @a[2], $t0
|
||||
or @b[2], $t0 # ... and ones before that ...
|
||||
cmovz @a[1], @a[2]
|
||||
cmovz @b[1], @b[2]
|
||||
cmovz @a[0], @a[1]
|
||||
cmovz @b[0], @b[1]
|
||||
mov 8*0($in_ptr), @a[0]
|
||||
mov 8*6($in_ptr), @b[0]
|
||||
|
||||
mov @a[2], $t0
|
||||
or @b[2], $t0
|
||||
bsr $t0, %rcx
|
||||
lea 1(%rcx), %rcx
|
||||
cmovz @a[1], @a[2]
|
||||
cmovz @b[1], @b[2]
|
||||
cmovz $t0, %rcx
|
||||
neg %rcx
|
||||
#and \$63, %rcx # debugging artefact
|
||||
|
||||
shldq %cl, @a[1], @a[2] # align second limb to the left
|
||||
shldq %cl, @b[1], @b[2]
|
||||
|
||||
jmp __inner_loop_62
|
||||
|
||||
ret
|
||||
.size __ab_approximation_62,.-__ab_approximation_62
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.type __inner_loop_62,\@abi-omnipotent
|
||||
.align 8
|
||||
.long 0
|
||||
__inner_loop_62:
|
||||
mov \$1, $f0 # |f0|=1
|
||||
xor $g0, $g0 # |g0|=0
|
||||
xor $f1, $f1 # |f1|=0
|
||||
mov \$1, $g1 # |g1|=1
|
||||
mov $in_ptr, 8(%rsp)
|
||||
|
||||
.Loop_62:
|
||||
xor $t0, $t0
|
||||
xor $t1, $t1
|
||||
test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
|
||||
mov $b_lo, $t2
|
||||
mov $b_hi, $t3
|
||||
cmovnz $b_lo, $t0
|
||||
cmovnz $b_hi, $t1
|
||||
sub $a_lo, $t2 # |b_|-|a_|
|
||||
sbb $a_hi, $t3
|
||||
mov $a_lo, $t4
|
||||
mov $a_hi, $t5
|
||||
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
|
||||
sbb $t1, $a_hi
|
||||
cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
|
||||
cmovc $t3, $a_hi
|
||||
cmovc $t4, $b_lo # |b_| = |a_|
|
||||
cmovc $t5, $b_hi
|
||||
mov $f0, $t0 # exchange |f0| and |f1|
|
||||
cmovc $f1, $f0
|
||||
cmovc $t0, $f1
|
||||
mov $g0, $t1 # exchange |g0| and |g1|
|
||||
cmovc $g1, $g0
|
||||
cmovc $t1, $g1
|
||||
xor $t0, $t0
|
||||
xor $t1, $t1
|
||||
shrd \$1, $a_hi, $a_lo
|
||||
shr \$1, $a_hi
|
||||
test \$1, $t4 # if |a_| was odd, then we'll be subtracting...
|
||||
cmovnz $f1, $t0
|
||||
cmovnz $g1, $t1
|
||||
add $f1, $f1 # |f1|<<=1
|
||||
add $g1, $g1 # |g1|<<=1
|
||||
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
|
||||
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
|
||||
sub \$1, $cnt
|
||||
jnz .Loop_62
|
||||
|
||||
mov 8(%rsp), $in_ptr
|
||||
ret
|
||||
.size __inner_loop_62,.-__inner_loop_62
|
||||
___
|
||||
}
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
995
blst/asm/ctx_inverse_mod_384-x86_64.pl
Executable file
995
blst/asm/ctx_inverse_mod_384-x86_64.pl
Executable file
|
@ -0,0 +1,995 @@
|
|||
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >4x better than
# modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 31
    mask = (1 << k) - 1

    for i in range(0, 766 // k):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)

        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smulx_383_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smulx_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    if v < 0:
        v += mod << (768 - mod.bit_length())    # left aligned

    return v & (2**768 - 1) # to be reduced % mod
___

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edi";

$frame = 8*11+2*512;

$code.=<<___;
.text

.globl	ctx_inverse_mod_383
.type	ctx_inverse_mod_383,\@function,4,"unwind"
.align	32
ctx_inverse_mod_383:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$$frame, %rsp
.cfi_adjust_cfa_offset	$frame
.cfi_end_prologue

	lea	8*11+511(%rsp), %rax	# find closest 512-byte-aligned spot
	and	\$-512, %rax		# in the frame...
	mov	$out_ptr, 8*4(%rsp)
	mov	$nx_ptr, 8*5(%rsp)

	mov	8*0($in_ptr), @acc[0]	# load input
	mov	8*1($in_ptr), @acc[1]
	mov	8*2($in_ptr), @acc[2]
	mov	8*3($in_ptr), @acc[3]
	mov	8*4($in_ptr), @acc[4]
	mov	8*5($in_ptr), @acc[5]

	mov	8*0($n_ptr), @acc[6]	# load modulus
	mov	8*1($n_ptr), @acc[7]
	mov	8*2($n_ptr), @acc[8]
	mov	8*3($n_ptr), @acc[9]
	mov	8*4($n_ptr), @acc[10]
	mov	8*5($n_ptr), @acc[11]

	mov	@acc[0], 8*0(%rax)	# copy input to |a|
	mov	@acc[1], 8*1(%rax)
	mov	@acc[2], 8*2(%rax)
	mov	@acc[3], 8*3(%rax)
	mov	@acc[4], 8*4(%rax)
	mov	@acc[5], 8*5(%rax)

	mov	@acc[6], 8*6(%rax)	# copy modulus to |b|
	mov	@acc[7], 8*7(%rax)
	mov	@acc[8], 8*8(%rax)
	mov	@acc[9], 8*9(%rax)
	mov	@acc[10], 8*10(%rax)
	mov	%rax, $in_ptr
	mov	@acc[11], 8*11(%rax)

	################################# first iteration
	mov	\$31, $cnt
	call	__ab_approximation_31
	#mov	$f0, 8*7(%rsp)
	#mov	$g0, 8*8(%rsp)
	mov	$f1, 8*9(%rsp)
	mov	$g1, 8*10(%rsp)

	mov	\$256, $out_ptr
	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
	call	__smulx_383_n_shift_by_31
	#mov	$f0, 8*7(%rsp)	# corrected |f0|
	#mov	$g0, 8*8(%rsp)	# corrected |g0|
	mov	$f0, 8*12($out_ptr)	# initialize |u| with |f0|

	mov	8*9(%rsp), $f0	# |f1|
	mov	8*10(%rsp), $g0	# |g1|
	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
	call	__smulx_383_n_shift_by_31
	#mov	$f0, 8*9(%rsp)	# corrected |f1|
	#mov	$g0, 8*10(%rsp)	# corrected |g1|
	mov	$f0, 8*12($out_ptr)	# initialize |v| with |f1|

	################################# second iteration
	xor	\$256, $in_ptr		# flip-flop pointer to source |a|b|u|v|
	mov	\$31, $cnt
	call	__ab_approximation_31
	#mov	$f0, 8*7(%rsp)
	#mov	$g0, 8*8(%rsp)
	mov	$f1, 8*9(%rsp)
	mov	$g1, 8*10(%rsp)

	mov	\$256, $out_ptr
	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
	call	__smulx_383_n_shift_by_31
	mov	$f0, 8*7(%rsp)		# corrected |f0|
	mov	$g0, 8*8(%rsp)		# corrected |g0|

	mov	8*9(%rsp), $f0		# |f1|
	mov	8*10(%rsp), $g0		# |g1|
	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
	call	__smulx_383_n_shift_by_31
	#mov	$f0, 8*9(%rsp)	# corrected |f1|
	#mov	$g0, 8*10(%rsp)	# corrected |g1|

	mov	8*12($in_ptr), %rax	# |u|
	mov	8*18($in_ptr), @acc[3]	# |v|
	mov	$f0, %rbx
	mov	%rax, @acc[2]
	imulq	8*7(%rsp)		# |u|*|f0|
	mov	%rax, @acc[0]
	mov	@acc[3], %rax
	mov	%rdx, @acc[1]
	imulq	8*8(%rsp)		# |v|*|g0|
	add	%rax, @acc[0]
	adc	%rdx, @acc[1]
	mov	@acc[0], 8*6($out_ptr)	# destination |u|
	mov	@acc[1], 8*7($out_ptr)
	sar	\$63, @acc[1]		# sign extension
	mov	@acc[1], 8*8($out_ptr)
	mov	@acc[1], 8*9($out_ptr)
	mov	@acc[1], 8*10($out_ptr)
	mov	@acc[1], 8*11($out_ptr)
	lea	8*12($in_ptr), $in_ptr	# make in_ptr "rewindable" with xor

	mov	@acc[2], %rax
	imulq	%rbx			# |u|*|f1|
	mov	%rax, @acc[0]
	mov	@acc[3], %rax
	mov	%rdx, @acc[1]
	imulq	%rcx			# |v|*|g1|
	add	%rax, @acc[0]
	adc	%rdx, @acc[1]
	mov	@acc[0], 8*12($out_ptr)	# destination |v|
	mov	@acc[1], 8*13($out_ptr)
	sar	\$63, @acc[1]		# sign extension
	mov	@acc[1], 8*14($out_ptr)
	mov	@acc[1], 8*15($out_ptr)
	mov	@acc[1], 8*16($out_ptr)
	mov	@acc[1], 8*17($out_ptr)
___
for($i=2; $i<23; $i++) {
my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31"
                         : "__smulx_191_n_shift_by_31";
my $smul_767x63  = $i>11 ? "__smulx_767x63"
                         : "__smulx_383x63";
$code.=<<___;
	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
	mov	\$31, $cnt
	call	__ab_approximation_31
	#mov	$f0, 8*7(%rsp)
	#mov	$g0, 8*8(%rsp)
	mov	$f1, 8*9(%rsp)
	mov	$g1, 8*10(%rsp)

	mov	\$256, $out_ptr
	xor	$in_ptr, $out_ptr	# pointer to destination |a|b|u|v|
	call	$smul_n_shift
	mov	$f0, 8*7(%rsp)		# corrected |f0|
	mov	$g0, 8*8(%rsp)		# corrected |g0|

	mov	8*9(%rsp), $f0		# |f1|
	mov	8*10(%rsp), $g0		# |g1|
	lea	8*6($out_ptr), $out_ptr	# pointer to destination |b|
	call	$smul_n_shift
	mov	$f0, 8*9(%rsp)		# corrected |f1|
	mov	$g0, 8*10(%rsp)		# corrected |g1|

	mov	8*7(%rsp), $f0		# |f0|
	mov	8*8(%rsp), $g0		# |g0|
	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
	lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
	call	__smulx_383x63

	mov	8*9(%rsp), $f0		# |f1|
	mov	8*10(%rsp), $g0		# |g1|
	lea	8*6($out_ptr),$out_ptr	# pointer to destination |v|
	call	$smul_767x63
___
$code.=<<___	if ($i==11);
	sar	\$63, @acc[5]		# sign extension
	mov	@acc[5], 8*6($out_ptr)
	mov	@acc[5], 8*7($out_ptr)
	mov	@acc[5], 8*8($out_ptr)
	mov	@acc[5], 8*9($out_ptr)
	mov	@acc[5], 8*10($out_ptr)
	mov	@acc[5], 8*11($out_ptr)
___
}
$code.=<<___;
	################################# two[!] last iterations in one go
	xor	\$256+8*12, $in_ptr	# flip-flop pointer to source |a|b|u|v|
	mov	\$53, $cnt		# 31 + 766 % 31
	#call	__ab_approximation_31	# |a| and |b| are exact, just load
	mov	8*0($in_ptr), @acc[0]	# |a_lo|
	#xor	@acc[1], @acc[1]	# |a_hi|
	mov	8*6($in_ptr), @acc[2]	# |b_lo|
	#xor	@acc[3], @acc[3]	# |b_hi|
	call	__inner_loop_62
	#mov	$f0, 8*7(%rsp)
	#mov	$g0, 8*8(%rsp)
	#mov	$f1, 8*9(%rsp)
	#mov	$g1, 8*10(%rsp)

	#mov	8*7(%rsp), $f0		# |f0|
	#mov	8*8(%rsp), $g0		# |g0|
	lea	8*12($in_ptr), $in_ptr	# pointer to source |u|v|
	#lea	8*6($out_ptr), $out_ptr	# pointer to destination |u|
	#call	__smulx_383x63

	#mov	8*9(%rsp), $f0		# |f1|
	#mov	8*10(%rsp), $g0		# |g1|
	mov	$f1, $f0
	mov	$g1, $g0
	mov	8*4(%rsp), $out_ptr	# original out_ptr
	call	__smulx_767x63

	mov	8*5(%rsp), $in_ptr	# original n_ptr
	mov	%rax, %rdx		# top limb of the result
	sar	\$63, %rax		# result's sign as mask

	mov	%rax, @acc[0]		# mask |modulus|
	mov	%rax, @acc[1]
	mov	%rax, @acc[2]
	and	8*0($in_ptr), @acc[0]
	and	8*1($in_ptr), @acc[1]
	mov	%rax, @acc[3]
	and	8*2($in_ptr), @acc[2]
	and	8*3($in_ptr), @acc[3]
	mov	%rax, @acc[4]
	and	8*4($in_ptr), @acc[4]
	and	8*5($in_ptr), %rax

	add	@acc[0], @acc[6]	# conditionally add |modulus|<<384
	adc	@acc[1], @acc[7]
	adc	@acc[2], @acc[8]
	adc	@acc[3], @acc[9]
	adc	@acc[4], %rcx
	adc	%rax, %rdx

	mov	@acc[6], 8*6($out_ptr)	# store absolute value
	mov	@acc[7], 8*7($out_ptr)
	mov	@acc[8], 8*8($out_ptr)
	mov	@acc[9], 8*9($out_ptr)
	mov	%rcx, 8*10($out_ptr)
	mov	%rdx, 8*11($out_ptr)

	lea	$frame(%rsp), %r8	# size optimization
	mov	8*0(%r8),%r15
.cfi_restore	%r15
	mov	8*1(%r8),%r14
.cfi_restore	%r14
	mov	8*2(%r8),%r13
.cfi_restore	%r13
	mov	8*3(%r8),%r12
.cfi_restore	%r12
	mov	8*4(%r8),%rbx
.cfi_restore	%rbx
	mov	8*5(%r8),%rbp
.cfi_restore	%rbp
	lea	8*6(%r8),%rsp
.cfi_adjust_cfa_offset	-$frame-8*6
.cfi_epilogue
	ret
.cfi_endproc
.size	ctx_inverse_mod_383,.-ctx_inverse_mod_383
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
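
# A branchy Python sketch (illustrative only, in the spirit of $python_ref
# above; $python_sketch is a scratch string that is never printed) of what
# one signed NNNx63 step computes. The conditional negation below mirrors
# the xor/add pairs in the assembly: it makes each single-limb multiplicand
# non-negative without changing the product, so plain widening multiplies
# suffice and only the accumulator's top limb needs sign extension:
$python_sketch.=<<'___';
def smul_nx63(u, v, f0, g0):
    if f0 < 0:
        u, f0 = -u, -f0     # (-u)*(-f0) == u*f0, but now f0 >= 0
    if g0 < 0:
        v, g0 = -v, -g0
    return u*f0 + v*g0      # one limb wider than the inputs
___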
{
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
my $fx = @acc[9];

$code.=<<___;
.type	__smulx_767x63,\@abi-omnipotent
.align	32
__smulx_767x63:
	mov	8*0($in_ptr), @acc[0]	# load |u|
	mov	8*1($in_ptr), @acc[1]
	mov	8*2($in_ptr), @acc[2]
	mov	8*3($in_ptr), @acc[3]
	mov	8*4($in_ptr), @acc[4]
	mov	8*5($in_ptr), @acc[5]

	mov	$f0, %rax
	sar	\$63, %rax		# |f0|'s sign as mask
	xor	$fx, $fx		# overrides in_ptr
	sub	%rax, $fx		# |f0|'s sign as bit

	mov	$out_ptr, 8*1(%rsp)
	mov	$in_ptr, 8*2(%rsp)
	lea	8*6($in_ptr), $in_ptr	# pointer to |v|

	xor	%rax, $f0		# conditionally negate |f0|
	add	$fx, $f0

	xor	%rax, @acc[0]		# conditionally negate |u|
	xor	%rax, @acc[1]
	xor	%rax, @acc[2]
	xor	%rax, @acc[3]
	xor	%rax, @acc[4]
	xor	@acc[5], %rax
	add	$fx, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, %rax

	mulx	@acc[0], @acc[0], $fx	# |u|*|f0|
	mulx	@acc[1], @acc[1], @acc[5]
	add	$fx, @acc[1]
___
for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
$code.=<<___;
	mulx	@acc[$i], @acc[$i], $a
	adc	$b, @acc[$i]
___
    ($a, $b) = ($b, $a);
}
$code.=<<___;
	adc	\$0, $fx
	imulq	%rdx
	add	$fx, %rax
	adc	\$0, %rdx

	mov	@acc[0], 8*0($out_ptr)	# offload |u|*|f0|
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	%rax, 8*5($out_ptr)
	mov	%rdx, 8*6($out_ptr)
	sar	\$63, %rdx		# sign extension
	mov	%rdx, 8*7($out_ptr)
___
{
my $fx=$in_ptr;
$code.=<<___;
	mov	$g0, $f0		# load |g0|
	mov	$g0, %rax

	mov	8*0($in_ptr), @acc[0]	# load |v|
	mov	8*1($in_ptr), @acc[1]
	mov	8*2($in_ptr), @acc[2]
	mov	8*3($in_ptr), @acc[3]
	mov	8*4($in_ptr), @acc[4]
	mov	8*5($in_ptr), @acc[5]
	mov	8*6($in_ptr), @acc[6]
	mov	8*7($in_ptr), @acc[7]
	mov	8*8($in_ptr), @acc[8]
	mov	8*9($in_ptr), @acc[9]
	mov	8*10($in_ptr), @acc[10]
	mov	8*11($in_ptr), @acc[11]

	sar	\$63, %rax		# |g0|'s sign as mask
	xor	$fx, $fx		# overrides in_ptr
	sub	%rax, $fx		# |g0|'s sign as bit

	xor	%rax, $f0		# conditionally negate |g0|
	add	$fx, $f0

	xor	%rax, @acc[0]		# conditionally negate |v|
	xor	%rax, @acc[1]
	xor	%rax, @acc[2]
	xor	%rax, @acc[3]
	xor	%rax, @acc[4]
	xor	%rax, @acc[5]
	xor	%rax, @acc[6]
	xor	%rax, @acc[7]
	xor	%rax, @acc[8]
	xor	%rax, @acc[9]
	xor	%rax, @acc[10]
	xor	%rax, @acc[11]
	add	$fx, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, @acc[5]
	adc	\$0, @acc[6]
	adc	\$0, @acc[7]
	adc	\$0, @acc[8]
	adc	\$0, @acc[9]
	adc	\$0, @acc[10]
	adc	\$0, @acc[11]

	mulx	@acc[0], @acc[0], %rax	# |v|*|g0|
	mulx	@acc[1], @acc[1], $fx
	add	%rax, @acc[1]
___
for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) {
$code.=<<___;
	mulx	@acc[$i], @acc[$i], $a
	adc	$b, @acc[$i]
___
    ($a, $b) = ($b, $a);
}
$code.=<<___;
	mulx	@acc[11], @acc[11], $fx
	mov	8*1(%rsp), %rdx		# out_ptr
	mov	8*2(%rsp), $in_ptr	# restore original in_ptr
	adc	@acc[11], %rax

	add	8*0(%rdx), @acc[0]	# accumulate |u|*|f0|
	adc	8*1(%rdx), @acc[1]
	adc	8*2(%rdx), @acc[2]
	adc	8*3(%rdx), @acc[3]
	adc	8*4(%rdx), @acc[4]
	adc	8*5(%rdx), @acc[5]
	adc	8*6(%rdx), @acc[6]
	mov	8*7(%rdx), @acc[11]	# sign extension
	adc	@acc[11], @acc[7]
	adc	@acc[11], @acc[8]
	adc	@acc[11], @acc[9]
	adc	@acc[11], @acc[10]
	adc	@acc[11], %rax

	mov	%rdx, $out_ptr		# restore original out_ptr

	mov	@acc[0], 8*0(%rdx)
	mov	@acc[1], 8*1(%rdx)
	mov	@acc[2], 8*2(%rdx)
	mov	@acc[3], 8*3(%rdx)
	mov	@acc[4], 8*4(%rdx)
	mov	@acc[5], 8*5(%rdx)
	mov	@acc[6], 8*6(%rdx)
	mov	@acc[7], 8*7(%rdx)
	mov	@acc[8], 8*8(%rdx)
	mov	@acc[9], 8*9(%rdx)
	mov	@acc[10], 8*10(%rdx)
	mov	%rax, 8*11(%rdx)

	ret
.size	__smulx_767x63,.-__smulx_767x63
___
}
$code.=<<___;
.type	__smulx_383x63,\@abi-omnipotent
.align	32
__smulx_383x63:
___
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
$code.=<<___;
	mov	$k+8*0($in_ptr), @acc[0]	# load |u| (or |v|)
	mov	$k+8*1($in_ptr), @acc[1]
	mov	$k+8*2($in_ptr), @acc[2]
	mov	$k+8*3($in_ptr), @acc[3]
	mov	$k+8*4($in_ptr), @acc[4]
	mov	$k+8*5($in_ptr), @acc[5]

	mov	$f0, $fx
	sar	\$63, $fx		# |f0|'s sign as mask (or |g0|'s)
	xor	%rax, %rax
	sub	$fx, %rax		# |f0|'s sign as bit (or |g0|'s)

	xor	$fx, $f0		# conditionally negate |f0|
	add	%rax, $f0

	xor	$fx, @acc[0]		# conditionally negate |u| (or |v|)
	xor	$fx, @acc[1]
	xor	$fx, @acc[2]
	xor	$fx, @acc[3]
	xor	$fx, @acc[4]
	xor	$fx, @acc[5]
	add	%rax, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, @acc[5]

	mulx	@acc[0], @acc[0], $fx	# |u|*|f0| (or |v|*|g0|)
	mulx	@acc[1], @acc[1], %rax
	add	$fx, @acc[1]
___
for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) {
$code.=<<___;
	mulx	@acc[$i], @acc[$i], $a
	adc	$b, @acc[$i]
___
    ($a, $b) = ($b, $a);
}
$code.=<<___	if ($j==0);
	mulx	@acc[$i], @acc[$i], %rax
	mov	$g0, $f0
	adc	$fx, @acc[$i]

	mov	@acc[0], 8*0($out_ptr)	# offload |u|*|f0|
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	@acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
	mulx	@acc[$i], @acc[$i], %rax
	adc	$fx, @acc[$i]

	add	8*0($out_ptr), @acc[0]	# accumulate |u|*|f0|
	adc	8*1($out_ptr), @acc[1]
	adc	8*2($out_ptr), @acc[2]
	adc	8*3($out_ptr), @acc[3]
	adc	8*4($out_ptr), @acc[4]
	adc	8*5($out_ptr), @acc[5]

	mov	@acc[0], 8*0($out_ptr)
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	@acc[5], 8*5($out_ptr)

	ret
.size	__smulx_383x63,.-__smulx_383x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
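
# A branchy Python sketch of the abs(...)>>k contract (illustrative only;
# it matches the __smulx_383_n_shift_by_31 step in $python_ref above). The
# coefficients are chosen so that the low k bits of the sum cancel, and a
# negative result is replaced with its absolute value while the sign is
# pushed into the returned |f0| and |g0|:
$python_sketch.=<<'___';
def smul_n_shift(a, b, f0, g0, k=31):
    t = (a*f0 + b*g0) >> k      # the k shifted-out bits are all zero
    if t < 0:                   # return |t|, move the sign into f0, g0
        t, f0, g0 = -t, -f0, -g0
    return t, f0, g0
___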
{
$code.=<<___;
.type	__smulx_383_n_shift_by_31,\@abi-omnipotent
.align	32
__smulx_383_n_shift_by_31:
	mov	$f0, @acc[8]
	xor	@acc[6], @acc[6]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
$code.=<<___;
	mov	$k+8*0($in_ptr), @acc[0]	# load |a| (or |b|)
	mov	$k+8*1($in_ptr), @acc[1]
	mov	$k+8*2($in_ptr), @acc[2]
	mov	$k+8*3($in_ptr), @acc[3]
	mov	$k+8*4($in_ptr), @acc[4]
	mov	$k+8*5($in_ptr), @acc[5]

	mov	%rdx, %rax
	sar	\$63, %rax		# |f0|'s sign as mask (or |g0|'s)
	xor	$fx, $fx
	sub	%rax, $fx		# |f0|'s sign as bit (or |g0|'s)

	xor	%rax, %rdx		# conditionally negate |f0| (or |g0|)
	add	$fx, %rdx

	xor	%rax, @acc[0]		# conditionally negate |a| (or |b|)
	xor	%rax, @acc[1]
	xor	%rax, @acc[2]
	xor	%rax, @acc[3]
	xor	%rax, @acc[4]
	xor	@acc[5], %rax
	add	$fx, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, %rax

	mulx	@acc[0], @acc[0], $fx	# |a|*|f0| (or |b|*|g0|)
	mulx	@acc[1], @acc[1], @acc[5]
	add	$fx, @acc[1]
___
for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
$code.=<<___;
	mulx	@acc[$i], @acc[$i], $a
	adc	$b, @acc[$i]
___
    ($a, $b) = ($b, $a);
}
$code.=<<___	if ($j==0);
	adc	\$0, $fx
	imulq	%rdx
	add	$fx, %rax
	adc	%rdx, @acc[6]

	mov	$g0, %rdx

	mov	@acc[0], 8*0($out_ptr)
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	%rax, 8*5($out_ptr)
___
}
$code.=<<___;
	adc	\$0, $fx
	imulq	%rdx
	add	$fx, %rax
	adc	\$0, %rdx

	add	8*0($out_ptr), @acc[0]
	adc	8*1($out_ptr), @acc[1]
	adc	8*2($out_ptr), @acc[2]
	adc	8*3($out_ptr), @acc[3]
	adc	8*4($out_ptr), @acc[4]
	adc	8*5($out_ptr), %rax
	adc	%rdx, @acc[6]
	mov	$f0, %rdx

	shrd	\$31, @acc[1], @acc[0]
	shrd	\$31, @acc[2], @acc[1]
	shrd	\$31, @acc[3], @acc[2]
	shrd	\$31, @acc[4], @acc[3]
	shrd	\$31, %rax, @acc[4]
	shrd	\$31, @acc[6], %rax

	sar	\$63, @acc[6]		# sign as mask
	xor	$fx, $fx
	sub	@acc[6], $fx		# sign as bit

	xor	@acc[6], @acc[0]	# conditionally negate the result
	xor	@acc[6], @acc[1]
	xor	@acc[6], @acc[2]
	xor	@acc[6], @acc[3]
	xor	@acc[6], @acc[4]
	xor	@acc[6], %rax
	add	$fx, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, %rax

	mov	@acc[0], 8*0($out_ptr)
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	%rax, 8*5($out_ptr)

	xor	@acc[6], %rdx		# conditionally negate |f0|
	xor	@acc[6], $g0		# conditionally negate |g0|
	add	$fx, %rdx
	add	$fx, $g0

	ret
.size	__smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31
___
} {
$code.=<<___;
.type	__smulx_191_n_shift_by_31,\@abi-omnipotent
.align	32
__smulx_191_n_shift_by_31:
	mov	$f0, @acc[8]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
my @acc=@acc;
   @acc=@acc[3..5] if ($j);
$code.=<<___;
	mov	$k+8*0($in_ptr), @acc[0]	# load |a| (or |b|)
	mov	$k+8*1($in_ptr), @acc[1]
	mov	$k+8*2($in_ptr), @acc[2]

	mov	%rdx, %rax
	sar	\$63, %rax		# |f0|'s sign as mask (or |g0|'s)
	xor	$fx, $fx
	sub	%rax, $fx		# |f0|'s sign as bit (or |g0|'s)

	xor	%rax, %rdx		# conditionally negate |f0| (or |g0|)
	add	$fx, %rdx

	xor	%rax, @acc[0]		# conditionally negate |a| (or |b|)
	xor	%rax, @acc[1]
	xor	@acc[2], %rax
	add	$fx, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, %rax

	mulx	@acc[0], @acc[0], $fx	# |a|*|f0| (or |b|*|g0|)
	mulx	@acc[1], @acc[1], @acc[2]
	add	$fx, @acc[1]
	adc	\$0, @acc[2]
	imulq	%rdx
	add	%rax, @acc[2]
	adc	\$0, %rdx
___
$code.=<<___	if ($j==0);
	mov	%rdx, @acc[6]
	mov	$g0, %rdx
___
}
$code.=<<___;
	add	@acc[0], @acc[3]
	adc	@acc[1], @acc[4]
	adc	@acc[2], @acc[5]
	adc	%rdx, @acc[6]
	mov	$f0, %rdx

	shrd	\$31, @acc[4], @acc[3]
	shrd	\$31, @acc[5], @acc[4]
	shrd	\$31, @acc[6], @acc[5]

	sar	\$63, @acc[6]		# sign as mask
	xor	$fx, $fx
	sub	@acc[6], $fx		# sign as bit

	xor	@acc[6], @acc[3]	# conditionally negate the result
	xor	@acc[6], @acc[4]
	xor	@acc[6], @acc[5]
	add	$fx, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, @acc[5]

	mov	@acc[3], 8*0($out_ptr)
	mov	@acc[4], 8*1($out_ptr)
	mov	@acc[5], 8*2($out_ptr)

	xor	@acc[6], %rdx		# conditionally negate |f0|
	xor	@acc[6], $g0		# conditionally negate |g0|
	add	$fx, %rdx
	add	$fx, $g0

	ret
.size	__smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31
___
} }

{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);

$code.=<<___;
.type	__ab_approximation_31,\@abi-omnipotent
.align	32
__ab_approximation_31:
	mov	8*5($in_ptr), @a[2]	# load |a| in reverse order
	mov	8*11($in_ptr), @b[2]	# load |b| in reverse order
	mov	8*4($in_ptr), @a[1]
	mov	8*10($in_ptr), @b[1]
	mov	8*3($in_ptr), @a[0]
	mov	8*9($in_ptr), @b[0]

	mov	@a[2], $t0
	or	@b[2], $t0		# check top-most limbs, ...
	cmovz	@a[1], @a[2]
	cmovz	@b[1], @b[2]
	cmovz	@a[0], @a[1]
	mov	8*2($in_ptr), @a[0]
	cmovz	@b[0], @b[1]
	mov	8*8($in_ptr), @b[0]

	mov	@a[2], $t0
	or	@b[2], $t0		# ... ones before top-most, ...
	cmovz	@a[1], @a[2]
	cmovz	@b[1], @b[2]
	cmovz	@a[0], @a[1]
	mov	8*1($in_ptr), @a[0]
	cmovz	@b[0], @b[1]
	mov	8*7($in_ptr), @b[0]

	mov	@a[2], $t0
	or	@b[2], $t0		# ... and ones before that ...
	cmovz	@a[1], @a[2]
	cmovz	@b[1], @b[2]
	cmovz	@a[0], @a[1]
	mov	8*0($in_ptr), @a[0]
	cmovz	@b[0], @b[1]
	mov	8*6($in_ptr), @b[0]

	mov	@a[2], $t0
	or	@b[2], $t0		# ... and ones before that ...
	cmovz	@a[1], @a[2]
	cmovz	@b[1], @b[2]
	cmovz	@a[0], @a[1]
	cmovz	@b[0], @b[1]

	mov	@a[2], $t0
	or	@b[2], $t0
	bsr	$t0, %rcx
	lea	1(%rcx), %rcx
	cmovz	@a[0], @a[2]
	cmovz	@b[0], @b[2]
	cmovz	$t0, %rcx
	neg	%rcx
	#and	\$63, %rcx		# debugging artefact

	shldq	%cl, @a[1], @a[2]	# align second limb to the left
	shldq	%cl, @b[1], @b[2]

	mov	\$0x7FFFFFFF, %eax
	and	%rax, @a[0]
	and	%rax, @b[0]
	andn	@a[2], %rax, @a[2]
	andn	@b[2], %rax, @b[2]
	or	@a[2], @a[0]
	or	@b[2], @b[0]

	jmp	__inner_loop_31

	ret
.size	__ab_approximation_31,.-__ab_approximation_31
___
}
$code.=<<___;
.type	__inner_loop_31,\@abi-omnipotent
.align	32
__inner_loop_31:		################# by Thomas Pornin
	mov	\$0x7FFFFFFF80000000, $fg0	# |f0|=1, |g0|=0
	mov	\$0x800000007FFFFFFF, $fg1	# |f1|=0, |g1|=1
	mov	\$0x7FFFFFFF7FFFFFFF, $bias

.Loop_31:
	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
	mov	$a_, $t0
	mov	$b_, $t1
	mov	$fg0, $t2
	mov	$fg1, $t3
	cmovb	$b_, $a_
	cmovb	$t0, $b_
	cmovb	$fg1, $fg0
	cmovb	$t2, $fg1

	sub	$b_, $a_		# |a_|-|b_|
	sub	$fg1, $fg0		# |f0|-|f1|, |g0|-|g1|
	add	$bias, $fg0

	test	\$1, $t0		# if |a_| was even, roll back
	cmovz	$t0, $a_
	cmovz	$t1, $b_
	cmovz	$t2, $fg0
	cmovz	$t3, $fg1

	shr	\$1, $a_		# |a_|>>=1
	add	$fg1, $fg1		# |f1|<<=1, |g1|<<=1
	sub	$bias, $fg1
	sub	\$1, $cnt
	jnz	.Loop_31

	shr	\$32, $bias
	mov	%ecx, %edx		# $fg0, $f0
	mov	${fg1}d, ${f1}d
	shr	\$32, $g0
	shr	\$32, $g1
	sub	$bias, $f0		# remove the bias
	sub	$bias, $g0
	sub	$bias, $f1
	sub	$bias, $g1

	ret
.size	__inner_loop_31,.-__inner_loop_31

.type	__inner_loop_62,\@abi-omnipotent
.align	32
__inner_loop_62:
	mov	\$1, $f0	# |f0|=1
	xor	$g0, $g0	# |g0|=0
	xor	$f1, $f1	# |f1|=0
	mov	\$1, $g1	# |g1|=1

.Loop_62:
	xor	$t0, $t0
	test	\$1, $a_lo	# if |a_| is odd, then we'll be subtracting |b_|
	mov	$b_lo, $t1
	cmovnz	$b_lo, $t0
	sub	$a_lo, $t1	# |b_|-|a_|
	mov	$a_lo, $t2
	sub	$t0, $a_lo	# |a_|-|b_| (or |a_|-0 if |a_| was even)
	cmovc	$t1, $a_lo	# borrow means |a_|<|b_|, replace with |b_|-|a_|
	cmovc	$t2, $b_lo	# |b_| = |a_|
	mov	$f0, $t0	# exchange |f0| and |f1|
	cmovc	$f1, $f0
	cmovc	$t0, $f1
	mov	$g0, $t1	# exchange |g0| and |g1|
	cmovc	$g1, $g0
	cmovc	$t1, $g1
	xor	$t0, $t0
	xor	$t1, $t1
	shr	\$1, $a_lo
	test	\$1, $t2	# if |a_| was odd, then we'll be subtracting...
	cmovnz	$f1, $t0
	cmovnz	$g1, $t1
	add	$f1, $f1	# |f1|<<=1
	add	$g1, $g1	# |g1|<<=1
	sub	$t0, $f0	# |f0|-=|f1| (or |f0-=0| if |a_| was even)
	sub	$t1, $g0	# |g0|-=|g1| (or |g0-=0| ...)
	sub	\$1, $cnt
	jnz	.Loop_62

	ret
.size	__inner_loop_62,.-__inner_loop_62
___
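
# A Python sketch of the packed |f|/|g| representation that the constants
# in __inner_loop_31 above suggest (illustrative only; assumes the halves
# stay small enough that no carry crosses the 32-bit boundary). Each
# register holds both coefficients, biased to keep them non-negative, so
# a single 64-bit add/sub updates the pair at once:
$python_sketch.=<<'___';
B = 0x7FFFFFFF                          # per-half bias

def pack(f, g):
    return ((g + B) << 32) | (f + B)    # pack(1, 0) == 0x7FFFFFFF80000000

def unpack(fg):
    return (fg & 0xFFFFFFFF) - B, (fg >> 32) - B

bias = pack(0, 0)                       # 0x7FFFFFFF7FFFFFFF
assert pack(3, -2) - pack(1, 5) + bias == pack(3-1, -2-5)  # f0-f1, g0-g1
assert pack(3, -2)*2 - bias == pack(6, -4)                 # f1<<1, g1<<1
___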
}

print $code;
close STDOUT;
122
blst/asm/div3w-armv8.pl
Executable file
@@ -0,0 +1,122 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$code.=<<___;
.text

.globl	div_3_limbs
.type	div_3_limbs,%function
.align	5
div_3_limbs:
	ldp	x4,x5,[x0]	// load R
	eor	x0,x0,x0	// Q = 0
	mov	x3,#64		// loop counter
	nop

.Loop:
	subs	x6,x4,x1	// R - D
	add	x0,x0,x0	// Q <<= 1
	sbcs	x7,x5,x2
	add	x0,x0,#1	// Q + speculative bit
	csel	x4,x4,x6,lo	// select between R and R - D
	extr	x1,x2,x1,#1	// D >>= 1
	csel	x5,x5,x7,lo
	lsr	x2,x2,#1
	sbc	x0,x0,xzr	// subtract speculative bit
	sub	x3,x3,#1
	cbnz	x3,.Loop

	asr	x3,x0,#63	// top bit -> mask
	add	x0,x0,x0	// Q <<= 1
	subs	x6,x4,x1	// R - D
	add	x0,x0,#1	// Q + speculative bit
	sbcs	x7,x5,x2
	sbc	x0,x0,xzr	// subtract speculative bit

	orr	x0,x0,x3	// all ones if overflow

	ret
.size	div_3_limbs,.-div_3_limbs
___
{
my ($div_rem, $divisor, $quot) = map("x$_",(0..2));
my @div = map("x$_",(3..4));
my @acc = map("x$_",(5..7));
my @t = map("x$_",(8..11));

$code.=<<___;
.globl	quot_rem_128
.type	quot_rem_128,%function
.align	5
quot_rem_128:
	ldp	@div[0],@div[1],[$divisor]

	mul	@acc[0],@div[0],$quot	// divisor[0:1] * quotient
	umulh	@acc[1],@div[0],$quot
	mul	@t[3],  @div[1],$quot
	umulh	@acc[2],@div[1],$quot

	ldp	@t[0],@t[1],[$div_rem]	// load 3 limbs of the dividend
	ldr	@t[2],[$div_rem,#16]

	adds	@acc[1],@acc[1],@t[3]
	adc	@acc[2],@acc[2],xzr

	subs	@t[0],@t[0],@acc[0]	// dividend - divisor * quotient
	sbcs	@t[1],@t[1],@acc[1]
	sbcs	@t[2],@t[2],@acc[2]
	sbc	@acc[0],xzr,xzr		// borrow -> mask

	add	$quot,$quot,@acc[0]	// if borrowed, adjust the quotient ...
	and	@div[0],@div[0],@acc[0]
	and	@div[1],@div[1],@acc[0]
	adds	@t[0],@t[0],@div[0]	// ... and add divisor
	adc	@t[1],@t[1],@div[1]

	stp	@t[0],@t[1],[$div_rem]	// save 2 limbs of the remainder
	str	$quot,[$div_rem,#16]	// and one limb of the quotient

	mov	x0,$quot		// return adjusted quotient

	ret
.size	quot_rem_128,.-quot_rem_128

.globl	quot_rem_64
.type	quot_rem_64,%function
.align	5
quot_rem_64:
	ldr	@div[0],[$divisor]
	ldr	@t[0],[$div_rem]	// load 1 limb of the dividend

	mul	@acc[0],@div[0],$quot	// divisor * quotient

	sub	@t[0],@t[0],@acc[0]	// dividend - divisor * quotient

	stp	@t[0],$quot,[$div_rem]	// save remainder and quotient

	mov	x0,$quot		// return quotient

	ret
.size	quot_rem_64,.-quot_rem_64
___
}

print $code;
close STDOUT;
184
blst/asm/div3w-x86_64.pl
Executable file
@@ -0,0 +1,184 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

$c_ref=<<'___';
/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *         +-------+-------+-------+
 * R       |       |       |   0   |
 *         +-------+-------+-------+
 *                 +-------+-------+
 * D               |       |       |
 *                 +-------+-------+
 */
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
    llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
    llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
    limb_t Q = 0, mask;
    size_t i;

    for (i = 0; i < LIMB_BITS; i++) {
        Q <<= 1;
        mask = (R >= D);
        Q |= mask;
        R -= (D & ((llimb_t)0 - mask));
        D >>= 1;
    }

    mask = 0 - (Q >> (LIMB_BITS - 1));   /* does it overflow? */

    Q <<= 1;
    Q |= (R >= D);

    return (Q | mask);
}
___

$code.=<<___;
.text

.globl	div_3_limbs
.hidden	div_3_limbs
.type	div_3_limbs,\@function,3
.align	32
div_3_limbs:
	mov	(%rdi),%r8		# load R.lo
	mov	8(%rdi),%r9		# load R.hi
	xor	%rax,%rax		# Q = 0
	mov	\$64,%ecx		# loop counter

.Loop:
	mov	%r8,%r10		# put aside R
	sub	%rsi,%r8		# R -= D
	mov	%r9,%r11
	sbb	%rdx,%r9
	lea	1(%rax,%rax),%rax	# Q <<= 1 + speculative bit
	mov	%rdx,%rdi
	cmovc	%r10,%r8		# restore R if R - D borrowed
	cmovc	%r11,%r9
	sbb	\$0,%rax		# subtract speculative bit
	shl	\$63,%rdi
	shr	\$1,%rsi
	shr	\$1,%rdx
	or	%rdi,%rsi		# D >>= 1
	sub	\$1,%ecx
	jnz	.Loop

	lea	1(%rax,%rax),%rcx	# Q <<= 1 + speculative bit
	sar	\$63,%rax		# top bit -> mask

	sub	%rsi,%r8		# R -= D
	sbb	%rdx,%r9
	sbb	\$0,%rcx		# subtract speculative bit

	or	%rcx,%rax		# all ones if overflow

	ret
.size	div_3_limbs,.-div_3_limbs
___
########################################################################
# Calculate the remainder and adjust the quotient, which can be off by
# one. Then save the quotient in the limb next to the top limb of the
# remainder. There is room for it, because the remainder/next-iteration-
# dividend gets shorter by one limb.
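
# A branchy Python sketch of that adjustment (illustrative only; the
# assembly below does the same thing branchlessly with a borrow mask):
$python_sketch.=<<'___';
def quot_rem(dividend, divisor, q):
    r = dividend - divisor*q    # q may overshoot the true quotient by one
    if r < 0:                   # borrowed: roll q back and add the divisor
        q, r = q - 1, r + divisor
    return q, r
___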
{
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
my @acc = ("%r8", "%r9", "%rdx");
my @tmp = ("%r10", "%r11", "%rax");

$code.=<<___;
.globl	quot_rem_128
.hidden	quot_rem_128
.type	quot_rem_128,\@function,3
.align	32
quot_rem_128:
	mov	%rdx, %rax
	mov	%rdx, $quotient

	mulq	0($divisor)		# divisor[0:1] * quotient
	mov	%rax, @acc[0]
	mov	$quotient, %rax
	mov	%rdx, @acc[1]

	mulq	8($divisor)
	add	%rax, @acc[1]
	adc	\$0, %rdx		# %rdx is @acc[2]

	mov	0($div_rem), @tmp[0]	# load 3 limbs of the dividend
	mov	8($div_rem), @tmp[1]
	mov	16($div_rem), @tmp[2]

	sub	@acc[0], @tmp[0]	# dividend - divisor * quotient
	sbb	@acc[1], @tmp[1]
	sbb	@acc[2], @tmp[2]
	sbb	@acc[0], @acc[0]	# borrow -> mask

	add	@acc[0], $quotient	# if borrowed, adjust the quotient ...
	mov	@acc[0], @acc[1]
	and	0($divisor), @acc[0]
	and	8($divisor), @acc[1]
	add	@acc[0], @tmp[0]	# ... and add divisor
	adc	@acc[1], @tmp[1]

	mov	@tmp[0], 0($div_rem)	# save 2 limbs of the remainder ...
	mov	@tmp[1], 8($div_rem)
	mov	$quotient, 16($div_rem)	# ... and 1 limb of the quotient

	mov	$quotient, %rax		# return adjusted quotient

	ret
.size	quot_rem_128,.-quot_rem_128

########################################################################
# Unlike the 128-bit case above, the quotient is exact. As a result just
# one limb of the dividend is sufficient to calculate the remainder...

.globl	quot_rem_64
.hidden	quot_rem_64
.type	quot_rem_64,\@function,3
.align	32
quot_rem_64:
	mov	%rdx, %rax		# return quotient
	imulq	0($divisor), %rdx	# divisor[0] * quotient

	mov	0($div_rem), @tmp[0]	# load 1 limb of the dividend

	sub	%rdx, @tmp[0]		# dividend - divisor * quotient

	mov	@tmp[0], 0($div_rem)	# save 1 limb of the remainder ...
	mov	%rax, 8($div_rem)	# ... and 1 limb of the quotient

	ret
.size	quot_rem_64,.-quot_rem_64
___
}

print $code;
close STDOUT;
409
blst/asm/mul_mont_256-armv8.pl
Executable file
@@ -0,0 +1,409 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# As for "sparse" in subroutine names, see commentary in the
# asm/mulx_mont_256-x86_64.pl module.

$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);

@mod=map("x$_",(5..8));
$bi="x9";
@a=map("x$_",(10..13));
@tmp=map("x$_",(14..17));
@acc=map("x$_",(19..24));
$m0=$n_ptr;
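
# A Python sketch of the word-by-word Montgomery multiplication performed
# by mul_mont_sparse_256 below (illustrative only; assumes a, b < n and
# that $n0 holds -n^-1 mod 2^64, the usual Montgomery convention):
$python_sketch.=<<'___';
def mul_mont_256(a, b, n, n0, w=64, limbs=4):
    acc = 0
    for i in range(limbs):
        acc += a * ((b >> (w*i)) & (2**w - 1))  # acc += a * b[i]
        m = (acc * n0) % 2**w                   # Montgomery factor
        acc = (acc + m*n) >> w                  # low limb cancels exactly
    return acc if acc < n else acc - n          # a * b / 2^256 mod n
___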
$code.=<<___;
.text

.globl	mul_mont_sparse_256
.hidden	mul_mont_sparse_256
.type	mul_mont_sparse_256,%function
.align	5
mul_mont_sparse_256:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi,        [$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]

	mul	@acc[0],@a[0],$bi
	ldp	@mod[0],@mod[1],[$n_ptr]
	mul	@acc[1],@a[1],$bi
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi

	umulh	@tmp[0],@a[0],$bi
	umulh	@tmp[1],@a[1],$bi
	mul	$m0,$n0,@acc[0]
	umulh	@tmp[2],@a[2],$bi
	umulh	@tmp[3],@a[3],$bi
	adds	@acc[1],@acc[1],@tmp[0]
	//mul	@tmp[0],@mod[0],$m0
	adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$m0
	adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$m0
	adc	@acc[4],xzr,    @tmp[3]
	mul	@tmp[3],@mod[3],$m0
___
for ($i=1;$i<4;$i++) {
$code.=<<___;
	ldr	$bi,[$b_ptr,8*$i]
	subs	xzr,@acc[0],#1		//adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@mod[0],$m0
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@mod[1],$m0
	adcs	@acc[2],@acc[2],@tmp[2]
	umulh	@tmp[2],@mod[2],$m0
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@mod[3],$m0
	adc	@acc[4],@acc[4],xzr

	adds	@acc[0],@acc[1],@tmp[0]
	mul	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[2],@tmp[1]
	mul	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[3],@tmp[2]
	mul	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[4],@tmp[3]
	mul	@tmp[3],@a[3],$bi
	adc	@acc[4],xzr,xzr

	adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@tmp[2]
	mul	$m0,$n0,@acc[0]
	umulh	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@a[3],$bi
	adc	@acc[4],@acc[4],xzr

	adds	@acc[1],@acc[1],@tmp[0]
	//mul	@tmp[0],@mod[0],$m0
	adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$m0
	adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$m0
	adc	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$m0
___
}
$code.=<<___;
	subs	xzr,@acc[0],#1		//adds	@acc[0],@acc[0],@tmp[0]
	umulh	@tmp[0],@mod[0],$m0
	adcs	@acc[1],@acc[1],@tmp[1]
	umulh	@tmp[1],@mod[1],$m0
	adcs	@acc[2],@acc[2],@tmp[2]
	umulh	@tmp[2],@mod[2],$m0
	adcs	@acc[3],@acc[3],@tmp[3]
	umulh	@tmp[3],@mod[3],$m0
	adc	@acc[4],@acc[4],xzr

	adds	@acc[0],@acc[1],@tmp[0]
	adcs	@acc[1],@acc[2],@tmp[1]
	adcs	@acc[2],@acc[3],@tmp[2]
	adcs	@acc[3],@acc[4],@tmp[3]
	adc	@acc[4],xzr,xzr

	subs	@tmp[0],@acc[0],@mod[0]
	sbcs	@tmp[1],@acc[1],@mod[1]
	sbcs	@tmp[2],@acc[2],@mod[2]
	sbcs	@tmp[3],@acc[3],@mod[3]
	sbcs	xzr,    @acc[4],xzr

	csel	@acc[0],@acc[0],@tmp[0],lo
	csel	@acc[1],@acc[1],@tmp[1],lo
	csel	@acc[2],@acc[2],@tmp[2],lo
	csel	@acc[3],@acc[3],@tmp[3],lo

	stp	@acc[0],@acc[1],[$r_ptr]
	stp	@acc[2],@acc[3],[$r_ptr,#16]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	mul_mont_sparse_256,.-mul_mont_sparse_256
___
{
my @acc = (@a,@acc[0..3]);
my @a = @mod;

$code.=<<___;
.globl	sqr_mont_sparse_256
.hidden	sqr_mont_sparse_256
.type	sqr_mont_sparse_256,%function
.align	5
sqr_mont_sparse_256:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	mov	$n0,$n_ptr
||||
////////////////////////////////////////////////////////////////
//  |  |  |  |  |  |a1*a0|  |
//  |  |  |  |  |a2*a0|  |  |
//  |  |a3*a2|a3*a0|  |  |  |
//  |  |  |  |a2*a1|  |  |  |
//  |  |  |a3*a1|  |  |  |  |
// *|  |  |  |  |  |  |  | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
//  |--+--+--+--+--+--+--+--|
//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x]
//
// "can't overflow" below marks carrying into the high part of a
// multiplication result, which can't overflow, because it can
// never be all ones.

mul @acc[1],@a[1],@a[0] // a[1]*a[0]
umulh @tmp[1],@a[1],@a[0]
mul @acc[2],@a[2],@a[0] // a[2]*a[0]
umulh @tmp[2],@a[2],@a[0]
mul @acc[3],@a[3],@a[0] // a[3]*a[0]
umulh @acc[4],@a[3],@a[0]

adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication
mul @tmp[0],@a[2],@a[1] // a[2]*a[1]
umulh @tmp[1],@a[2],@a[1]
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@a[3],@a[1] // a[3]*a[1]
umulh @tmp[3],@a[3],@a[1]
adc @acc[4],@acc[4],xzr // can't overflow

mul @acc[5],@a[3],@a[2] // a[3]*a[2]
umulh @acc[6],@a[3],@a[2]

adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication
mul @acc[0],@a[0],@a[0] // a[0]*a[0]
adc @tmp[2],@tmp[3],xzr // can't overflow

adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication
umulh @a[0],@a[0],@a[0]
adcs @acc[4],@acc[4],@tmp[1]
mul @tmp[1],@a[1],@a[1] // a[1]*a[1]
adcs @acc[5],@acc[5],@tmp[2]
umulh @a[1],@a[1],@a[1]
adc @acc[6],@acc[6],xzr // can't overflow

adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2
mul @tmp[2],@a[2],@a[2] // a[2]*a[2]
adcs @acc[2],@acc[2],@acc[2]
umulh @a[2],@a[2],@a[2]
adcs @acc[3],@acc[3],@acc[3]
mul @tmp[3],@a[3],@a[3] // a[3]*a[3]
adcs @acc[4],@acc[4],@acc[4]
umulh @a[3],@a[3],@a[3]
adcs @acc[5],@acc[5],@acc[5]
adcs @acc[6],@acc[6],@acc[6]
adc @acc[7],xzr,xzr

adds @acc[1],@acc[1],@a[0] // +a[i]*a[i]
adcs @acc[2],@acc[2],@tmp[1]
adcs @acc[3],@acc[3],@a[1]
adcs @acc[4],@acc[4],@tmp[2]
adcs @acc[5],@acc[5],@a[2]
adcs @acc[6],@acc[6],@tmp[3]
adc @acc[7],@acc[7],@a[3]

bl __mul_by_1_mont_256
ldr x30,[x29,#8]

adds @acc[0],@acc[0],@acc[4] // accumulate upper half
adcs @acc[1],@acc[1],@acc[5]
adcs @acc[2],@acc[2],@acc[6]
adcs @acc[3],@acc[3],@acc[7]
adc @acc[4],xzr,xzr

subs @tmp[0],@acc[0],@mod[0]
sbcs @tmp[1],@acc[1],@mod[1]
sbcs @tmp[2],@acc[2],@mod[2]
sbcs @tmp[3],@acc[3],@mod[3]
sbcs xzr, @acc[4],xzr

csel @acc[0],@acc[0],@tmp[0],lo
csel @acc[1],@acc[1],@tmp[1],lo
csel @acc[2],@acc[2],@tmp[2],lo
csel @acc[3],@acc[3],@tmp[3],lo

stp @acc[0],@acc[1],[$r_ptr]
stp @acc[2],@acc[3],[$r_ptr,#16]

ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
___
}
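
The squaring above follows the diagram: compute the off-diagonal products a[i]*a[j] once, double the whole band, then fold in the diagonal squares. A hedged sanity check of that decomposition on toy limb values (none of this is the module's code):

# Hedged reference check (toy limbs assumed): cross products doubled
# plus diagonal squares reproduce n*n.
use strict;
use warnings;
use Math::BigInt;

my $B = Math::BigInt->new(2)->bpow(64);               # limb base
my @a = map { Math::BigInt->new($_) } (3, 5, 7, 11);  # toy "limbs"
my $n = Math::BigInt->new(0);
$n = $n * $B + $a[$_] for reverse 0 .. 3;             # n = sum a[i]*B^i

my $cross = Math::BigInt->new(0);
for my $i (0 .. 3) {
    for my $j (0 .. $i - 1) {
        $cross += $a[$i] * $a[$j] * $B ** ($i + $j);  # a[i]*a[j], i>j
    }
}
my $diag = Math::BigInt->new(0);
$diag += $a[$_] * $a[$_] * $B ** (2 * $_) for 0 .. 3; # a[i]*a[i]

print(($cross * 2 + $diag == $n * $n) ? "ok\n" : "BUG\n");  # prints ok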
{
my @a = (@a, $bi);

$code.=<<___;
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,%function
.align 5
from_mont_256:
paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0

mov $n0,$n_ptr
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]

bl __mul_by_1_mont_256
ldr x30,[x29,#8]

subs @tmp[0],@a[0],@mod[0]
sbcs @tmp[1],@a[1],@mod[1]
sbcs @tmp[2],@a[2],@mod[2]
sbcs @tmp[3],@a[3],@mod[3]

csel @a[0],@a[0],@tmp[0],lo
csel @a[1],@a[1],@tmp[1],lo
csel @a[2],@a[2],@tmp[2],lo
csel @a[3],@a[3],@tmp[3],lo

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]

ldr x29,[sp],#16
autiasp
ret
.size from_mont_256,.-from_mont_256

.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,%function
.align 5
redc_mont_256:
paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0

mov $n0,$n_ptr
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]

bl __mul_by_1_mont_256
ldr x30,[x29,#8]

ldp @tmp[0],@tmp[1],[$a_ptr,#32]
ldp @tmp[2],@tmp[3],[$a_ptr,#48]

adds @a[0],@a[0],@tmp[0]
adcs @a[1],@a[1],@tmp[1]
adcs @a[2],@a[2],@tmp[2]
adcs @a[3],@a[3],@tmp[3]
adc @a[4],xzr,xzr

subs @tmp[0],@a[0],@mod[0]
sbcs @tmp[1],@a[1],@mod[1]
sbcs @tmp[2],@a[2],@mod[2]
sbcs @tmp[3],@a[3],@mod[3]
sbcs xzr, @a[4],xzr

csel @a[0],@a[0],@tmp[0],lo
csel @a[1],@a[1],@tmp[1],lo
csel @a[2],@a[2],@tmp[2],lo
csel @a[3],@a[3],@tmp[3],lo

stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]

ldr x29,[sp],#16
autiasp
ret
.size redc_mont_256,.-redc_mont_256

.type __mul_by_1_mont_256,%function
.align 5
__mul_by_1_mont_256:
mul $m0,$n0,@a[0]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
___
for ($i=1;$i<4;$i++) {
$code.=<<___;
//mul @tmp[0],@mod[0],$m0
mul @tmp[1],@mod[1],$m0
mul @tmp[2],@mod[2],$m0
mul @tmp[3],@mod[3],$m0
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @a[1],@a[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @a[2],@a[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @a[3],@a[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @a[4],xzr,xzr

adds @a[0],@a[1],@tmp[0]
adcs @a[1],@a[2],@tmp[1]
adcs @a[2],@a[3],@tmp[2]
mul $m0,$n0,@a[0]
adc @a[3],@a[4],@tmp[3]
___
}
$code.=<<___;
//mul @tmp[0],@mod[0],$m0
mul @tmp[1],@mod[1],$m0
mul @tmp[2],@mod[2],$m0
mul @tmp[3],@mod[3],$m0
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @a[1],@a[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @a[2],@a[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @a[3],@a[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @a[4],xzr,xzr

adds @a[0],@a[1],@tmp[0]
adcs @a[1],@a[2],@tmp[1]
adcs @a[2],@a[3],@tmp[2]
adc @a[3],@a[4],@tmp[3]

ret
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256
___
}
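
redc_mont_256 above reduces the low half limb by limb via __mul_by_1_mont_256, folds in the upper half and performs one conditional subtraction; the net effect is the map t -> t*R^-1 mod n. A hedged full-width REDC model of that map (the code's per-limb factor $n0 is replaced here by a full-width -n^-1 mod R, and the modulus is an assumption for the example -- the BLS12-381 scalar-field order these 256-bit routines target):

# Hedged model, not the module's code: one-shot Montgomery reduction.
use strict;
use warnings;
use Math::BigInt;

my $R  = Math::BigInt->new(2)->bpow(256);
my $n  = Math::BigInt->new(
    "0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001");
my $n0 = (-$n->copy->bmodinv($R)) % $R;          # -n^-1 mod R
sub redc_model {
    my ($t) = @_;                                # 0 <= t < n*R
    my $m = ($t * $n0) % $R;
    my $r = ($t + $m * $n) / $R;                 # exact division
    return $r >= $n ? $r - $n : $r;              # final conditional subtract
}
my $x = Math::BigInt->new(12345);
print redc_model(($x * $R) % $n), "\n";          # recovers 12345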

print $code;

close STDOUT;
2015
blst/asm/mul_mont_384-armv8.pl
Executable file
File diff suppressed because it is too large
513
blst/asm/mulq_mont_256-x86_64.pl
Executable file
@ -0,0 +1,513 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# As for "sparse" in subroutine names, see commentary in the
# asm/mulx_mont_256-x86_64.pl module.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";

# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";

{ ############################################################## 256 bits
my @acc=map("%r$_",(9..15));

{ ############################################################## mulq
my ($hi, $a0) = ("%rbp", $r_ptr);

$code.=<<___;
.text

.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,\@function,5,"unwind"
.align 32
mul_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
push $r_ptr
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($b_org), %rax
mov 8*0($a_ptr), @acc[4]
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), @acc[3]
mov 8*3($a_ptr), $hi
mov $b_org, $b_ptr # evacuate from %rdx

mov %rax, @acc[6]
mulq @acc[4] # a[0]*b[0]
mov %rax, @acc[0]
mov @acc[6], %rax
mov %rdx, @acc[1]
call __mulq_mont_sparse_256

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size mul_mont_sparse_256,.-mul_mont_sparse_256

.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,\@function,4,"unwind"
.align 32
sqr_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
push $r_ptr
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov 8*0($a_ptr), %rax
mov $n_ptr, $n0
mov 8*1($a_ptr), @acc[5]
mov $b_org, $n_ptr
mov 8*2($a_ptr), @acc[3]
lea ($a_ptr), $b_ptr
mov 8*3($a_ptr), $hi

mov %rax, @acc[6]
mulq %rax # a[0]*a[0]
mov %rax, @acc[0]
mov @acc[6], %rax
mov %rdx, @acc[1]
call __mulq_mont_sparse_256

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulq_mont_sparse_256,\@abi-omnipotent
.align 32
__mulq_mont_sparse_256:
mulq @acc[5] # a[1]*b[0]
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, @acc[2]

mulq @acc[3] # a[2]*b[0]
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, @acc[3]

mulq $hi # a[3]*b[0]
add %rax, @acc[3]
mov 8($b_ptr), %rax
adc \$0, %rdx
xor @acc[5], @acc[5]
mov %rdx, @acc[4]

___
for (my $i=1; $i<4; $i++) {
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1];
$code.=<<___;
mov @acc[0], $a0
imulq $n0, @acc[0]

################################# Multiply by b[$i]
mov %rax, @acc[6]
mulq 8*0($a_ptr)
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, $hi

mulq 8*1($a_ptr)
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*2($a_ptr)
add %rax, @acc[3]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*3($a_ptr)
add %rax, @acc[4]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[4]
adc %rdx, @acc[5] # can't overflow
xor @acc[6], @acc[6]

################################# reduction
mulq 8*0($n_ptr)
add %rax, $a0 # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, $a0

mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add $a0, @acc[1]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*2($n_ptr)
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*3($n_ptr)
add %rax, @acc[3]
mov $b_next, %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
add %rdx, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
___
push(@acc,shift(@acc));
}
$code.=<<___;
imulq $n0, %rax
mov 8(%rsp), $a_ptr # restore $r_ptr

################################# last reduction
mov %rax, @acc[6]
mulq 8*0($n_ptr)
add %rax, @acc[0] # guaranteed to be zero
mov @acc[6], %rax
adc %rdx, @acc[0]

mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
add @acc[0], @acc[1]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*2($n_ptr)
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*3($n_ptr)
mov @acc[2], $b_ptr
add $hi, @acc[3]
adc \$0, %rdx
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add %rdx, @acc[4]
adc \$0, @acc[5]

#################################
# Branch-less conditional subtraction of modulus

mov @acc[3], @acc[0]
sub 8*0($n_ptr), @acc[1]
sbb 8*1($n_ptr), @acc[2]
sbb 8*2($n_ptr), @acc[3]
mov @acc[4], $hi
sbb 8*3($n_ptr), @acc[4]
sbb \$0, @acc[5]

cmovc %rax, @acc[1]
cmovc $b_ptr, @acc[2]
cmovc @acc[0], @acc[3]
mov @acc[1], 8*0($a_ptr)
cmovc $hi, @acc[4]
mov @acc[2], 8*1($a_ptr)
mov @acc[3], 8*2($a_ptr)
mov @acc[4], 8*3($a_ptr)

ret
.cfi_endproc
.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256
___
} }
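
The tail of __mulq_mont_sparse_256 subtracts the modulus unconditionally with sub/sbb and lets the final borrow drive cmovc to restore the saved copies. A hedged model of that select (the helper name is assumed for illustration):

# Hedged model of the sub/sbb + cmovc tail above: subtract the modulus
# unconditionally, then "cmovc" the saved copy back in if it borrowed.
use strict;
use warnings;
use Math::BigInt;

sub cond_sub_mod {                       # hypothetical helper
    my ($acc, $mod) = @_;                # 0 <= acc < 2*mod
    my $t = $acc - $mod;                 # the sub/sbb chain
    return $t->is_neg ? $acc : $t;       # borrow -> keep original (cmovc)
}

my $mod = Math::BigInt->new(97);
print cond_sub_mod(Math::BigInt->new(150), $mod), "\n"; # 53
print cond_sub_mod(Math::BigInt->new(42),  $mod), "\n"; # 42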
{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted"

$code.=<<___;
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,\@function,4,"unwind"
.align 32
from_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov $b_org, $n_ptr
call __mulq_by_1_mont_256

#################################
# Branch-less conditional acc[0:3] - modulus

#mov @acc[4], %rax # __mulq_by_1_mont_256 does it
mov @acc[5], @acc[1]
mov @acc[6], @acc[2]
mov @acc[0], @acc[3]

sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[6]
sbb 8*3($n_ptr), @acc[0]

cmovnc @acc[4], %rax
cmovnc @acc[5], @acc[1]
cmovnc @acc[6], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[0], @acc[3]
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size from_mont_256,.-from_mont_256

.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,\@function,4,"unwind"
.align 32
redc_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov $b_org, $n_ptr
call __mulq_by_1_mont_256

add 8*4($a_ptr), @acc[4] # accumulate upper half
adc 8*5($a_ptr), @acc[5]
mov @acc[4], %rax
adc 8*6($a_ptr), @acc[6]
mov @acc[5], @acc[1]
adc 8*7($a_ptr), @acc[0]
sbb $a_ptr, $a_ptr

#################################
# Branch-less conditional acc[0:4] - modulus

mov @acc[6], @acc[2]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[6]
mov @acc[0], @acc[3]
sbb 8*3($n_ptr), @acc[0]
sbb \$0, $a_ptr

cmovnc @acc[4], %rax
cmovnc @acc[5], @acc[1]
cmovnc @acc[6], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[0], @acc[3]
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size redc_mont_256,.-redc_mont_256
___
{
my @acc=@acc;

$code.=<<___;
.type __mulq_by_1_mont_256,\@abi-omnipotent
.align 32
__mulq_by_1_mont_256:
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

mov %rax, @acc[4]
imulq $n0, %rax
mov %rax, @acc[0]
___
for (my $i=0; $i<4; $i++) {
my $hi = @acc[4];
$code.=<<___;
################################# reduction $i
mulq 8*0($n_ptr)
add %rax, @acc[4] # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, @acc[4]

mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add @acc[4], @acc[1]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*2($n_ptr)
___
$code.=<<___ if ($i<3);
mov @acc[1], @acc[5]
imulq $n0, @acc[1]
___
$code.=<<___;
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*3($n_ptr)
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, @acc[4]
___
push(@acc,shift(@acc));
}
$code.=<<___;
ret
.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256
___
} } }

print $code;
close STDOUT;
2675
blst/asm/mulq_mont_384-x86_64.pl
Executable file
File diff suppressed because it is too large
486
blst/asm/mulx_mont_256-x86_64.pl
Executable file
@ -0,0 +1,486 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# "Sparse" in subroutine names refers to the most significant limb of
# the modulus. Though "sparse" is a bit of a misnomer, because the
# limitation is just not-all-ones, or in other words not larger than
# 2^256-2^192-1. In general the Montgomery multiplication algorithm
# can handle one of the inputs being non-reduced and capped by
# 1<<radix_width, 1<<256 in this case, rather than by the modulus.
# Whether or not mul_mont_sparse_256, a *tailored* implementation of
# the algorithm, can handle such input is circumstantial. For example,
# in the most general case it depends on similar "bit sparsity" of
# individual limbs of the second, fully reduced multiplicand. If you
# can't make such an assumption about the limbs, then the non-reduced
# value shouldn't be larger than the "same old" 2^256-2^192-1. This
# requirement can be met by conditionally subtracting a "bitwise
# left-aligned" modulus. For example, if the modulus is 200 bits wide,
# you would need to conditionally subtract the value of modulus<<56.
# A common source of non-reduced values is redc_mont_256 treating
# 512-bit inputs, more specifically ones with the upper half not
# smaller than the modulus. Just in case, why a limitation at all and
# not general-purpose 256-bit subroutines? Unlike the 384-bit case,
# accounting for the additional carry has a disproportionate impact on
# performance, especially in the adcx/adox implementation.
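
A hedged illustration of the bound just described: the cap 2^256-2^192-1 and the top-limb check it amounts to, using the BLS12-381 scalar-field order as an assumed example modulus:

# Hedged sketch (example modulus assumed, not hard-wired anywhere here):
use strict;
use warnings;
use Math::BigInt;

my $cap = Math::BigInt->new(2)->bpow(256)
        - Math::BigInt->new(2)->bpow(192) - 1;
my $mod = Math::BigInt->new(
    "0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001");

my $ones = Math::BigInt->new("0xffffffffffffffff");
my $top_limb = ($mod >> 192) & $ones;
print "top limb not all ones: ", ($top_limb < $ones ? "yes" : "no"), "\n";
print "mod <= cap: ", ($mod <= $cap ? "yes" : "no"), "\n";  # both yes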

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";

# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";

{ ############################################################## 255 bits
my @acc=map("%r$_",(10..15));

{ ############################################################## mulx
my ($lo,$hi)=("%rbp","%r9");

$code.=<<___;
.text

.globl mulx_mont_sparse_256
.hidden mulx_mont_sparse_256
.type mulx_mont_sparse_256,\@function,5,"unwind"
.align 32
mulx_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8,%rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov $b_org, $b_ptr # evacuate from %rdx
mov 8*0($b_org), %rdx
mov 8*0($a_ptr), @acc[4]
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), $lo
mov 8*3($a_ptr), $hi
lea -128($a_ptr), $a_ptr # control u-op density
lea -128($n_ptr), $n_ptr # control u-op density

mulx @acc[4], %rax, @acc[1] # a[0]*b[0]
call __mulx_mont_sparse_256

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size mulx_mont_sparse_256,.-mulx_mont_sparse_256

.globl sqrx_mont_sparse_256
.hidden sqrx_mont_sparse_256
.type sqrx_mont_sparse_256,\@function,4,"unwind"
.align 32
sqrx_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8,%rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov $a_ptr, $b_ptr
mov $n_ptr, $n0
mov $b_org, $n_ptr
mov 8*0($a_ptr), %rdx
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), $lo
mov 8*3($a_ptr), $hi
lea -128($b_ptr), $a_ptr # control u-op density
lea -128($n_ptr), $n_ptr # control u-op density

mulx %rdx, %rax, @acc[1] # a[0]*a[0]
call __mulx_mont_sparse_256

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulx_mont_sparse_256,\@abi-omnipotent
.align 32
__mulx_mont_sparse_256:
mulx @acc[5], @acc[5], @acc[2]
mulx $lo, $lo, @acc[3]
add @acc[5], @acc[1]
mulx $hi, $hi, @acc[4]
mov 8($b_ptr), %rdx
adc $lo, @acc[2]
adc $hi, @acc[3]
adc \$0, @acc[4]

___
for (my $i=1; $i<4; $i++) {
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : "%rax";
my $a5 = $i==1 ? @acc[5] : $lo;
$code.=<<___;
mov %rax, @acc[0]
imulq $n0, %rax

################################# Multiply by b[$i]
xor $a5, $a5 # [@acc[5]=0,] cf=0, of=0
mulx 8*0+128($a_ptr), $lo, $hi
adox $lo, @acc[1]
adcx $hi, @acc[2]

mulx 8*1+128($a_ptr), $lo, $hi
adox $lo, @acc[2]
adcx $hi, @acc[3]

mulx 8*2+128($a_ptr), $lo, $hi
adox $lo, @acc[3]
adcx $hi, @acc[4]

mulx 8*3+128($a_ptr), $lo, $hi
mov %rax, %rdx
adox $lo, @acc[4]
adcx @acc[5], $hi # cf=0
adox $hi, @acc[5] # of=0

################################# reduction
mulx 8*0+128($n_ptr), $lo, %rax
adcx $lo, @acc[0] # guaranteed to be zero
adox @acc[1], %rax

mulx 8*1+128($n_ptr), $lo, $hi
adcx $lo, %rax # @acc[1]
adox $hi, @acc[2]

mulx 8*2+128($n_ptr), $lo, $hi
adcx $lo, @acc[2]
adox $hi, @acc[3]

mulx 8*3+128($n_ptr), $lo, $hi
mov $b_next, %rdx
adcx $lo, @acc[3]
adox $hi, @acc[4]
adcx @acc[0], @acc[4]
adox @acc[0], @acc[5]
adcx @acc[0], @acc[5]
adox @acc[0], @acc[0] # acc[5] in next iteration
adc \$0, @acc[0] # cf=0, of=0
___
push(@acc,shift(@acc));
}
$code.=<<___;
imulq $n0, %rdx

################################# last reduction
xor $lo, $lo # cf=0, of=0
mulx 8*0+128($n_ptr), @acc[0], $hi
adcx %rax, @acc[0] # guaranteed to be zero
adox $hi, @acc[1]

mulx 8*1+128($n_ptr), $lo, $hi
adcx $lo, @acc[1]
adox $hi, @acc[2]

mulx 8*2+128($n_ptr), $lo, $hi
adcx $lo, @acc[2]
adox $hi, @acc[3]

mulx 8*3+128($n_ptr), $lo, $hi
mov @acc[1], %rdx
lea 128($n_ptr), $n_ptr
adcx $lo, @acc[3]
adox $hi, @acc[4]
mov @acc[2], %rax
adcx @acc[0], @acc[4]
adox @acc[0], @acc[5]
adc \$0, @acc[5]

#################################
# Branch-less conditional acc[1:5] - modulus

mov @acc[3], $lo
sub 8*0($n_ptr), @acc[1]
sbb 8*1($n_ptr), @acc[2]
sbb 8*2($n_ptr), @acc[3]
mov @acc[4], $hi
sbb 8*3($n_ptr), @acc[4]
sbb \$0, @acc[5]

cmovc %rdx, @acc[1]
cmovc %rax, @acc[2]
cmovc $lo, @acc[3]
mov @acc[1], 8*0($r_ptr)
cmovc $hi, @acc[4]
mov @acc[2], 8*1($r_ptr)
mov @acc[3], 8*2($r_ptr)
mov @acc[4], 8*3($r_ptr)

ret
.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256
___
} }
{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted"

$code.=<<___;
.globl fromx_mont_256
.hidden fromx_mont_256
.type fromx_mont_256,\@function,4,"unwind"
.align 32
fromx_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov $b_org, $n_ptr
call __mulx_by_1_mont_256

#################################
# Branch-less conditional acc[0:3] - modulus

#mov @acc[4], %rax # __mulx_by_1_mont_256 does it
mov @acc[5], %rdx
mov @acc[0], @acc[2]
mov @acc[1], @acc[3]

sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[0]
sbb 8*3($n_ptr), @acc[1]

cmovnc @acc[4], %rax
cmovnc @acc[5], %rdx
cmovnc @acc[0], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[1], @acc[3]
mov %rdx, 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size fromx_mont_256,.-fromx_mont_256

.globl redcx_mont_256
.hidden redcx_mont_256
.type redcx_mont_256,\@function,4,"unwind"
.align 32
redcx_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue

mov $b_org, $n_ptr
call __mulx_by_1_mont_256

add 8*4($a_ptr), @acc[4] # accumulate upper half
adc 8*5($a_ptr), @acc[5]
mov @acc[4], %rax
adc 8*6($a_ptr), @acc[0]
mov @acc[5], %rdx
adc 8*7($a_ptr), @acc[1]
sbb $a_ptr, $a_ptr

#################################
# Branch-less conditional acc[0:4] - modulus

mov @acc[0], @acc[2]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[0]
mov @acc[1], @acc[3]
sbb 8*3($n_ptr), @acc[1]
sbb \$0, $a_ptr

cmovnc @acc[4], %rax
cmovnc @acc[5], %rdx
cmovnc @acc[0], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[1], @acc[3]
mov %rdx, 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)

mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size redcx_mont_256,.-redcx_mont_256
___
{
my @acc=@acc;

$code.=<<___;
.type __mulx_by_1_mont_256,\@abi-omnipotent
.align 32
__mulx_by_1_mont_256:
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]

mov %rax, @acc[4]
imulq $n0, %rax
mov %rax, @acc[0]
___
for (my $i=0; $i<4; $i++) {
my $hi = @acc[4];
$code.=<<___;
################################# reduction $i
mulq 8*0($n_ptr)
add %rax, @acc[4] # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, @acc[4]

mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add @acc[4], @acc[1]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*2($n_ptr)
___
$code.=<<___ if ($i<3);
mov @acc[1], @acc[5]
imulq $n0, @acc[1]
___
$code.=<<___;
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi

mulq 8*3($n_ptr)
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, @acc[4]
___
push(@acc,shift(@acc));
}
$code.=<<___;
ret
.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256
___
} } }

print $code;
close STDOUT;
2384
blst/asm/mulx_mont_384-x86_64.pl
Executable file
File diff suppressed because it is too large
541
blst/asm/sha256-armv8.pl
Executable file
@ -0,0 +1,541 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for ARMv8.
#
# This module is stripped of scalar code paths, with the rationale that
# all known processors are NEON-capable.
#
# See original module at CRYPTOGAMS for further details.

$flavour = shift;
$output = shift;

if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}

$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
$pre="blst_";

($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

$code.=<<___;
.text

.align 6
.type .LK$BITS,%object
.LK$BITS:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm"
.align 2
___

if ($SZ==4) {
my $Ktbl="x3";

my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");

$code.=<<___;
.globl ${pre}sha256_block_armv8
.type ${pre}sha256_block_armv8,%function
.align 6
${pre}sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,.LK256

.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE

cbnz $num,.Loop_hw

st1.32 {$ABCD,$EFGH},[$ctx]

ldr x29,[sp],#16
ret
.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8
___
}

if ($SZ==4) { ######################################### NEON stuff #
# You'll surely note a lot of similarities with sha256-armv4 module,
# and of course it's not a coincidence. sha256-armv4 was used as
# initial template, but was adapted for ARMv8 instruction set and
# extensively re-tuned for all-round performance.

my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
my $Ktbl="x16";
my $Xfer="x17";
my @X = map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
my $j=0;

sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
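
A hedged trace of the AUTOLOAD thunk above combined with the relevant subset of the post-processing loop at the bottom of this file: &ushr_32($T2,$T0,7) with $T2="q6" and $T0="q4" first appends the line below to $code, which the final foreach then rewrites into new-style syntax (the values are assumptions for the example):

# Hedged sketch, reproducing just the two substitutions that matter here:
use strict;
use warnings;

my $line = "\tushr.32\tq6,q4,#7";       # what AUTOLOAD emits
$line =~ s/\bq([0-9]+)\b/v$1.16b/g;     # old->new registers
$line =~ s/\.\w?32\b// and $line =~ s/\.16b/\.4s/g;
print "$line\n";                        # "ushr	v6.4s,v4.4s,#7"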

sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

&ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
&ushr_32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
eval(shift(@insns));
&sli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T4,$T7,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T4,$T7,32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T5,$T7,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T7,$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_u32 ($T3,$T7,32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T6,@X[0],$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T7,@X[0],$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T6,@X[0],32-$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T5,@X[0],$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T7,$T7,$T6);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T5,@X[0],32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl], #16");
eval(shift(@insns));
&eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T5);
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dhi($T5), &Dlo($T7));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
while($#insns>=1) { eval(shift(@insns)); }
&st1_32 ("{$T0}","[$Xfer], #16");
eval(shift(@insns));

push(@X,shift(@X)); # "rotate" X[]
}
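
Xupdate above has no vector rotate instruction to lean on, so each ror in sigma0/sigma1 is assembled from ushr (the shifted-down part) plus sli, which shifts left and inserts, supplying the wrapped-around high bits. A hedged sketch of the identity for sigma0's rotate amounts (plain integers standing in for lanes):

# Hedged sketch of the ushr+sli rotate idiom:
use strict;
use warnings;

sub ror32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
sub ushr_sli {
    my ($x, $n) = @_;
    my $t = $x >> $n;                        # ushr
    $t |= ($x << (32 - $n)) & 0xffffffff;    # sli inserts the high bits
    return $t;
}
my $x = 0xdeadbeef;
for my $n (7, 18) {                          # sigma0's rotate amounts
    die "mismatch" unless ror32($x, $n) == ushr_sli($x, $n);
}
print "ushr+sli == ror\n";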

sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

eval(shift(@insns));
eval(shift(@insns));
&ld1_8 ("{@X[0]}","[$inp],#16");
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl],#16");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&rev32 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&st1_32 ("{$T0}","[$Xfer], #16");

push(@X,shift(@X)); # "rotate" X[]
}

sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
'&and ($t1,$f,$e)',
'&bic ($t4,$g,$e)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&orr ($t1,$t1,$t4)', # Ch(e,f,g)
'&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ror ($t0,$t0,"#$Sigma1[0]")',
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t0)', # h+=Sigma1(e)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&ror ($t4,$t4,"#$Sigma0[0]")',
'&add ($d,$d,$h)', # d+=h
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}

$code.=<<___;
.globl ${pre}sha256_block_data_order
.type ${pre}sha256_block_data_order,%function
.align 4
${pre}sha256_block_data_order:
stp x29, x30, [sp, #-16]!
mov x29, sp
sub sp,sp,#16*4

adr $Ktbl,.LK256
add $num,$inp,$num,lsl#6 // len to point at the end of inp

ld1.8 {@X[0]},[$inp], #16
ld1.8 {@X[1]},[$inp], #16
ld1.8 {@X[2]},[$inp], #16
ld1.8 {@X[3]},[$inp], #16
ld1.32 {$T0},[$Ktbl], #16
ld1.32 {$T1},[$Ktbl], #16
ld1.32 {$T2},[$Ktbl], #16
ld1.32 {$T3},[$Ktbl], #16
rev32 @X[0],@X[0] // yes, even on
rev32 @X[1],@X[1] // big-endian
rev32 @X[2],@X[2]
rev32 @X[3],@X[3]
mov $Xfer,sp
add.32 $T0,$T0,@X[0]
add.32 $T1,$T1,@X[1]
add.32 $T2,$T2,@X[2]
st1.32 {$T0-$T1},[$Xfer], #32
add.32 $T3,$T3,@X[3]
st1.32 {$T2-$T3},[$Xfer]
sub $Xfer,$Xfer,#32

ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldp $E,$F,[$ctx,#16]
ldp $G,$H,[$ctx,#24]
ldr $t1,[sp,#0]
mov $t2,wzr
eor $t3,$B,$C
mov $t4,wzr
b .L_00_48

.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
cmp $t1,#0 // check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48

sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
cmp $inp,$num
mov $Xfer, #64
csel $Xfer, $Xfer, xzr, eq
sub $inp,$inp,$Xfer // avoid SEGV
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
add $A,$A,$t4 // h+=Sigma0(a) from the past
ldp $t0,$t1,[$ctx,#0]
add $A,$A,$t2 // h+=Maj(a,b,c) from the past
ldp $t2,$t3,[$ctx,#8]
add $A,$A,$t0 // accumulate
add $B,$B,$t1
ldp $t0,$t1,[$ctx,#16]
add $C,$C,$t2
add $D,$D,$t3
ldp $t2,$t3,[$ctx,#24]
add $E,$E,$t0
add $F,$F,$t1
ldr $t1,[sp,#0]
stp $A,$B,[$ctx,#0]
add $G,$G,$t2
mov $t2,wzr
stp $C,$D,[$ctx,#8]
add $H,$H,$t3
stp $E,$F,[$ctx,#16]
eor $t3,$B,$C
stp $G,$H,[$ctx,#24]
mov $t4,wzr
mov $Xfer,sp
b.ne .L_00_48

ldr x29,[x29]
add sp,sp,#16*4+16
ret
.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order
___
}

{
my ($out,$inp,$len) = map("x$_",(0..2));

$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,%function
.align 4
${pre}sha256_emit:
ldp x4,x5,[$inp]
ldp x6,x7,[$inp,#16]
#ifndef __AARCH64EB__
rev x4,x4
rev x5,x5
rev x6,x6
rev x7,x7
#endif
str w4,[$out,#4]
lsr x4,x4,#32
str w5,[$out,#12]
lsr x5,x5,#32
str w6,[$out,#20]
lsr x6,x6,#32
str w7,[$out,#28]
lsr x7,x7,#32
str w4,[$out,#0]
str w5,[$out,#8]
str w6,[$out,#16]
str w7,[$out,#24]
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit

.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,%function
.align 4
${pre}sha256_bcopy:
.Loop_bcopy:
ldrb w3,[$inp],#1
sub $len,$len,#1
strb w3,[$out],#1
cbnz $len,.Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy

.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,%function
.align 4
${pre}sha256_hcopy:
ldp x4,x5,[$inp]
ldp x6,x7,[$inp,#16]
stp x4,x5,[$out]
stp x6,x7,[$out,#16]
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}

{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );

sub unsha256 {
my ($mnemonic,$arg)=@_;

$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
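
A hedged worked example of the encoder above: for "sha256h v0,v1,v2" the regex captures rd=0, rn=1, rm=2, so unsha256() would emit the word 0x5e004000|0|(1<<5)|(2<<16) = 0x5e024020:

# Hedged check of one encoding (operands assumed for the example):
use strict;
use warnings;
printf ".inst\t0x%08x\t//%s %s\n",
       0x5e004000 | 0 | (1 << 5) | (2 << 16),
       "sha256h", "v0,v1,v2";   # .inst 0x5e024020 //sha256h v0,v1,v2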

open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;

foreach(split("\n",$code)) {

s/\`([^\`]*)\`/eval($1)/ge;

s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers

s/\.[ui]?8(\s)/$1/;
s/\.\w?64\b// and s/\.16b/\.2d/g or
s/\.\w?32\b// and s/\.16b/\.4s/g;
m/\bext\b/ and s/\.2d/\.16b/g or
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;

print $_,"\n";
}

close STDOUT;
337
blst/asm/sha256-portable-x86_64.pl
Executable file
@ -0,0 +1,337 @@
#!/usr/bin/env perl
|
||||
#
|
||||
# Copyright Supranational LLC
|
||||
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
|
||||
# project.
|
||||
# ====================================================================
|
||||
#
|
||||
# sha256_block procedure for x86_64.
|
||||
#
|
||||
# Scalar-only version with minor twist minimizing 'lea' instructions.
|
||||
|
||||
$flavour = shift;
|
||||
$output = pop;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
|
||||
or die "can't call $xlate: $!";
|
||||
|
||||
$pre="blst_";
|
||||
$func="${pre}sha256_block_data_order";
|
||||
$TABLE="K256";
|
||||
$SZ=4;
|
||||
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
|
||||
"%r8d","%r9d","%r10d","%r11d");
|
||||
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
|
||||
@Sigma0=( 2,13,22);
|
||||
@Sigma1=( 6,11,25);
|
||||
@sigma0=( 7,18, 3);
|
||||
@sigma1=(17,19,10);
|
||||
$rounds=64;
|
||||
|
||||
$ctx="%rdi"; # 1st arg, zapped by $a3
|
||||
$inp="%rsi"; # 2nd arg
|
||||
$Tbl="%rbp";
|
||||
|
||||
$_ctx="16*$SZ+0*8(%rsp)";
|
||||
$_inp="16*$SZ+1*8(%rsp)";
|
||||
$_end="16*$SZ+2*8(%rsp)";
|
||||
$framesz="16*$SZ+3*8";
|
||||
|
||||
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
    # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	`$SZ*$i`($Tbl),$T1	# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1
___
$code.=<<___ if ($i==31);
	lea	`16*$SZ`($Tbl),$Tbl	# round+=16
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}
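# For reference (not emitted), one round above computes the standard
# SHA-256 update
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
# with Maj(a,b,c) folded as Ch(a^b,c,b): $a3 carries b^c over from the
# previous round, so only one fresh xor is needed per round, and
# Sigma0(a) is added into h modulo-scheduled in the *next* round.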

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[($i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[($i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}
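# The schedule above walks a 16-entry circular buffer on the stack and
# computes X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16];
# taken modulo 16, the indices i-15, i-2, i-7 and i-16 are exactly the
# (i+1)&0xf, (i+14)&0xf, (i+9)&0xf and i&0xf stack offsets used above.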

$code=<<___;
.text

.globl	$func
.type	$func,\@function,3,"unwind"
.align	16
$func:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
.cfi_adjust_cfa_offset	$framesz
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
.cfi_end_prologue

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));
}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
for(;$i<32;$i++) {
	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));
}

$code.=<<___;
	cmpb	\$0x19,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	lea	$framesz+6*8(%rsp),%r11
.cfi_def_cfa	%r11,8
	mov	$framesz(%rsp),%r15
.cfi_restore	%r15
	mov	-40(%r11),%r14
.cfi_restore	%r14
	mov	-32(%r11),%r13
.cfi_restore	%r13
	mov	-24(%r11),%r12
.cfi_restore	%r12
	mov	-16(%r11),%rbp
.cfi_restore	%rbp
	mov	-8(%r11),%rbx
.cfi_restore	%rbx
.cfi_epilogue
	lea	(%r11),%rsp
	ret
.cfi_endproc
.size	$func,.-$func

.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") :	# Win64 order
                               ("%rdi","%rsi","%rdx");	# Unix order
$code.=<<___;
.globl	${pre}sha256_emit
.hidden	${pre}sha256_emit
.type	${pre}sha256_emit,\@abi-omnipotent
.align	16
${pre}sha256_emit:
	mov	0($inp), %r8
	mov	8($inp), %r9
	mov	16($inp), %r10
	bswap	%r8
	mov	24($inp), %r11
	bswap	%r9
	mov	%r8d, 4($out)
	bswap	%r10
	mov	%r9d, 12($out)
	bswap	%r11
	mov	%r10d, 20($out)
	shr	\$32, %r8
	mov	%r11d, 28($out)
	shr	\$32, %r9
	mov	%r8d, 0($out)
	shr	\$32, %r10
	mov	%r9d, 8($out)
	shr	\$32, %r11
	mov	%r10d, 16($out)
	mov	%r11d, 24($out)
	ret
.size	${pre}sha256_emit,.-${pre}sha256_emit
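# (A 64-bit load picks up two adjacent 32-bit state words, and the 64-bit
# bswap leaves them in swapped order, which is why the low half of each
# register is stored at the *higher* offset first, e.g. %r8d at 4($out),
# then the shifted-down high half at 0($out): the net result is the
# standard big-endian SHA-256 digest byte order.)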

.globl	${pre}sha256_bcopy
.hidden	${pre}sha256_bcopy
.type	${pre}sha256_bcopy,\@abi-omnipotent
.align	16
${pre}sha256_bcopy:
	sub	$inp, $out
.Loop_bcopy:
	movzb	($inp), %eax
	lea	1($inp), $inp
	mov	%al, -1($out,$inp)
	dec	$len
	jnz	.Loop_bcopy
	ret
.size	${pre}sha256_bcopy,.-${pre}sha256_bcopy
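# (bcopy pre-subtracts the source from the destination so a single
# advancing pointer addresses both buffers: after "lea 1($inp),$inp" the
# store address -1($out,$inp) resolves to original_out + bytes_copied.)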

.globl	${pre}sha256_hcopy
.hidden	${pre}sha256_hcopy
.type	${pre}sha256_hcopy,\@abi-omnipotent
.align	16
${pre}sha256_hcopy:
	mov	0($inp), %r8
	mov	8($inp), %r9
	mov	16($inp), %r10
	mov	24($inp), %r11
	mov	%r8, 0($out)
	mov	%r9, 8($out)
	mov	%r10, 16($out)
	mov	%r11, 24($out)
	ret
.size	${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;
	print $_,"\n";
}
close STDOUT;
789
blst/asm/sha256-x86_64.pl
Executable file
@@ -0,0 +1,789 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for x86_64.
#
# This module is stripped of AVX and even scalar code paths, with
# the rationale that
#
# a) AVX1 is [justifiably] faster than the SSSE3 code path only on *one*
#    processor, the venerable Sandy Bridge;
# b) AVX2 incurs costly power transitions, which would be justifiable
#    if AVX2 code was executing most of the time, which is not the
#    case in this context;
# c) all contemporary processors support SSSE3, so that nobody would
#    actually use the scalar code path anyway;
#
# See original module at CRYPTOGAMS for further details.

$flavour = shift;
$output  = pop;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

$pre="blst_";
$func="${pre}sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$framesz="16*$SZ+3*8";

$code=<<___;
.text

.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___

######################################################################
# SIMD code paths
#
{{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.globl	${pre}sha256_block_data_order_shaext
.hidden	${pre}sha256_block_data_order_shaext
.type	${pre}sha256_block_data_order_shaext,\@function,3,"unwind"
.align	64
${pre}sha256_block_data_order_shaext:
.cfi_startproc
___
$code.=<<___ if ($win64);
	sub	\$0x58,%rsp
.cfi_adjust_cfa_offset	0x58
	movaps	%xmm6,-0x58(%r11)
.cfi_offset	%xmm6,-0x60
	movaps	%xmm7,-0x48(%r11)
.cfi_offset	%xmm7,-0x50
	movaps	%xmm8,-0x38(%r11)
.cfi_offset	%xmm8,-0x40
	movaps	%xmm9,-0x28(%r11)
.cfi_offset	%xmm9,-0x30
	movaps	%xmm10,-0x18(%r11)
.cfi_offset	%xmm10,-0x20
.cfi_end_prologue
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x100-0x80($Tbl),$TMP	# byte swap mask

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*16-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	nop
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*16-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	lea	0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*16-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	nop
	paddd	$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*16-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	nop
	paddd	$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$i*16-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	nop
	paddd	$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa	13*16-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd	$TMP,@MSG[2]

	movdqa	14*16-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa	$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*16-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	dec	$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-0x58(%r11),%xmm6
	movaps	-0x48(%r11),%xmm7
	movaps	-0x38(%r11),%xmm8
	movaps	-0x28(%r11),%xmm9
	movaps	-0x18(%r11),%xmm10
	mov	%r11,%rsp
.cfi_def_cfa	%r11,8
.cfi_epilogue
___
$code.=<<___;
	ret
.cfi_endproc
.size	${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext
___
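# Note on the shuffles above: sha256rnds2 keeps the eight state words
# packed as ABEF and CDGH pairs rather than in the linear A..H layout the
# context uses, hence the pshufd/palignr/punpcklqdq dance on entry and
# its mirror image before the state is stored back.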
}}}
{{{

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
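# AUTOLOAD catches any otherwise-undefined &mnemonic(...) call and appends
# it to $code as an assembly line, last argument first; for instance
# (illustrative) &ror("%r13d",6) appends the line "ror $6,%r13d".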

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a0,$e)',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}

######################################################################
# SSSE3 code path
#
{
my $Tbl = $inp;
my $_ctx="0(%rbp)";
my $_inp="8(%rbp)";
my $_end="16(%rbp)";
my $framesz=4*8+$win64*16*4+8;

my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
.globl	${func}
.hidden	${func}
.type	${func},\@function,3,"unwind"
.align	64
${func}:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
.cfi_adjust_cfa_offset	$framesz
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	mov	$ctx,0(%rsp)		# save ctx, 1st arg
	#mov	$inp,8(%rsp)		# save inp, 2nd arg
	mov	%rdx,16(%rsp)		# save end pointer, "3rd" arg
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0x20(%rsp)
.cfi_offset	%xmm6,-0x78
	movaps	%xmm7,0x30(%rsp)
.cfi_offset	%xmm7,-0x68
	movaps	%xmm8,0x40(%rsp)
.cfi_offset	%xmm8,-0x58
	movaps	%xmm9,0x50(%rsp)
.cfi_offset	%xmm9,-0x48
___
$code.=<<___;
	mov	%rsp,%rbp
.cfi_def_cfa_register	%rbp
.cfi_end_prologue

	lea	-16*$SZ(%rsp),%rsp
	mov	$SZ*0($ctx),$A
	and	\$-64,%rsp		# align stack
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___

$code.=<<___;
	#movdqa	$TABLE+`$SZ*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*$rounds`+64(%rip),$t5
	jmp	.Lloop_ssse3
.align	16
.Lloop_ssse3:
	movdqa	$TABLE+`$SZ*$rounds`(%rip),$t3
	mov	$inp,$_inp		# offload $inp
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	pshufb	$t3,@X[0]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	pshufb	$t3,@X[1]
	movdqa	0x00($Tbl),$t0
	movdqa	0x10($Tbl),$t1
	pshufb	$t3,@X[2]
	paddd	@X[0],$t0
	movdqa	0x20($Tbl),$t2
	pshufb	$t3,@X[3]
	movdqa	0x30($Tbl),$t3
	paddd	@X[1],$t1
	paddd	@X[2],$t2
	paddd	@X[3],$t3
	movdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	movdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	movdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	movdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lssse3_00_47

.align	16
.Lssse3_00_47:
	sub	\$`-16*$SZ`,$Tbl	# size optimization
___
sub Xupdate_256_SSSE3 () {
	(
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t1,$t0)',
	'&movdqa	($t2,$t0);',
	'&psrld		($t0,$sigma0[2])',
	'&paddd		(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld		($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
	'&pxor		($t0,$t2)',
	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t1)',
	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t2);',
	'&movdqa	($t2,$t3)',
	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
	'&psrld		($t3,$sigma1[2])',
	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq		($t2,$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pxor		($t3,$t2)',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld		($t3,$sigma1[2])',
	'&psrlq		($t2,$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&movdqa	($t2,16*$j."($Tbl)")',
	'&pshufb	($t3,$t5)',
	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
	);
}
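# Net effect of the sequence above, four schedule words per pass:
#	X[0..3] += X[9..12] + sigma0(X[1..4]) + sigma1(X[14..17]),
# with sigma1 applied in two 2-word halves (first X[14..15], then the
# freshly computed X[16..17]), since the 64-bit psrlq shifts used to
# synthesize the rotates only cover two schedule words at a time.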

sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    if (0) {
	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@
	&movdqa		($t0,@X[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t3,@X[3]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t1,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,$t0);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t2,$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	&pxor		($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &movdqa	($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
	 &pshufd	($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &movdqa	($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 #&pshufb	($t3,$t5);
	 &pshufd	($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,16*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &pslldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
    }
	&paddd		($t2,@X[0]);
	foreach (@insns) { eval; }		# remaining instructions
	&movdqa		(16*$j."(%rsp)",$t2);
}

for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&cmpb	($SZ-1+16*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
}
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A
	mov	$_inp,$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	lea	16*$SZ($inp),$inp
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_ssse3

	xorps	%xmm0, %xmm0
	lea	$framesz+6*8(%rbp),%r11
.cfi_def_cfa	%r11,8
	movaps	%xmm0, 0x00(%rsp)	# scrub the stack
	movaps	%xmm0, 0x10(%rsp)
	movaps	%xmm0, 0x20(%rsp)
	movaps	%xmm0, 0x30(%rsp)
___
$code.=<<___ if ($win64);
	movaps	0x20(%rbp),%xmm6
	movaps	0x30(%rbp),%xmm7
	movaps	0x40(%rbp),%xmm8
	movaps	0x50(%rbp),%xmm9
___
$code.=<<___;
	mov	$framesz(%rbp),%r15
.cfi_restore	%r15
	mov	-40(%r11),%r14
.cfi_restore	%r14
	mov	-32(%r11),%r13
.cfi_restore	%r13
	mov	-24(%r11),%r12
.cfi_restore	%r12
	mov	-16(%r11),%rbx
.cfi_restore	%rbx
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
.cfi_epilogue
	lea	(%r11),%rsp
	ret
.cfi_endproc
.size	${func},.-${func}
___
}
}}}
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") :	# Win64 order
                               ("%rdi","%rsi","%rdx");	# Unix order
$code.=<<___;
.globl	${pre}sha256_emit
.hidden	${pre}sha256_emit
.type	${pre}sha256_emit,\@abi-omnipotent
.align	16
${pre}sha256_emit:
	mov	0($inp), %r8
	mov	8($inp), %r9
	mov	16($inp), %r10
	bswap	%r8
	mov	24($inp), %r11
	bswap	%r9
	mov	%r8d, 4($out)
	bswap	%r10
	mov	%r9d, 12($out)
	bswap	%r11
	mov	%r10d, 20($out)
	shr	\$32, %r8
	mov	%r11d, 28($out)
	shr	\$32, %r9
	mov	%r8d, 0($out)
	shr	\$32, %r10
	mov	%r9d, 8($out)
	shr	\$32, %r11
	mov	%r10d, 16($out)
	mov	%r11d, 24($out)
	ret
.size	${pre}sha256_emit,.-${pre}sha256_emit

.globl	${pre}sha256_bcopy
.hidden	${pre}sha256_bcopy
.type	${pre}sha256_bcopy,\@abi-omnipotent
.align	16
${pre}sha256_bcopy:
	sub	$inp, $out
.Loop_bcopy:
	movzb	($inp), %eax
	lea	1($inp), $inp
	mov	%al, -1($out,$inp)
	dec	$len
	jnz	.Loop_bcopy
	ret
.size	${pre}sha256_bcopy,.-${pre}sha256_bcopy

.globl	${pre}sha256_hcopy
.hidden	${pre}sha256_hcopy
.type	${pre}sha256_hcopy,\@abi-omnipotent
.align	16
${pre}sha256_hcopy:
	mov	0($inp), %r8
	mov	8($inp), %r9
	mov	16($inp), %r10
	mov	24($inp), %r11
	mov	%r8, 0($out)
	mov	%r9, 8($out)
	mov	%r10, 16($out)
	mov	%r11, 24($out)
	ret
.size	${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
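# sha256op38 hand-assembles the SHA extension instructions for toolchains
# that do not know the mnemonics. Worked example (illustrative):
# "sha256msg1 %xmm1,%xmm2" becomes
#	.byte	0x0f,0x38,0xcc,0xd1
# where 0xd1 = 0xc0|1|(2<<3) places src %xmm1 in r/m and dst %xmm2 in reg.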

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
1781
blst/asm/x86_64-xlate.pl
Executable file
File diff suppressed because it is too large