initial stuff

John Doe 2022-09-09 02:47:49 -04:00
commit 943c07066e
99 changed files with 58786 additions and 0 deletions

blst/asm/add_mod_256-armv8.pl Executable file

@@ -0,0 +1,412 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);
@mod=map("x$_",(4..7));
@a=map("x$_",(8..11));
@b=map("x$_",(12..15));
@t=map("x$_",(16,17,1..3));
$code.=<<___;
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,%function
.align 5
add_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
adds @a[0],@a[0],@b[0]
ldp @b[2],@b[3],[$b_ptr,#16]
adcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
adcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@t[3],lo
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size add_mod_256,.-add_mod_256
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,%function
.align 5
mul_by_3_mod_256:
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
adds @a[0],@b[0],@b[0]
ldp @mod[0],@mod[1],[$b_ptr]
adcs @a[1],@b[1],@b[1]
ldp @mod[2],@mod[3],[$b_ptr,#16]
adcs @a[2],@b[2],@b[2]
adcs @a[3],@b[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
csel @a[3],@a[3],@t[3],lo
adds @a[0],@a[0],@b[0]
adcs @a[1],@a[1],@b[1]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@t[3],lo
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,%function
.align 5
lshift_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
.Loop_lshift_mod_256:
adds @a[0],@a[0],@a[0]
sub $b_ptr,$b_ptr,#1
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adc @t[4],xzr,xzr
subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
cbnz $b_ptr,.Loop_lshift_mod_256
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size lshift_mod_256,.-lshift_mod_256
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,%function
.align 5
rshift_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
.Loop_rshift:
adds @b[0],@a[0],@mod[0]
sub $b_ptr,$b_ptr,#1
adcs @b[1],@a[1],@mod[1]
adcs @b[2],@a[2],@mod[2]
adcs @b[3],@a[3],@mod[3]
adc @t[4],xzr,xzr
tst @a[0],#1
csel @b[0],@b[0],@a[0],ne
csel @b[1],@b[1],@a[1],ne
csel @b[2],@b[2],@a[2],ne
csel @b[3],@b[3],@a[3],ne
csel @t[4],@t[4],xzr,ne
extr @a[0],@b[1],@b[0],#1
extr @a[1],@b[2],@b[1],#1
extr @a[2],@b[3],@b[2],#1
extr @a[3],@t[4],@b[3],#1
cbnz $b_ptr,.Loop_rshift
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size rshift_mod_256,.-rshift_mod_256
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,%function
.align 5
cneg_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
subs @b[0],@mod[0],@a[0]
ldp @mod[2],@mod[3],[$n_ptr,#16]
orr @mod[0],@a[0],@a[1]
sbcs @b[1],@mod[1],@a[1]
orr @mod[1],@a[2],@a[3]
sbcs @b[2],@mod[2],@a[2]
orr @t[4],@mod[0],@mod[1]
sbc @b[3],@mod[3],@a[3]
cmp @t[4],#0
csetm @t[4],ne
ands $b_ptr,$b_ptr,@t[4]
csel @a[0],@a[0],@b[0],eq
csel @a[1],@a[1],@b[1],eq
csel @a[2],@a[2],@b[2],eq
stp @a[0],@a[1],[$r_ptr]
csel @a[3],@a[3],@b[3],eq
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size cneg_mod_256,.-cneg_mod_256
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,%function
.align 5
sub_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
subs @a[0],@a[0],@b[0]
ldp @b[2],@b[3],[$b_ptr,#16]
sbcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
sbcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
sbcs @a[3],@a[3],@b[3]
sbc @t[4],xzr,xzr
and @mod[0],@mod[0],@t[4]
and @mod[1],@mod[1],@t[4]
adds @a[0],@a[0],@mod[0]
and @mod[2],@mod[2],@t[4]
adcs @a[1],@a[1],@mod[1]
and @mod[3],@mod[3],@t[4]
adcs @a[2],@a[2],@mod[2]
stp @a[0],@a[1],[$r_ptr]
adc @a[3],@a[3],@mod[3]
stp @a[2],@a[3],[$r_ptr,#16]
ret
.size sub_mod_256,.-sub_mod_256
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,%function
.align 5
check_mod_256:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif
subs xzr,@a[0],@mod[0]
sbcs xzr,@a[1],@mod[1]
orr @a[0],@a[0],@a[1]
sbcs xzr,@a[2],@mod[2]
orr @a[0],@a[0],@a[2]
sbcs xzr,@a[3],@mod[3]
orr @a[0],@a[0],@a[3]
sbc $a_ptr,xzr,xzr
cmp @a[0],#0
mov x0,#1
csel x0,x0,xzr,ne
and x0,x0,$a_ptr
ret
.size check_mod_256,.-check_mod_256
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,%function
.align 5
add_n_check_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @b[0],@b[0]
rev @a[1],@a[1]
rev @b[1],@b[1]
rev @a[2],@a[2]
rev @b[2],@b[2]
rev @a[3],@a[3]
rev @b[3],@b[3]
#endif
adds @a[0],@a[0],@b[0]
ldp @mod[0],@mod[1],[$n_ptr]
adcs @a[1],@a[1],@b[1]
ldp @mod[2],@mod[3],[$n_ptr,#16]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adc @t[4],xzr,xzr
subs @t[0],@a[0],@mod[0]
sbcs @t[1],@a[1],@mod[1]
sbcs @t[2],@a[2],@mod[2]
sbcs @t[3],@a[3],@mod[3]
sbcs xzr,@t[4],xzr
csel @a[0],@a[0],@t[0],lo
csel @a[1],@a[1],@t[1],lo
csel @a[2],@a[2],@t[2],lo
csel @a[3],@a[3],@t[3],lo
orr @t[0], @a[0], @a[1]
orr @t[1], @a[2], @a[3]
orr @t[0], @t[0], @t[1]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
mov @t[1], #1
cmp @t[0], #0
csel x0, @t[1], xzr, ne
ret
.size add_n_check_mod_256,.-add_n_check_mod_256
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,%function
.align 5
sub_n_check_mod_256:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @b[0],@b[0]
rev @a[1],@a[1]
rev @b[1],@b[1]
rev @a[2],@a[2]
rev @b[2],@b[2]
rev @a[3],@a[3]
rev @b[3],@b[3]
#endif
subs @a[0],@a[0],@b[0]
sbcs @a[1],@a[1],@b[1]
ldp @mod[0],@mod[1],[$n_ptr]
sbcs @a[2],@a[2],@b[2]
ldp @mod[2],@mod[3],[$n_ptr,#16]
sbcs @a[3],@a[3],@b[3]
sbc @t[4],xzr,xzr
and @mod[0],@mod[0],@t[4]
and @mod[1],@mod[1],@t[4]
adds @a[0],@a[0],@mod[0]
and @mod[2],@mod[2],@t[4]
adcs @a[1],@a[1],@mod[1]
and @mod[3],@mod[3],@t[4]
adcs @a[2],@a[2],@mod[2]
adc @a[3],@a[3],@mod[3]
orr @t[0], @a[0], @a[1]
orr @t[1], @a[2], @a[3]
orr @t[0], @t[0], @t[1]
#ifdef __AARCH64EB__
rev @a[0],@a[0]
rev @a[1],@a[1]
rev @a[2],@a[2]
rev @a[3],@a[3]
#endif
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
mov @t[1], #1
cmp @t[0], #0
csel x0, @t[1], xzr, ne
ret
.size sub_n_check_mod_256,.-sub_n_check_mod_256
___
print $code;
close STDOUT;

blst/asm/add_mod_256-x86_64.pl Executable file

@@ -0,0 +1,547 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx");
$b_ptr = "%rbx";
{ ############################################################## 256 bits add
my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12));
$code.=<<___;
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,\@function,4,"unwind"
.align 32
add_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
.Loaded_a_add_mod_256:
add 8*0($b_org), @acc[0]
adc 8*1($b_org), @acc[1]
mov @acc[0], @acc[4]
adc 8*2($b_org), @acc[2]
mov @acc[1], @acc[5]
adc 8*3($b_org), @acc[3]
sbb $b_org, $b_org
mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, $b_org
cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
cmovc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
cmovc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size add_mod_256,.-add_mod_256
########################################################################
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,\@function,3,"unwind"
.align 32
mul_by_3_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue
mov $b_org,$n_ptr
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov $a_ptr,$b_org
mov 8*3($a_ptr), @acc[3]
call __lshift_mod_256
mov 0(%rsp),%r12
.cfi_restore %r12
jmp .Loaded_a_add_mod_256
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.type __lshift_mod_256,\@abi-omnipotent
.align 32
__lshift_mod_256:
add @acc[0], @acc[0]
adc @acc[1], @acc[1]
mov @acc[0], @acc[4]
adc @acc[2], @acc[2]
mov @acc[1], @acc[5]
adc @acc[3], @acc[3]
sbb @acc[8], @acc[8]
mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, @acc[8]
cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
cmovc @acc[6], @acc[2]
cmovc @acc[7], @acc[3]
ret
.size __lshift_mod_256,.-__lshift_mod_256
########################################################################
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,\@function,4,"unwind"
.align 32
lshift_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
.Loop_lshift_mod_256:
call __lshift_mod_256
dec %edx
jnz .Loop_lshift_mod_256
mov @acc[0], 8*0($r_ptr)
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 0(%rsp),%r12
.cfi_restore %r12
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size lshift_mod_256,.-lshift_mod_256
########################################################################
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,\@function,4,"unwind"
.align 32
rshift_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[7]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
.Loop_rshift_mod_256:
mov @acc[7], @acc[0]
and \$1, @acc[7]
mov 8*0($n_ptr), @acc[4]
neg @acc[7]
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
and @acc[7], @acc[4]
and @acc[7], @acc[5]
and @acc[7], @acc[6]
and 8*3($n_ptr), @acc[7]
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
sbb @acc[4], @acc[4]
shr \$1, @acc[0]
mov @acc[1], @acc[7]
shr \$1, @acc[1]
mov @acc[2], @acc[6]
shr \$1, @acc[2]
mov @acc[3], @acc[5]
shr \$1, @acc[3]
shl \$63, @acc[7]
shl \$63, @acc[6]
or @acc[0], @acc[7]
shl \$63, @acc[5]
or @acc[6], @acc[1]
shl \$63, @acc[4]
or @acc[5], @acc[2]
or @acc[4], @acc[3]
dec %edx
jnz .Loop_rshift_mod_256
mov @acc[7], 8*0($r_ptr)
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size rshift_mod_256,.-rshift_mod_256
########################################################################
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,\@function,4,"unwind"
.align 32
cneg_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
.cfi_end_prologue
mov 8*0($a_ptr), @acc[8] # load a[0:3]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov @acc[8], @acc[0]
mov 8*3($a_ptr), @acc[3]
or @acc[1], @acc[8]
or @acc[2], @acc[8]
or @acc[3], @acc[8]
mov \$-1, @acc[7]
mov 8*0($n_ptr), @acc[4] # load n[0:3]
cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
and @acc[8], @acc[4] # n[0:3] &= mask
mov 8*3($n_ptr), @acc[7]
and @acc[8], @acc[5]
and @acc[8], @acc[6]
and @acc[8], @acc[7]
sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0
sbb @acc[1], @acc[5]
sbb @acc[2], @acc[6]
sbb @acc[3], @acc[7]
or $b_org, $b_org # check condition flag
cmovz @acc[0], @acc[4] # flag ? n[0:3]-a[0:3] : a[0:3]
cmovz @acc[1], @acc[5]
mov @acc[4], 8*0($r_ptr)
cmovz @acc[2], @acc[6]
mov @acc[5], 8*1($r_ptr)
cmovz @acc[3], @acc[7]
mov @acc[6], 8*2($r_ptr)
mov @acc[7], 8*3($r_ptr)
mov 0(%rsp),%r12
.cfi_restore %r12
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size cneg_mod_256,.-cneg_mod_256
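# A minimal reference sketch of the conditional negation above (hypothetical
# Python, for illustration only): when the flag is set and the input is
# non-zero the result is n-a, otherwise the input is returned unchanged, so
# zero never turns into n.
#
#   def cneg_mod_256(a, flag, n):
#       return (n - a) if (flag and a != 0) else a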
########################################################################
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,\@function,4,"unwind"
.align 32
sub_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
sub 8*0($b_org), @acc[0]
mov 8*0($n_ptr), @acc[4]
sbb 8*1($b_org), @acc[1]
mov 8*1($n_ptr), @acc[5]
sbb 8*2($b_org), @acc[2]
mov 8*2($n_ptr), @acc[6]
sbb 8*3($b_org), @acc[3]
mov 8*3($n_ptr), @acc[7]
sbb $b_org, $b_org
and $b_org, @acc[4]
and $b_org, @acc[5]
and $b_org, @acc[6]
and $b_org, @acc[7]
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
adc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
adc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size sub_mod_256,.-sub_mod_256
########################################################################
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,\@function,2,"unwind"
.align 32
check_mod_256:
.cfi_startproc
mov 8*0($r_ptr), %rax
mov 8*1($r_ptr), @acc[1]
mov 8*2($r_ptr), @acc[2]
mov 8*3($r_ptr), @acc[3]
mov %rax, @acc[0] # see if it's zero
or @acc[1], %rax
or @acc[2], %rax
or @acc[3], %rax
sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow?
sbb 8*1($a_ptr), @acc[1]
sbb 8*2($a_ptr), @acc[2]
sbb 8*3($a_ptr), @acc[3]
sbb $a_ptr, $a_ptr
mov \$1, %rdx
cmp \$0, %rax
cmovne %rdx, %rax
and $a_ptr, %rax
.cfi_epilogue
ret
.cfi_endproc
.size check_mod_256,.-check_mod_256
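# Behaviour sketch (hypothetical Python, inferred from the flow above): the
# return value is 1 only for a non-zero input that is fully reduced,
# i.e. 0 < a < n.
#
#   def check_mod_256(a, n):
#       return int(a != 0 and a < n)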
########################################################################
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,\@function,4,"unwind"
.align 32
add_n_check_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
add 8*0($b_org), @acc[0]
adc 8*1($b_org), @acc[1]
mov @acc[0], @acc[4]
adc 8*2($b_org), @acc[2]
mov @acc[1], @acc[5]
adc 8*3($b_org), @acc[3]
sbb $b_org, $b_org
mov @acc[2], @acc[6]
sub 8*0($n_ptr), @acc[0]
sbb 8*1($n_ptr), @acc[1]
sbb 8*2($n_ptr), @acc[2]
mov @acc[3], @acc[7]
sbb 8*3($n_ptr), @acc[3]
sbb \$0, $b_org
cmovc @acc[4], @acc[0]
cmovc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
cmovc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
cmovc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
or @acc[1], @acc[0]
or @acc[3], @acc[2]
or @acc[2], @acc[0]
mov \$1, %rax
cmovz @acc[0], %rax
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size add_n_check_mod_256,.-add_n_check_mod_256
########################################################################
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,\@function,4,"unwind"
.align 32
sub_n_check_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
sub 8*0($b_org), @acc[0]
mov 8*0($n_ptr), @acc[4]
sbb 8*1($b_org), @acc[1]
mov 8*1($n_ptr), @acc[5]
sbb 8*2($b_org), @acc[2]
mov 8*2($n_ptr), @acc[6]
sbb 8*3($b_org), @acc[3]
mov 8*3($n_ptr), @acc[7]
sbb $b_org, $b_org
and $b_org, @acc[4]
and $b_org, @acc[5]
and $b_org, @acc[6]
and $b_org, @acc[7]
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
mov @acc[0], 8*0($r_ptr)
adc @acc[6], @acc[2]
mov @acc[1], 8*1($r_ptr)
adc @acc[7], @acc[3]
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
or @acc[1], @acc[0]
or @acc[3], @acc[2]
or @acc[2], @acc[0]
mov \$1, %rax
cmovz @acc[0], %rax
mov 8(%rsp),%rbx
.cfi_restore %rbx
mov 16(%rsp),%rbp
.cfi_restore %rbp
lea 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.cfi_epilogue
ret
.cfi_endproc
.size sub_n_check_mod_256,.-sub_n_check_mod_256
___
}
print $code;
close STDOUT;

blst/asm/add_mod_384-armv8.pl Executable file

@@ -0,0 +1,872 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3);
@mod=map("x$_",(4..9));
@a=map("x$_",(10..15));
@b=map("x$_",(16,17,19..22));
$carry=$n_ptr;
$code.=<<___;
.text
.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
add_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size add_mod_384,.-add_mod_384
.type __add_mod_384,%function
.align 5
__add_mod_384:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @b[4],@b[5],[$b_ptr,#32]
__add_mod_384_ab_are_loaded:
adds @a[0],@a[0],@b[0]
adcs @a[1],@a[1],@b[1]
adcs @a[2],@a[2],@b[2]
adcs @a[3],@a[3],@b[3]
adcs @a[4],@a[4],@b[4]
adcs @a[5],@a[5],@b[5]
adc $carry,xzr,xzr
subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs @b[4],@a[4],@mod[4]
sbcs @b[5],@a[5],@mod[5]
sbcs xzr,$carry,xzr
csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
csel @a[4],@a[4],@b[4],lo
csel @a[5],@a[5],@b[5],lo
ret
.size __add_mod_384,.-__add_mod_384
.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
add_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __add_mod_384
stp @a[0],@a[1],[$r_ptr]
add $a_ptr,$a_ptr,#48
stp @a[2],@a[3],[$r_ptr,#16]
add $b_ptr,$b_ptr,#48
stp @a[4],@a[5],[$r_ptr,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size add_mod_384x,.-add_mod_384x
.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
rshift_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
.Loop_rshift_mod_384:
sub $b_ptr,$b_ptr,#1
bl __rshift_mod_384
cbnz $b_ptr,.Loop_rshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size rshift_mod_384,.-rshift_mod_384
.type __rshift_mod_384,%function
.align 5
__rshift_mod_384:
sbfx @b[5],@a[0],#0,#1
and @b[0],@b[5],@mod[0]
and @b[1],@b[5],@mod[1]
adds @a[0],@a[0],@b[0]
and @b[2],@b[5],@mod[2]
adcs @a[1],@a[1],@b[1]
and @b[3],@b[5],@mod[3]
adcs @a[2],@a[2],@b[2]
and @b[4],@b[5],@mod[4]
adcs @a[3],@a[3],@b[3]
and @b[5],@b[5],@mod[5]
adcs @a[4],@a[4],@b[4]
extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1
adcs @a[5],@a[5],@b[5]
extr @a[1],@a[2],@a[1],#1
adc @b[5],xzr,xzr
extr @a[2],@a[3],@a[2],#1
extr @a[3],@a[4],@a[3],#1
extr @a[4],@a[5],@a[4],#1
extr @a[5],@b[5],@a[5],#1
ret
.size __rshift_mod_384,.-__rshift_mod_384
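// Reference sketch of the helper above (hypothetical Python, illustration
// only): for an odd modulus n, division by 2 is "make the value even, then
// shift right by one".
//
//   def div_by_2_mod(a, n):
//       return (a + n) >> 1 if a & 1 else a >> 1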
.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
div_by_2_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __rshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size div_by_2_mod_384,.-div_by_2_mod_384
.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
lshift_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
.Loop_lshift_mod_384:
sub $b_ptr,$b_ptr,#1
bl __lshift_mod_384
cbnz $b_ptr,.Loop_lshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size lshift_mod_384,.-lshift_mod_384
.type __lshift_mod_384,%function
.align 5
__lshift_mod_384:
adds @a[0],@a[0],@a[0]
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc $carry,xzr,xzr
subs @b[0],@a[0],@mod[0]
sbcs @b[1],@a[1],@mod[1]
sbcs @b[2],@a[2],@mod[2]
sbcs @b[3],@a[3],@mod[3]
sbcs @b[4],@a[4],@mod[4]
sbcs @b[5],@a[5],@mod[5]
sbcs xzr,$carry,xzr
csel @a[0],@a[0],@b[0],lo
csel @a[1],@a[1],@b[1],lo
csel @a[2],@a[2],@b[2],lo
csel @a[3],@a[3],@b[3],lo
csel @a[4],@a[4],@b[4],lo
csel @a[5],@a[5],@b[5],lo
ret
.size __lshift_mod_384,.-__lshift_mod_384
.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
mul_by_3_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384
.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
mul_by_8_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384
.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
mul_by_3_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
bl __add_mod_384_ab_are_loaded
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]
bl __lshift_mod_384
ldp @b[0],@b[1],[$a_ptr,#48]
ldp @b[2],@b[3],[$a_ptr,#64]
ldp @b[4],@b[5],[$a_ptr,#80]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x
.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
mul_by_8_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x
.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
cneg_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @mod[2],@mod[3],[$n_ptr,#16]
subs @b[0],@mod[0],@a[0]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @mod[4],@mod[5],[$n_ptr,#32]
orr $carry,@a[0],@a[1]
sbcs @b[1],@mod[1],@a[1]
orr $carry,$carry,@a[2]
sbcs @b[2],@mod[2],@a[2]
orr $carry,$carry,@a[3]
sbcs @b[3],@mod[3],@a[3]
orr $carry,$carry,@a[4]
sbcs @b[4],@mod[4],@a[4]
orr $carry,$carry,@a[5]
sbc @b[5],@mod[5],@a[5]
cmp $carry,#0
csetm $carry,ne
ands $b_ptr,$b_ptr,$carry
csel @a[0],@a[0],@b[0],eq
csel @a[1],@a[1],@b[1],eq
csel @a[2],@a[2],@b[2],eq
csel @a[3],@a[3],@b[3],eq
stp @a[0],@a[1],[$r_ptr]
csel @a[4],@a[4],@b[4],eq
stp @a[2],@a[3],[$r_ptr,#16]
csel @a[5],@a[5],@b[5],eq
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size cneg_mod_384,.-cneg_mod_384
.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
sub_mod_384:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
stp @a[4],@a[5],[$r_ptr,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sub_mod_384,.-sub_mod_384
.type __sub_mod_384,%function
.align 5
__sub_mod_384:
ldp @a[0],@a[1],[$a_ptr]
ldp @b[0],@b[1],[$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
ldp @b[2],@b[3],[$b_ptr,#16]
ldp @a[4],@a[5],[$a_ptr,#32]
ldp @b[4],@b[5],[$b_ptr,#32]
subs @a[0],@a[0],@b[0]
sbcs @a[1],@a[1],@b[1]
sbcs @a[2],@a[2],@b[2]
sbcs @a[3],@a[3],@b[3]
sbcs @a[4],@a[4],@b[4]
sbcs @a[5],@a[5],@b[5]
sbc $carry,xzr,xzr
and @b[0],@mod[0],$carry
and @b[1],@mod[1],$carry
adds @a[0],@a[0],@b[0]
and @b[2],@mod[2],$carry
adcs @a[1],@a[1],@b[1]
and @b[3],@mod[3],$carry
adcs @a[2],@a[2],@b[2]
and @b[4],@mod[4],$carry
adcs @a[3],@a[3],@b[3]
and @b[5],@mod[5],$carry
adcs @a[4],@a[4],@b[4]
adc @a[5],@a[5],@b[5]
ret
.size __sub_mod_384,.-__sub_mod_384
.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
sub_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$n_ptr]
ldp @mod[2],@mod[3],[$n_ptr,#16]
ldp @mod[4],@mod[5],[$n_ptr,#32]
bl __sub_mod_384
stp @a[0],@a[1],[$r_ptr]
add $a_ptr,$a_ptr,#48
stp @a[2],@a[3],[$r_ptr,#16]
add $b_ptr,$b_ptr,#48
stp @a[4],@a[5],[$r_ptr,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sub_mod_384x,.-sub_mod_384x
.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
mul_by_1_plus_i_mod_384x:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
ldp @mod[4],@mod[5],[$b_ptr,#32]
add $b_ptr,$a_ptr,#48
bl __sub_mod_384 // a->re - a->im
ldp @b[0],@b[1],[$a_ptr]
ldp @b[2],@b[3],[$a_ptr,#16]
ldp @b[4],@b[5],[$a_ptr,#32]
stp @a[0],@a[1],[$r_ptr]
ldp @a[0],@a[1],[$a_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#16]
ldp @a[2],@a[3],[$a_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#32]
ldp @a[4],@a[5],[$a_ptr,#80]
bl __add_mod_384_ab_are_loaded // a->re + a->im
ldr x30,[sp,#8]
stp @a[0],@a[1],[$r_ptr,#48]
stp @a[2],@a[3],[$r_ptr,#64]
stp @a[4],@a[5],[$r_ptr,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
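// Note on the arithmetic above (illustration only): multiplying re + im*i
// by 1 + i gives (re - im) + (re + im)*i, so the real half is a modular
// subtraction and the imaginary half a modular addition.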
.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
sgn0_pty_mod_384:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @a[4],@a[5],[$r_ptr,#32]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
ldp @mod[4],@mod[5],[$a_ptr,#32]
and $r_ptr,@a[0],#1
adds @a[0],@a[0],@a[0]
adcs @a[1],@a[1],@a[1]
adcs @a[2],@a[2],@a[2]
adcs @a[3],@a[3],@a[3]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc $carry,xzr,xzr
subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc $carry,$carry,xzr
mvn $carry,$carry
and $carry,$carry,#2
orr $r_ptr,$r_ptr,$carry
ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384
.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
sgn0_pty_mod_384x:
ldp @a[0],@a[1],[$r_ptr]
ldp @a[2],@a[3],[$r_ptr,#16]
ldp @a[4],@a[5],[$r_ptr,#32]
ldp @mod[0],@mod[1],[$a_ptr]
ldp @mod[2],@mod[3],[$a_ptr,#16]
ldp @mod[4],@mod[5],[$a_ptr,#32]
and $b_ptr,@a[0],#1
orr $n_ptr,@a[0],@a[1]
adds @a[0],@a[0],@a[0]
orr $n_ptr,$n_ptr,@a[2]
adcs @a[1],@a[1],@a[1]
orr $n_ptr,$n_ptr,@a[3]
adcs @a[2],@a[2],@a[2]
orr $n_ptr,$n_ptr,@a[4]
adcs @a[3],@a[3],@a[3]
orr $n_ptr,$n_ptr,@a[5]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc @b[0],xzr,xzr
subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc @b[0],@b[0],xzr
ldp @a[0],@a[1],[$r_ptr,#48]
ldp @a[2],@a[3],[$r_ptr,#64]
ldp @a[4],@a[5],[$r_ptr,#80]
mvn @b[0],@b[0]
and @b[0],@b[0],#2
orr $b_ptr,$b_ptr,@b[0]
and $r_ptr,@a[0],#1
orr $a_ptr,@a[0],@a[1]
adds @a[0],@a[0],@a[0]
orr $a_ptr,$a_ptr,@a[2]
adcs @a[1],@a[1],@a[1]
orr $a_ptr,$a_ptr,@a[3]
adcs @a[2],@a[2],@a[2]
orr $a_ptr,$a_ptr,@a[4]
adcs @a[3],@a[3],@a[3]
orr $a_ptr,$a_ptr,@a[5]
adcs @a[4],@a[4],@a[4]
adcs @a[5],@a[5],@a[5]
adc @b[0],xzr,xzr
subs @a[0],@a[0],@mod[0]
sbcs @a[1],@a[1],@mod[1]
sbcs @a[2],@a[2],@mod[2]
sbcs @a[3],@a[3],@mod[3]
sbcs @a[4],@a[4],@mod[4]
sbcs @a[5],@a[5],@mod[5]
sbc @b[0],@b[0],xzr
mvn @b[0],@b[0]
and @b[0],@b[0],#2
orr $r_ptr,$r_ptr,@b[0]
cmp $n_ptr,#0
csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re)
cmp $a_ptr,#0
csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)
and $n_ptr,$n_ptr,#1
and $a_ptr,$a_ptr,#2
orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity
ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
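// Packing sketch (hypothetical Python, illustration only): bit 0 is a
// parity, bit 1 is set when the doubled value is not less than the modulus,
// with the tie-breaking spelled out in the comments above.
//
//   def sgn0_pty_mod_384x(re, im, n):
//       prty = (im & 1) if re == 0 else (re & 1)
//       sgn  = (2*im >= n) if im != 0 else (2*re >= n)
//       return (int(sgn) << 1) | prty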
___
if (1) {
sub vec_select {
my $sz = shift;
my @v=map("v$_",(0..5,16..21));
$code.=<<___;
.globl vec_select_$sz
.hidden vec_select_$sz
.type vec_select_$sz,%function
.align 5
vec_select_$sz:
dup v6.2d, $n_ptr
ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
cmeq v6.2d, v6.2d, #0
ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
___
for($i=0; $i<$sz-48; $i+=48) {
$code.=<<___;
bit @v[0].16b, @v[3].16b, v6.16b
ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
bit @v[1].16b, @v[4].16b, v6.16b
ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
bit @v[2].16b, @v[5].16b, v6.16b
st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
___
@v = @v[6..11,0..5];
}
$code.=<<___;
bit @v[0].16b, @v[3].16b, v6.16b
bit @v[1].16b, @v[4].16b, v6.16b
bit @v[2].16b, @v[5].16b, v6.16b
st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
ret
.size vec_select_$sz,.-vec_select_$sz
___
}
vec_select(48);
vec_select(96);
vec_select(192);
vec_select(144);
vec_select(288);
}
{
my ($inp, $end, $step) = map("x$_", (0..2));
$code.=<<___;
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
vec_prefetch:
add $end, $end, $inp
sub $end, $end, #1
mov $step, #64
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
csel $step, xzr, $step, hi
prfm pldl1keep, [$inp]
add $inp, $inp, $step
cmp $inp, $end
csel $inp, $end, $inp, hi
prfm pldl1keep, [$inp]
ret
.size vec_prefetch,.-vec_prefetch
___
}
print $code;
close STDOUT;

blst/asm/add_mod_384-x86_64.pl Executable file

File diff suppressed because it is too large

@@ -0,0 +1,260 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";
# common accumulator layout
@acc=map("%r$_",(8..15));
############################################################ 384x384 add/sub
# Double-width addition/subtraction modulo n<<384, as opposed to
# naively expected modulo n*n. It works because n<<384 is the actual
# input boundary condition for Montgomery reduction, not n*n.
# Just in case, this is duplicated, but only one module is
# supposed to be linked...
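# A minimal illustration (hypothetical Python, in the spirit of the reference
# snippets used elsewhere in this tree), assuming both inputs are already
# below n<<384 as the Montgomery boundary condition guarantees:
#
#   def add_mod_384x384(a, b, n):
#       t = a + b
#       return t - (n << 384) if t >= (n << 384) else t
#
#   def sub_mod_384x384(a, b, n):
#       t = a - b
#       return t + (n << 384) if t < 0 else t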
{
my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected
# except for $n_ptr and $r_ptr
$code.=<<___;
.text
.type __add_mod_384x384,\@abi-omnipotent
.align 32
__add_mod_384x384:
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov 8*4($a_ptr), @acc[4]
mov 8*5($a_ptr), @acc[5]
mov 8*6($a_ptr), @acc[6]
add 8*0($b_org), @acc[0]
mov 8*7($a_ptr), @acc[7]
adc 8*1($b_org), @acc[1]
mov 8*8($a_ptr), @acc[8]
adc 8*2($b_org), @acc[2]
mov 8*9($a_ptr), @acc[9]
adc 8*3($b_org), @acc[3]
mov 8*10($a_ptr), @acc[10]
adc 8*4($b_org), @acc[4]
mov 8*11($a_ptr), @acc[11]
adc 8*5($b_org), @acc[5]
mov @acc[0], 8*0($r_ptr)
adc 8*6($b_org), @acc[6]
mov @acc[1], 8*1($r_ptr)
adc 8*7($b_org), @acc[7]
mov @acc[2], 8*2($r_ptr)
adc 8*8($b_org), @acc[8]
mov @acc[4], 8*4($r_ptr)
mov @acc[6], @acc[0]
adc 8*9($b_org), @acc[9]
mov @acc[3], 8*3($r_ptr)
mov @acc[7], @acc[1]
adc 8*10($b_org), @acc[10]
mov @acc[5], 8*5($r_ptr)
mov @acc[8], @acc[2]
adc 8*11($b_org), @acc[11]
mov @acc[9], @acc[3]
sbb $b_org, $b_org
sub 8*0($n_ptr), @acc[6]
sbb 8*1($n_ptr), @acc[7]
mov @acc[10], @acc[4]
sbb 8*2($n_ptr), @acc[8]
sbb 8*3($n_ptr), @acc[9]
sbb 8*4($n_ptr), @acc[10]
mov @acc[11], @acc[5]
sbb 8*5($n_ptr), @acc[11]
sbb \$0, $b_org
cmovc @acc[0], @acc[6]
cmovc @acc[1], @acc[7]
cmovc @acc[2], @acc[8]
mov @acc[6], 8*6($r_ptr)
cmovc @acc[3], @acc[9]
mov @acc[7], 8*7($r_ptr)
cmovc @acc[4], @acc[10]
mov @acc[8], 8*8($r_ptr)
cmovc @acc[5], @acc[11]
mov @acc[9], 8*9($r_ptr)
mov @acc[10], 8*10($r_ptr)
mov @acc[11], 8*11($r_ptr)
ret
.size __add_mod_384x384,.-__add_mod_384x384
.type __sub_mod_384x384,\@abi-omnipotent
.align 32
__sub_mod_384x384:
mov 8*0($a_ptr), @acc[0]
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov 8*4($a_ptr), @acc[4]
mov 8*5($a_ptr), @acc[5]
mov 8*6($a_ptr), @acc[6]
sub 8*0($b_org), @acc[0]
mov 8*7($a_ptr), @acc[7]
sbb 8*1($b_org), @acc[1]
mov 8*8($a_ptr), @acc[8]
sbb 8*2($b_org), @acc[2]
mov 8*9($a_ptr), @acc[9]
sbb 8*3($b_org), @acc[3]
mov 8*10($a_ptr), @acc[10]
sbb 8*4($b_org), @acc[4]
mov 8*11($a_ptr), @acc[11]
sbb 8*5($b_org), @acc[5]
mov @acc[0], 8*0($r_ptr)
sbb 8*6($b_org), @acc[6]
mov 8*0($n_ptr), @acc[0]
mov @acc[1], 8*1($r_ptr)
sbb 8*7($b_org), @acc[7]
mov 8*1($n_ptr), @acc[1]
mov @acc[2], 8*2($r_ptr)
sbb 8*8($b_org), @acc[8]
mov 8*2($n_ptr), @acc[2]
mov @acc[3], 8*3($r_ptr)
sbb 8*9($b_org), @acc[9]
mov 8*3($n_ptr), @acc[3]
mov @acc[4], 8*4($r_ptr)
sbb 8*10($b_org), @acc[10]
mov 8*4($n_ptr), @acc[4]
mov @acc[5], 8*5($r_ptr)
sbb 8*11($b_org), @acc[11]
mov 8*5($n_ptr), @acc[5]
sbb $b_org, $b_org
and $b_org, @acc[0]
and $b_org, @acc[1]
and $b_org, @acc[2]
and $b_org, @acc[3]
and $b_org, @acc[4]
and $b_org, @acc[5]
add @acc[0], @acc[6]
adc @acc[1], @acc[7]
mov @acc[6], 8*6($r_ptr)
adc @acc[2], @acc[8]
mov @acc[7], 8*7($r_ptr)
adc @acc[3], @acc[9]
mov @acc[8], 8*8($r_ptr)
adc @acc[4], @acc[10]
mov @acc[9], 8*9($r_ptr)
adc @acc[5], @acc[11]
mov @acc[10], 8*10($r_ptr)
mov @acc[11], 8*11($r_ptr)
ret
.size __sub_mod_384x384,.-__sub_mod_384x384
.globl add_mod_384x384
.hidden add_mod_384x384
.type add_mod_384x384,\@function,4,"unwind"
.align 32
add_mod_384x384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
call __add_mod_384x384
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size add_mod_384x384,.-add_mod_384x384
.globl sub_mod_384x384
.hidden sub_mod_384x384
.type sub_mod_384x384,\@function,4,"unwind"
.align 32
sub_mod_384x384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
call __sub_mod_384x384
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sub_mod_384x384,.-sub_mod_384x384
___
}
print $code;
close STDOUT;

blst/asm/arm-xlate.pl Executable file

@@ -0,0 +1,381 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ARM assembler distiller/adapter by \@dot-asm.
use strict;
################################################################
# Recognized "flavour"-s are:
#
# linux[32|64] GNU assembler, effectively pass-through
# ios[32|64] global symbols' decorations, PIC tweaks, etc.
# win[32|64] Visual Studio armasm-specific directives
# coff[32|64] e.g. clang --target=arm-windows ...
#
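# Typical invocations (illustrative; the generator scripts in this directory
# normally pipe their output through this adapter automatically):
#
#   perl add_mod_256-armv8.pl linux64 add_mod_256-armv8.S
#   perl arm-xlate.pl coff64 out.S < in.S
#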
my $flavour = shift;
$flavour = "linux" if (!$flavour or $flavour eq "void");
my $output = shift;
open STDOUT,">$output" or die "can't open $output: $!";
my %GLOBALS;
my $dotinlocallabels = ($flavour !~ /ios/) ? 1 : 0;
my $in_proc; # used with 'windows' flavour
################################################################
# directives which need special treatment on different platforms
################################################################
my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch
my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu
my $rodata = sub {
SWITCH: for ($flavour) {
/linux/ && return ".section\t.rodata";
/ios/ && return ".section\t__TEXT,__const";
/coff/ && return ".section\t.rdata,\"dr\"";
/win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8";
last;
}
};
my $hidden = sub {
if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); }
} if ($flavour !~ /linux/);
my $comm = sub {
my @args = split(/,\s*/,shift);
my $name = @args[0];
my $global = \$GLOBALS{$name};
my $ret;
if ($flavour =~ /ios32/) {
$ret = ".comm\t_$name,@args[1]\n";
$ret .= ".non_lazy_symbol_pointer\n";
$ret .= "$name:\n";
$ret .= ".indirect_symbol\t_$name\n";
$ret .= ".long\t0\n";
$ret .= ".previous";
$name = "_$name";
} elsif ($flavour =~ /win/) {
$ret = "\tCOMMON\t|$name|,@args[1]";
} elsif ($flavour =~ /coff/) {
$ret = ".comm\t$name,@args[1]";
} else {
$ret = ".comm\t".join(',',@args);
}
$$global = $name;
$ret;
};
my $globl = sub {
my $name = shift;
my $global = \$GLOBALS{$name};
my $ret;
SWITCH: for ($flavour) {
/ios/ && do { $name = "_$name"; last; };
/win/ && do { $ret = ""; last; };
}
$ret = ".globl $name" if (!defined($ret));
$$global = $name;
$ret;
};
my $global = $globl;
my $extern = sub {
&$globl(@_);
if ($flavour =~ /win/) {
return "\tEXTERN\t@_";
}
return; # return nothing
};
my $type = sub {
my $arg = join(',',@_);
my $ret;
SWITCH: for ($flavour) {
/ios32/ && do { if ($arg =~ /(\w+),\s*%function/) {
$ret = "#ifdef __thumb2__\n" .
".thumb_func $1\n" .
"#endif";
}
last;
};
/win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) {
my $type = "[DATA]";
if ($2 eq "function") {
$in_proc = $1;
$type = "[FUNC]";
}
$ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type"
: "";
}
last;
};
/coff/ && do { if ($arg =~ /(\w+),\s*%function/) {
$ret = ".def $1;\n".
".type 32;\n".
".endef";
}
last;
};
}
return $ret;
} if ($flavour !~ /linux/);
my $size = sub {
if ($in_proc && $flavour =~ /win/) {
$in_proc = undef;
return "\tENDP";
}
} if ($flavour !~ /linux/);
my $inst = sub {
if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); }
else { ".long\t".join(',',@_); }
} if ($flavour !~ /linux/);
my $asciz = sub {
my $line = join(",",@_);
if ($line =~ /^"(.*)"$/)
{ if ($flavour =~ /win/) {
"\tDCB\t$line,0\n\tALIGN\t4";
} else {
".byte " . join(",",unpack("C*",$1),0) . "\n.align 2";
}
} else { ""; }
};
my $align = sub {
"\tALIGN\t".2**@_[0];
} if ($flavour =~ /win/);
$align = sub {
".p2align\t".@_[0];
} if ($flavour =~ /coff/);
my $byte = sub {
"\tDCB\t".join(',',@_);
} if ($flavour =~ /win/);
my $short = sub {
"\tDCWU\t".join(',',@_);
} if ($flavour =~ /win/);
my $word = sub {
"\tDCDU\t".join(',',@_);
} if ($flavour =~ /win/);
my $long = $word if ($flavour =~ /win/);
my $quad = sub {
"\tDCQU\t".join(',',@_);
} if ($flavour =~ /win/);
my $skip = sub {
"\tSPACE\t".shift;
} if ($flavour =~ /win/);
my $code = sub {
"\tCODE@_[0]";
} if ($flavour =~ /win/);
my $thumb = sub { # .thumb should appear prior to .text in source
"# define ARM THUMB\n" .
"\tTHUMB";
} if ($flavour =~ /win/);
my $text = sub {
"\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM");
} if ($flavour =~ /win/);
my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax
my $rva = sub {
# .rva directive comes in handy only on 32-bit Windows, i.e. it can
# be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections.
# However! Corresponding compilers don't seem to bet on PIC, which
# raises the question of why an assembler programmer would have to jump
# through the hoops. But just in case, it would go as follows:
#
# ldr r1,.LOPENSSL_armcap
# ldr r2,.LOPENSSL_armcap+4
# adr r0,.LOPENSSL_armcap
# bic r1,r1,#1 ; de-thumb-ify link.exe's ideas
# sub r0,r0,r1 ; r0 is image base now
# ldr r0,[r0,r2]
# ...
#.LOPENSSL_armcap:
# .rva .LOPENSSL_armcap ; self-reference
# .rva OPENSSL_armcap_P ; real target
#
# Non-position-independent [and ISA-neutral] alternative is so much
# simpler:
#
# ldr r0,.LOPENSSL_armcap
# ldr r0,[r0]
# ...
#.LOPENSSL_armcap:
# .long OPENSSL_armcap_P
#
"\tDCDU\t@_[0]\n\tRELOC\t2"
} if ($flavour =~ /win(?!64)/);
################################################################
# some broken instructions in Visual Studio armasm[64]...
my $it = sub {} if ($flavour =~ /win32/); # omit 'it'
my $ext = sub {
"\text8\t".join(',',@_);
} if ($flavour =~ /win64/);
my $csel = sub {
my ($args,$comment) = split(m|\s*//|,shift);
my @regs = split(m|,\s*|,$args);
my $cond = pop(@regs);
"\tcsel$cond\t".join(',',@regs);
} if ($flavour =~ /win64/);
my $csetm = sub {
my ($args,$comment) = split(m|\s*//|,shift);
my @regs = split(m|,\s*|,$args);
my $cond = pop(@regs);
"\tcsetm$cond\t".join(',',@regs);
} if ($flavour =~ /win64/);
# ... then conditional branch instructions are also broken, but
# maintaining all the variants is tedious, so I kludge-fix it
# elsewhere...
################################################################
my $adrp = sub {
my ($args,$comment) = split(m|\s*//|,shift);
"\tadrp\t$args\@PAGE";
} if ($flavour =~ /ios64/);
my $paciasp = sub {
($flavour =~ /linux/) ? "\t.inst\t0xd503233f"
: &$inst(0xd503233f);
};
my $autiasp = sub {
($flavour =~ /linux/) ? "\t.inst\t0xd50323bf"
: &$inst(0xd50323bf);
};
sub range {
my ($r,$sfx,$start,$end) = @_;
join(",",map("$r$_$sfx",($start..$end)));
}
sub expand_line {
my $line = shift;
my @ret = ();
pos($line)=0;
while ($line =~ m/\G[^@\/\{\"]*/g) {
if ($line =~ m/\G(@|\/\/|$)/gc) {
last;
}
elsif ($line =~ m/\G\{/gc) {
my $saved_pos = pos($line);
$line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
pos($line) = $saved_pos;
$line =~ m/\G[^\}]*\}/g;
}
elsif ($line =~ m/\G\"/gc) {
$line =~ m/\G[^\"]*\"/g;
}
}
$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
if ($flavour =~ /win/) {
# adjust alignment hints, "[rN,:32]" -> "[rN@32]"
$line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/;
# adjust local labels, ".Lwhatever" -> "|$Lwhatever|"
$line =~ s/\.(L\w{2,})/|\$$1|/g;
# omit "#:lo12:" on win64
$line =~ s/#:lo12://;
} elsif ($flavour =~ /coff(?!64)/) {
$line =~ s/\.L(\w{2,})/(\$ML$1)/g;
} elsif ($flavour =~ /ios64/) {
$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
}
return $line;
}
while(my $line=<>) {
# fix up assembler-specific commentary delimiter
$line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/);
if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; }
$line =~ s|/\*.*\*/||; # get rid of C-style comments...
$line =~ s|^\s+||; # ... and skip white spaces in beginning...
$line =~ s|\s+$||; # ... and at the end
{
$line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel
$line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels);
}
{
$line =~ s|(^[\.\w]+)\:\s*||;
my $label = $1;
if ($label) {
$label = ($GLOBALS{$label} or $label);
if ($flavour =~ /win/) {
$label =~ s|^\.L(?=\w)|\$L|;
printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : "");
} else {
$label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/);
printf "%s:", $label;
}
}
}
if ($line !~ m/^[#@;]/) {
$line =~ s|^\s*(\.?)(\S+)\s*||;
my $c = $1; $c = "\t" if ($c eq "");
my $mnemonic = $2;
my $opcode;
if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
$opcode = eval("\$$1_$2");
} else {
$opcode = eval("\$$mnemonic");
}
my $arg=expand_line($line);
if (ref($opcode) eq 'CODE') {
$line = &$opcode($arg);
} elsif ($mnemonic) {
if ($flavour =~ /win64/) {
# "b.cond" -> "bcond", kludge-fix:-(
$mnemonic =~ s/^b\.([a-z]{2}$)/b$1/;
}
$line = $c.$mnemonic;
$line.= "\t$arg" if ($arg ne "");
}
}
print $line if ($line);
print "\n";
}
print "\tEND\n" if ($flavour =~ /win/);
close STDOUT;


@@ -0,0 +1,586 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~4,600 cycles on Apple M1, ~8,900 on
# Cortex-A57.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0
    k = 31
    mask = (1 << k) - 1
    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)
        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
        # __smul_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1
        # __smul_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1
    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1
    mod <<= 512 - mod.bit_length()  # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1 << 512:
        v -= mod
    return v & (2**512 - 1)  # to be reduced % mod
___
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(4..11));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17));
my $cnt = $n_ptr;
my @t = map("x$_",(19..26));
my ($a_lo, $b_lo) = @acc[3,7];
$frame = 16+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
paciasp
stp x29, x30, [sp,#-80]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
sub sp, sp, #$frame
ldp @acc[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
str $out_ptr, [sp]
ldp @acc[4], @acc[5], [$n_ptr,#8*0]
ldp @acc[6], @acc[7], [$n_ptr,#8*2]
stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b|
stp @acc[6], @acc[7], [$in_ptr,#8*6]
////////////////////////////////////////// first iteration
bl .Lab_approximation_31_256_loaded
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
str $f0,[$out_ptr,#8*8] // initialize |u| with |f0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to dst |b|
bl __smul_256_n_shift_by_31
str $f0, [$out_ptr,#8*9] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
ldr @acc[4], [$in_ptr,#8*8] // |u|
ldr @acc[5], [$in_ptr,#8*13] // |v|
madd @acc[0], $f_, @acc[4], xzr // |u|*|f0|
madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0|
str @acc[0], [$out_ptr,#8*4]
asr @acc[1], @acc[0], #63 // sign extension
stp @acc[1], @acc[1], [$out_ptr,#8*5]
stp @acc[1], @acc[1], [$out_ptr,#8*7]
madd @acc[0], $f0, @acc[4], xzr // |u|*|f1|
madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1|
str @acc[0], [$out_ptr,#8*9]
asr @acc[1], @acc[0], #63 // sign extension
stp @acc[1], @acc[1], [$out_ptr,#8*10]
stp @acc[1], @acc[1], [$out_ptr,#8*12]
___
for($i=2; $i<15; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add $out_ptr, $out_ptr, #8*4 // pointer to destination |u|
bl __smul_256x63
adc @t[3], @t[3], @t[4]
str @t[3], [$out_ptr,#8*4]
mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*5 // pointer to destination |v|
bl __smul_256x63
___
$code.=<<___ if ($i>7);
bl __smul_512x63_tail
___
$code.=<<___ if ($i<=7);
adc @t[3], @t[3], @t[4]
stp @t[3], @t[3], [$out_ptr,#8*4]
stp @t[3], @t[3], [$out_ptr,#8*6]
___
}
$code.=<<___;
////////////////////////////////////////// two[!] last iterations
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #47 // 31 + 512 % 31
//bl __ab_approximation_62_256 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
ldr $b_lo, [$in_ptr,#8*4]
bl __inner_loop_62_256
mov $f_, $f1
mov $g_, $g1
ldr $out_ptr, [sp] // original out_ptr
bl __smul_256x63
bl __smul_512x63_tail
ldr x30, [x29,#8]
smulh @t[1], @acc[3], $g_ // figure out top-most limb
ldp @acc[4], @acc[5], [$nx_ptr,#8*0]
adc @t[4], @t[4], @t[6]
ldp @acc[6], @acc[7], [$nx_ptr,#8*2]
add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1
asr @t[0], @t[1], #63 // sign as mask
and @t[4], @acc[4], @t[0] // add mod<<256 conditionally
and @t[5], @acc[5], @t[0]
adds @acc[0], @acc[0], @t[4]
and @t[6], @acc[6], @t[0]
adcs @acc[1], @acc[1], @t[5]
and @t[7], @acc[7], @t[0]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @t[3], @t[7]
adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1
neg @t[0], @t[1]
orr @t[1], @t[1], @t[0] // excess bit or sign as mask
asr @t[0], @t[0], #63 // excess bit as mask
and @acc[4], @acc[4], @t[1] // mask |mod|
and @acc[5], @acc[5], @t[1]
and @acc[6], @acc[6], @t[1]
and @acc[7], @acc[7], @t[1]
eor @acc[4], @acc[4], @t[0] // conditionally negate |mod|
eor @acc[5], @acc[5], @t[0]
adds @acc[4], @acc[4], @t[0], lsr#63
eor @acc[6], @acc[6], @t[0]
adcs @acc[5], @acc[5], xzr
eor @acc[7], @acc[7], @t[0]
adcs @acc[6], @acc[6], xzr
adc @acc[7], @acc[7], xzr
adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
stp @acc[0], @acc[1], [$out_ptr,#8*4]
adc @acc[3], @acc[3], @acc[7]
stp @acc[2], @acc[3], [$out_ptr,#8*6]
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldr x29, [sp],#80
autiasp
ret
.size ct_inverse_mod_256,.-ct_inverse_mod_256
////////////////////////////////////////////////////////////////////////
.type __smul_256x63, %function
.align 5
__smul_256x63:
___
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[4..7] if ($j);
my $k = 8*8+8*5*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldr @t[3+$j], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @t[3+$j], @t[3+$j], $f1
umulh @t[0], @acc[0], $f_
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $f_
adcs @t[3+$j], @t[3+$j], xzr
umulh @t[2], @acc[2], $f_
___
$code.=<<___ if ($j!=0);
adc $g1, xzr, xzr // used in __smul_512x63_tail
___
$code.=<<___;
mul @acc[0], @acc[0], $f_
cmp $f_, #0
mul @acc[1], @acc[1], $f_
csel @t[3+$j], @t[3+$j], xzr, ne
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @t[5+$j], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
adcs @t[5+$j], @t[5+$j], @t[2]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr
adds @acc[0], @acc[0], @acc[4]
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @t[5], @t[5], @t[6]
stp @acc[2], @t[5], [$out_ptr,#8*2]
ret
.size __smul_256x63,.-__smul_256x63
.type __smul_512x63_tail, %function
.align 5
__smul_512x63_tail:
umulh @t[5], @acc[3], $f_
ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v|
adc @t[7], @t[7], xzr
ldr @acc[3], [$in_ptr,#8*20]
and @t[3], @t[3], $f_
umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain
sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain
asr @t[6], @t[5], #63
eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v|
eor @acc[2], @acc[2], $f1
adds @acc[1], @acc[1], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
umulh @t[0], @t[4], $g_
adc @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $g_
add @acc[7], @acc[7], @t[7]
umulh @t[2], @acc[2], $g_
mul @acc[0], @t[4], $g_
mul @acc[1], @acc[1], $g_
adds @acc[0], @acc[0], @acc[7]
mul @acc[2], @acc[2], $g_
adcs @acc[1], @acc[1], @t[0]
mul @t[3], @acc[3], $g_
adcs @acc[2], @acc[2], @t[1]
adcs @t[3], @t[3], @t[2]
adc @t[4], xzr, xzr // used in the final step
adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*4]
adcs @t[3], @t[3], @t[6] // carry is used in the final step
stp @acc[2], @t[3], [$out_ptr,#8*6]
ret
.size __smul_512x63_tail,.-__smul_512x63_tail
.type __smul_256_n_shift_by_31, %function
.align 5
__smul_256_n_shift_by_31:
___
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[4..7] if ($j);
my $k = 8*4*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|)
eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|)
sub @t[6], @t[6], @t[5]
eor @acc[1], @acc[1], @t[5]
adds @acc[0], @acc[0], @t[5], lsr#63
eor @acc[2], @acc[2], @t[5]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[5]
umulh @t[0], @acc[0], @t[6]
adcs @acc[2], @acc[2], xzr
umulh @t[1], @acc[1], @t[6]
adc @acc[3], @acc[3], xzr
umulh @t[2], @acc[2], @t[6]
and @t[5], @t[5], @t[6]
umulh @t[3+$j], @acc[3], @t[6]
neg @t[5], @t[5]
mul @acc[0], @acc[0], @t[6]
mul @acc[1], @acc[1], @t[6]
mul @acc[2], @acc[2], @t[6]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], @t[1]
adcs @acc[3], @acc[3], @t[2]
adc @t[3+$j], @t[3+$j], @t[5]
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[4]
adcs @acc[1], @acc[1], @acc[5]
adcs @acc[2], @acc[2], @acc[6]
adcs @acc[3], @acc[3], @acc[7]
adc @acc[4], @t[3], @t[4]
extr @acc[0], @acc[1], @acc[0], #31
extr @acc[1], @acc[2], @acc[1], #31
extr @acc[2], @acc[3], @acc[2], #31
asr @t[4], @acc[4], #63 // result's sign as mask
extr @acc[3], @acc[4], @acc[3], #31
eor @acc[0], @acc[0], @t[4] // ensure the result is positive
eor @acc[1], @acc[1], @t[4]
adds @acc[0], @acc[0], @t[4], lsr#63
eor @acc[2], @acc[2], @t[4]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[4]
adcs @acc[2], @acc[2], xzr
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adc @acc[3], @acc[3], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
eor $f0, $f0, @t[4] // adjust |f/g| accordingly
eor $g0, $g0, @t[4]
sub $f0, $f0, @t[4]
sub $g0, $g0, @t[4]
ret
.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
___
{
my @a = @acc[0..3];
my @b = @acc[4..7];
my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]);
$code.=<<___;
.type __ab_approximation_31_256, %function
.align 4
__ab_approximation_31_256:
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*6]
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*4]
.Lab_approximation_31_256_loaded:
orr @t[0], @a[3], @b[3] // check top-most limbs, ...
cmp @t[0], #0
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
csel @a[2], @a[2], @a[1], ne
orr @t[0], @a[3], @b[3] // and ones before top-most, ...
csel @b[2], @b[2], @b[1], ne
cmp @t[0], #0
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
csel @a[2], @a[2], @a[0], ne
orr @t[0], @a[3], @b[3] // and one more, ...
csel @b[2], @b[2], @b[0], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[3], @a[3], @a[2], ne
csel @b[3], @b[3], @b[2], ne
neg @t[1], @t[0]
lslv @a[3], @a[3], @t[0] // align high limbs to the left
lslv @b[3], @b[3], @t[0]
lsrv @a[2], @a[2], @t[1]
lsrv @b[2], @b[2], @t[1]
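// note: @t[1] is -@t[0], so "@t[1], asr#6" is all-ones for shift amounts
// 1..63 and zero when @t[0] is 0; the masks below drop the lower-limb
// fill-in exactly when no alignment shift was needed (an lsrv by 0 would
// otherwise leave it intact and corrupt the orr below)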
and @a[2], @a[2], @t[1], asr#6
and @b[2], @b[2], @t[1], asr#6
orr $a_lo, @a[3], @a[2]
orr $b_lo, @b[3], @b[2]
bfxil $a_lo, @a[0], #0, #31
bfxil $b_lo, @b[0], #0, #31
b __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
mov $cnt, #31
mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov $bias,#0x7FFFFFFF7FFFFFFF
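// f and g are kept packed in a single register each (f in the low half,
// g in the high half); every half carries the 0x7FFFFFFF bias so per-half
// results stay non-negative and subtractions never borrow across the
// 32-bit boundary; the bias is removed when f|g are unpacked after the loop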
.Loop_31_256:
sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
and @t[0], $b_lo, @t[3]
sub @t[1], $b_lo, $a_lo // |b_|-|a_|
subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $fg1
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1|
csel $fg0, $fg0, @t[0], hs
lsr $a_lo, $a_lo, #1
and @t[0], $fg1, @t[3]
and @t[1], $bias, @t[3]
sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add $fg1, $fg1, $fg1 // |f1|<<=1
add $fg0, $fg0, @t[1]
sub $fg1, $fg1, $bias
cbnz $cnt, .Loop_31_256
mov $bias, #0x7FFFFFFF
ubfx $f0, $fg0, #0, #32
ubfx $g0, $fg0, #32, #32
ubfx $f1, $fg1, #0, #32
ubfx $g1, $fg1, #32, #32
sub $f0, $f0, $bias // remove bias
sub $g0, $g0, $bias
sub $f1, $f1, $bias
sub $g1, $g1, $bias
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1
.Loop_62_256:
sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
and @t[0], $b_lo, @t[3]
sub @t[1], $b_lo, $a_lo // |b_|-|a_|
subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
mov @t[1], $g0
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
lsr $a_lo, $a_lo, #1
and @t[0], $f1, @t[3]
and @t[1], $g1, @t[3]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}
foreach(split("\n",$code)) {
s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,837 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake.
#
# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
# const vec256 modx);
#
$python_ref.=<<'___';
def ct_inverse_mod_256(inp, mod):
    a, u = inp, 1
    b, v = mod, 0
    k = 31
    mask = (1 << k) - 1
    for i in range(0, 512 // k - 1):
        # __ab_approximation_31
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-k-2)) << k)
            b_ = (b & mask) | ((b >> (n-k-2)) << k)
        # __inner_loop_31
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
        # __smulq_256_n_shift_by_31
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1
        # __smulq_512x63
        u, v = u*f0 + v*g0, u*f1 + v*g1
    if 512 % k + k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 512 % k + k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1
    mod <<= 512 - mod.bit_length()  # align to the left
    if v < 0:
        v += mod
    if v < 0:
        v += mod
    elif v == 1<<512:
        v -= mod
    return v & (2**512 - 1)  # to be reduced % mod
___
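#
# A small, illustrative check of the inner-loop matrix (not part of the
# build; the helper name is made up): for a non-negative a0 and an odd b0
# the reference above maintains f0*a0 + g0*b0 == a_<<k and
# f1*a0 + g1*b0 == b_<<k after k steps, which is what the
# __smulq_256_n_shift_by_31 step relies on when it recomputes |a| and |b|
# as (a*f0 + b*g0)>>k and (a*f1 + b*g1)>>k.
#
#   def check_inner_loop_31(a0, b0, k=31):
#       a_, b_, f0, g0, f1, g1 = a0, b0, 1, 0, 0, 1
#       for _ in range(k):
#           if a_ & 1:
#               if a_ < b_:
#                   a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
#               a_, f0, g0 = a_-b_, f0-f1, g0-g1
#           a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
#       assert f0*a0 + g0*b0 == a_ << k
#       assert f1*a0 + g1*b0 == b_ << k
#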
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15));
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edx";
$frame = 8*6+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256,\@function,4,"unwind"
.align 32
ct_inverse_mod_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*0($n_ptr), @acc[4] # load modulus
mov 8*1($n_ptr), @acc[5]
mov 8*2($n_ptr), @acc[6]
mov 8*3($n_ptr), @acc[7]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax) # copy modulus to |b|
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax)
mov @acc[7], 8*7(%rax)
mov %rax, $in_ptr
################################# first iteration
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
#mov $f0, 8*0(%rsp) # corrected |f0|
#mov $g0, 8*1(%rsp) # corrected |g0|
mov $f0, 8*8($out_ptr) # initialize |u| with |f0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov $f0, 8*9($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
#mov $f0, 8*2(%rsp) # corrected |f1|
#mov $g0, 8*3(%rsp) # corrected |g1|
mov 8*8($in_ptr), @acc[0] # |u|
mov 8*13($in_ptr), @acc[4] # |v|
mov @acc[0], @acc[1]
imulq 8*0(%rsp), @acc[0] # |u|*|f0|
mov @acc[4], @acc[5]
imulq 8*1(%rsp), @acc[4] # |v|*|g0|
add @acc[4], @acc[0]
mov @acc[0], 8*4($out_ptr) # destination |u|
sar \$63, @acc[0] # sign extension
mov @acc[0], 8*5($out_ptr)
mov @acc[0], 8*6($out_ptr)
mov @acc[0], 8*7($out_ptr)
mov @acc[0], 8*8($out_ptr)
lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor
imulq $f0, @acc[1] # |u|*|f1|
imulq $g0, @acc[5] # |v|*|g1|
add @acc[5], @acc[1]
mov @acc[1], 8*9($out_ptr) # destination |v|
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
mov @acc[1], 8*12($out_ptr)
mov @acc[1], 8*13($out_ptr)
___
for($i=2; $i<15; $i++) {
my $smul_512x63 = $i>8 ? "__smulq_512x63"
: "__smulq_256x63";
$code.=<<___;
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
mov $f1, 8*2(%rsp)
mov $g1, 8*3(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_256_n_shift_by_31
mov $f0, 8*0(%rsp) # corrected |f0|
mov $g0, 8*1(%rsp) # corrected |g0|
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*4($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_256_n_shift_by_31
mov $f0, 8*2(%rsp) # corrected |f1|
mov $g0, 8*3(%rsp) # corrected |g1|
mov 8*0(%rsp), $f0 # |f0|
mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*4($out_ptr), $out_ptr # pointer to destination |u|
call __smulq_256x63
mov 8*2(%rsp), $f0 # |f1|
mov 8*3(%rsp), $g0 # |g1|
lea 8*5($out_ptr),$out_ptr # pointer to destination |v|
call $smul_512x63
___
$code.=<<___ if ($i==8);
sar \$63, %rbp # sign extension
mov %rbp, 8*5($out_ptr)
mov %rbp, 8*6($out_ptr)
mov %rbp, 8*7($out_ptr)
___
}
$code.=<<___;
################################# two[!] last iterations in one go
xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$47, $cnt # 31 + 512 % 31
#call __ab_approximation_31 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
#xor @acc[1], @acc[1] # |a_hi|
mov 8*4($in_ptr), @acc[2] # |b_lo|
#xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62_256
#mov $f0, 8*0(%rsp)
#mov $g0, 8*1(%rsp)
#mov $f1, 8*2(%rsp)
#mov $g1, 8*3(%rsp)
#mov 8*0(%rsp), $f0 # |f0|
#mov 8*1(%rsp), $g0 # |g0|
lea 8*8($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulq_256x63
#mov 8*2(%rsp), $f0 # |f1|
#mov 8*3(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original |out_ptr|
call __smulq_512x63
adc %rbp, %rdx # the excess limb of the result
mov 8*5(%rsp), $in_ptr # original |nx_ptr|
mov %rdx, %rax
sar \$63, %rdx # result's sign as mask
mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx
add @acc[0], @acc[4] # conditionally add |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
adc \$0, %rax
mov %rax, %rdx
neg %rax
or %rax, %rdx # excess bit or sign as mask
sar \$63, %rax # excess bit as mask
mov %rdx, @acc[0] # mask |modulus|
mov %rdx, @acc[1]
and 8*0($in_ptr), @acc[0]
mov %rdx, @acc[2]
and 8*1($in_ptr), @acc[1]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), %rdx
xor %rax, @acc[0] # conditionally negate |modulus|
xor %rcx, %rcx
xor %rax, @acc[1]
sub %rax, %rcx
xor %rax, @acc[2]
xor %rax, %rdx
add %rcx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, %rdx
add @acc[0], @acc[4] # final adjustment for |modulus|<<256
adc @acc[1], @acc[5]
adc @acc[2], @acc[6]
adc %rdx, @acc[7]
mov @acc[4], 8*4($out_ptr) # store absolute value
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_inverse_mod_256,.-ct_inverse_mod_256
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
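#
# For illustration (not part of the build, helper name is made up): one
# such |u|*|f0| + |v|*|g0| step boils down to the following Python model,
# where the multiplier's sign becomes an all-ones/zero mask, the
# multiplicand is conditionally negated with it, and the now non-negative
# multiplier feeds the unsigned widening multiplies coded below.
#
#   def smul(x, f):
#       s = -1 if f < 0 else 0                 # f's sign as mask
#       return ((x ^ s) - s) * ((f ^ s) - s)   # (+-x) * |f| == x*f
#
#   acc = smul(u, f0) + smul(v, g0)
#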
$code.=<<___;
.type __smulq_512x63,\@abi-omnipotent
.align 32
__smulq_512x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), %rbp # sign limb
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit
xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |u|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, %rbp
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, %rbp
mulq %rbx # |u|*|f0|
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov @acc[$i], 8*$i($out_ptr)
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, %rbp
neg %rbp
mulq %rbx
add %rax, @acc[3]
adc %rdx, %rbp
mov @acc[3], 8*3($out_ptr)
mov 8*5($in_ptr), @acc[0] # load |v|
mov 8*6($in_ptr), @acc[1]
mov 8*7($in_ptr), @acc[2]
mov 8*8($in_ptr), @acc[3]
mov 8*9($in_ptr), @acc[4]
mov 8*10($in_ptr), @acc[5]
mov 8*11($in_ptr), @acc[6]
mov 8*12($in_ptr), @acc[7]
mov $g0, $f0
sar \$63, $f0 # |g0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |g0|'s sign as bit
xor $f0, $g0 # conditionally negate |g0|
add %rax, $g0
xor $f0, @acc[0] # conditionally negate |v|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
xor $f0, @acc[6]
xor $f0, @acc[7]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
mulq $g0
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<7; $i++) {
$code.=<<___;
mulq $g0
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
imulq $g0
add %rax, @acc[7]
adc \$0, %rdx # used in the final step
mov %rbp, %rbx
sar \$63, %rbp # sign extension
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc %rbx, @acc[4]
adc %rbp, @acc[5]
adc %rbp, @acc[6]
adc %rbp, @acc[7]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
mov @acc[6], 8*6($out_ptr)
mov @acc[7], 8*7($out_ptr)
ret
.size __smulq_512x63,.-__smulq_512x63
.type __smulq_256x63,\@abi-omnipotent
.align 32
__smulq_256x63:
___
for($j=0; $j<2; $j++) {
my $k = 8*5*$j;
my @acc=@acc; @acc=@acc[4..7] if($j);
my $top="%rbp"; $top=$g0 if($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), $top # sign/excess limb
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)
xor $f0, %rbx # conditionally negate |f0|
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |u| (or |v|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, $top
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, $top
mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
and %rbx, $top
neg $top
mulq %rbx
add %rax, @acc[3]
adc %rdx, $top
___
$code.=<<___ if ($j==0);
mov $g0, $f0
___
}
$code.=<<___;
add @acc[4], @acc[0] # accumulate |u|*|f0|
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc %rcx, %rbp
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov %rbp, 8*4($out_ptr)
ret
.size __smulq_256x63,.-__smulq_256x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
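#
# For illustration (not part of the build, helper name is made up): on
# exact integers the net effect of one such subroutine matches the
# reference model at the top of this file -- form the signed combination,
# shift it right by k, and if it came out negative fold the sign back
# into the multipliers so the value stays non-negative:
#
#   def smul_n_shift(a, f0, b, g0, k=31):
#       r = (a*f0 + b*g0) >> k
#       if r < 0:
#           r, f0, g0 = -r, -f0, -g0
#       return r, f0, g0
#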
{
$code.=<<___;
.type __smulq_256_n_shift_by_31,\@abi-omnipotent
.align 32
__smulq_256_n_shift_by_31:
mov $f0, 8*0($out_ptr) # offload |f0|
mov $g0, 8*1($out_ptr) # offload |g0|
mov $f0, %rbp
___
for($j=0; $j<2; $j++) {
my $k = 8*4*$j;
my @acc=@acc; @acc=@acc[4..7] if ($j);
my $f0="%rbp"; $f0=$g0 if ($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $f0, %rbx
sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit (or |g0|'s)
xor $f0, %rbx # conditionally negate |f0| (or |g0|)
add %rax, %rbx
xor $f0, @acc[0] # conditionally negate |a| (or |b|)
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
mulq %rbx
mov %rax, @acc[0]
mov @acc[1], %rax
and %rbx, $f0
neg $f0
mov %rdx, @acc[1]
___
for($i=1; $i<3; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
mulq %rbx
add %rax, @acc[3]
adc %rdx, $f0
___
}
$code.=<<___;
add @acc[4], @acc[0]
adc @acc[5], @acc[1]
adc @acc[6], @acc[2]
adc @acc[7], @acc[3]
adc $g0, %rbp
mov 8*0($out_ptr), $f0 # restore original |f0|
mov 8*1($out_ptr), $g0 # restore original |g0|
shrd \$31, @acc[1], @acc[0]
shrd \$31, @acc[2], @acc[1]
shrd \$31, @acc[3], @acc[2]
shrd \$31, %rbp, @acc[3]
sar \$63, %rbp # sign as mask
xor %rax, %rax
sub %rbp, %rax # sign as bit
xor %rbp, @acc[0] # conditionally negate the result
xor %rbp, @acc[1]
xor %rbp, @acc[2]
xor %rbp, @acc[3]
add %rax, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
xor %rbp, $f0 # conditionally negate |f0|
xor %rbp, $g0 # conditionally negate |g0|
add %rax, $f0
add %rax, $g0
ret
.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31
___
}
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
$code.=<<___;
.type __ab_approximation_31_256,\@abi-omnipotent
.align 32
__ab_approximation_31_256:
mov 8*3($in_ptr), @a[2] # load |a| in reverse order
mov 8*7($in_ptr), @b[2] # load |b| in reverse order
mov 8*2($in_ptr), @a[1]
mov 8*6($in_ptr), @b[1]
mov 8*1($in_ptr), @a[0]
mov 8*5($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*0($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*4($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[2]
cmovz @b[0], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
mov \$0x7FFFFFFF, %eax
and %rax, @a[0]
and %rax, @b[0]
not %rax
and %rax, @a[2]
and %rax, @b[2]
or @a[2], @a[0]
or @b[2], @b[0]
jmp __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
___
}
$code.=<<___;
.type __inner_loop_31_256,\@abi-omnipotent
.align 32 # comment and punish Coffee Lake by up to 40%
__inner_loop_31_256: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
mov \$0x7FFFFFFF7FFFFFFF, $bias
.Loop_31_256:
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
mov $fg0, $t2
mov $fg1, $t3
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
shr \$1, $a_ # |a_|>>=1
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_31_256
shr \$32, $bias
mov %ecx, %edx # $fg0, $f0
mov ${fg1}d, ${f1}d
shr \$32, $g0
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256,\@abi-omnipotent
.align 32
__inner_loop_62_256:
mov $cnt, %r15d
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov $f0, $g1 # |g1|=1
mov $f0, %r14
.Loop_62_256:
xor $t0, $t0
test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t1
cmovnz $b_lo, $t0
sub $a_lo, $t1 # |b_|-|a_|
mov $a_lo, $t2
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t2, $b_lo # |b_| = |a_|
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shr \$1, $a_lo
test %r14, $t2 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, %r15d
jnz .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
___
}
print $code;
close STDOUT;


@ -0,0 +1,610 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0
    k = 62
    w = 64
    mask = (1 << w) - 1
    for i in range(0, 766 // k):
        # __ab_approximation_62
        n = max(a.bit_length(), b.bit_length())
        if n < 128:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
        # __inner_loop_62
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
        # __smul_383_n_shift_by_62
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1
        # __smul_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1
    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
        v = u*f1 + v*g1
    if v < 0:
        v += mod << (768 - mod.bit_length())  # left aligned
    return v & (2**768 - 1)  # to be reduced % mod
___
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
my @acc=map("x$_",(3..14));
my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21));
my $cnt = $n_ptr;
my @t = map("x$_",(22..28,2));
my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11];
$frame = 16+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #$frame
ldp @t[0], @acc[1], [$in_ptr,#8*0]
ldp @acc[2], @acc[3], [$in_ptr,#8*2]
ldp @acc[4], @acc[5], [$in_ptr,#8*4]
add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
and $in_ptr, $in_ptr, #-512 // in the frame...
stp $out_ptr, $nx_ptr, [sp]
ldp @acc[6], @acc[7], [$n_ptr,#8*0]
ldp @acc[8], @acc[9], [$n_ptr,#8*2]
ldp @acc[10], @acc[11], [$n_ptr,#8*4]
stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*2]
stp @acc[4], @acc[5], [$in_ptr,#8*4]
stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b|
stp @acc[8], @acc[9], [$in_ptr,#8*8]
stp @acc[10], @acc[11], [$in_ptr,#8*10]
////////////////////////////////////////// first iteration
mov $cnt, #62
bl .Lab_approximation_62_loaded
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str $f0,[$out_ptr,#8*12] // initialize |u| with |f0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str $f0, [$out_ptr,#8*12] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
ldr @acc[4], [$in_ptr,#8*12] // |u|
ldr @acc[5], [$in_ptr,#8*18] // |v|
mul @acc[0], $f_, @acc[4] // |u|*|f0|
smulh @acc[1], $f_, @acc[4]
mul @acc[2], $g_, @acc[5] // |v|*|g0|
smulh @acc[3], $g_, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*8]
stp @acc[2], @acc[2], [$out_ptr,#8*10]
mul @acc[0], $f0, @acc[4] // |u|*|f1|
smulh @acc[1], $f0, @acc[4]
mul @acc[2], $g0, @acc[5] // |v|*|g1|
smulh @acc[3], $g0, @acc[5]
adds @acc[0], @acc[0], @acc[2]
adc @acc[1], @acc[1], @acc[3]
stp @acc[0], @acc[1], [$out_ptr,#8*12]
asr @acc[2], @acc[1], #63 // sign extension
stp @acc[2], @acc[2], [$out_ptr,#8*14]
stp @acc[2], @acc[2], [$out_ptr,#8*16]
___
for($i=2; $i<11; $i++) {
$code.=<<___;
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
bl __ab_approximation_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov $f_, $f0 // corrected |f0|
mov $g_, $g0 // corrected |g0|
mov $f0, $f1 // |f1|
mov $g0, $g1 // |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add $out_ptr, $out_ptr, #8*6 // pointer to destination |u|
bl __smul_383x63
mov $f_, $f0 // corrected |f1|
mov $g_, $g0 // corrected |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to destination |v|
bl __smul_383x63
___
$code.=<<___ if ($i>5);
bl __smul_767x63_tail
___
$code.=<<___ if ($i==5);
asr @t[5], @t[5], #63 // sign extension
stp @t[5], @t[5], [$out_ptr,#8*6]
stp @t[5], @t[5], [$out_ptr,#8*8]
stp @t[5], @t[5], [$out_ptr,#8*10]
___
}
$code.=<<___;
////////////////////////////////////////// iteration before last
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load
ldp $b_lo, $b_hi, [$in_ptr,#8*6]
bl __inner_loop_62
eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
str $a_lo, [$out_ptr,#8*0]
str $b_lo, [$out_ptr,#8*6]
mov $f_, $f0 // exact |f0|
mov $g_, $g0 // exact |g0|
mov $f0, $f1
mov $g0, $g1
add $out_ptr, $out_ptr, #8*12 // pointer to dst |u|
bl __smul_383x63
mov $f_, $f0 // exact |f1|
mov $g_, $g0 // exact |g1|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
mov $cnt, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr $a_lo, [$in_ptr,#8*0] // just load
eor $a_hi, $a_hi, $a_hi
ldr $b_lo, [$in_ptr,#8*6]
eor $b_hi, $b_hi, $b_hi
bl __inner_loop_62
mov $f_, $f1
mov $g_, $g1
ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
asr @t[0], @acc[5], #63 // sign as mask
ldp @acc[6], @acc[7], [$f0,#8*0]
ldp @acc[8], @acc[9], [$f0,#8*2]
ldp @acc[10], @acc[11], [$f0,#8*4]
and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally
and @acc[7], @acc[7], @t[0]
adds @acc[0], @acc[0], @acc[6]
and @acc[8], @acc[8], @t[0]
adcs @acc[1], @acc[1], @acc[7]
and @acc[9], @acc[9], @t[0]
adcs @acc[2], @acc[2], @acc[8]
and @acc[10], @acc[10], @t[0]
adcs @acc[3], @acc[3], @acc[9]
and @acc[11], @acc[11], @t[0]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @acc[11]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
autiasp
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
.type __smul_383x63, %function
.align 5
__smul_383x63:
___
for($j=0; $j<2; $j++) {
my $f_ = $f_; $f_ = $g_ if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*12+8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|)
asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|)
sub $f_, $f_, $f1
eor @acc[1], @acc[1], $f1
adds @acc[0], @acc[0], $f1, lsr#63
eor @acc[2], @acc[2], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $f_
eor @acc[5], @acc[5], $f1
umulh @t[1], @acc[1], $f_
adcs @acc[4], @acc[4], xzr
umulh @t[2], @acc[2], $f_
adcs @acc[5], @acc[5], xzr
umulh @t[3], @acc[3], $f_
___
$code.=<<___ if ($j);
adc $g1, xzr, xzr // used in __smul_767x63_tail
___
$code.=<<___;
umulh @t[4], @acc[4], $f_
mul @acc[0], @acc[0], $f_
mul @acc[1], @acc[1], $f_
mul @acc[2], @acc[2], $f_
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], $f_
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], $f_
adcs @acc[3], @acc[3], @t[2]
mul @t[5+$j],@acc[5], $f_
adcs @acc[4], @acc[4], @t[3]
adcs @t[5+$j],@t[5+$j],@t[4]
___
$code.=<<___ if ($j==0);
adc @t[7], xzr, xzr
___
}
$code.=<<___;
adc @t[7], @t[7], xzr
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], @acc[10]
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adcs @t[5], @t[5], @t[6]
stp @acc[4], @t[5], [$out_ptr,#8*4]
adc @t[6], @t[7], xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh @t[5], @acc[5], $f_
ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v|
umulh @acc[11],@acc[11], $g_
ldp @acc[2], @acc[3], [$in_ptr,#8*26]
ldp @acc[4], @acc[5], [$in_ptr,#8*28]
eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v|
eor @acc[1], @acc[1], $f1
eor @acc[2], @acc[2], $f1
adds @acc[0], @acc[0], $g1
eor @acc[3], @acc[3], $f1
adcs @acc[1], @acc[1], xzr
eor @acc[4], @acc[4], $f1
adcs @acc[2], @acc[2], xzr
eor @acc[5], @acc[5], $f1
adcs @acc[3], @acc[3], xzr
umulh @t[0], @acc[0], $g_
adcs @acc[4], @acc[4], xzr
umulh @t[1], @acc[1], $g_
adc @acc[5], @acc[5], xzr
umulh @t[2], @acc[2], $g_
add @acc[11], @acc[11], @t[6]
umulh @t[3], @acc[3], $g_
asr @t[6], @t[5], #63
umulh @t[4], @acc[4], $g_
mul @acc[0], @acc[0], $g_
mul @acc[1], @acc[1], $g_
mul @acc[2], @acc[2], $g_
adds @acc[0], @acc[0], @acc[11]
mul @acc[3], @acc[3], $g_
adcs @acc[1], @acc[1], @t[0]
mul @acc[4], @acc[4], $g_
adcs @acc[2], @acc[2], @t[1]
mul @acc[5], @acc[5], $g_
adcs @acc[3], @acc[3], @t[2]
adcs @acc[4], @acc[4], @t[3]
adc @acc[5], @acc[5], @t[4]
adds @acc[0], @acc[0], @t[5]
adcs @acc[1], @acc[1], @t[6]
adcs @acc[2], @acc[2], @t[6]
adcs @acc[3], @acc[3], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*6]
adcs @acc[4], @acc[4], @t[6]
stp @acc[2], @acc[3], [$out_ptr,#8*8]
adc @acc[5], @acc[5], @t[6]
stp @acc[4], @acc[5], [$out_ptr,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
___
for($j=0; $j<2; $j++) {
my $f0 = $f0; $f0 = $g0 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|)
asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|)
sub @t[7], @t[7], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
umulh @t[0], @acc[0], @t[7]
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], @t[7]
eor @acc[5], @acc[5], @t[6]
umulh @t[2], @acc[2], @t[7]
adcs @acc[4], @acc[4], xzr
umulh @t[3], @acc[3], @t[7]
adc @acc[5], @acc[5], xzr
umulh @t[4], @acc[4], @t[7]
smulh @t[5+$j], @acc[5], @t[7]
mul @acc[0], @acc[0], @t[7]
mul @acc[1], @acc[1], @t[7]
mul @acc[2], @acc[2], @t[7]
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], @t[7]
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], @t[7]
adcs @acc[3], @acc[3], @t[2]
mul @acc[5], @acc[5], @t[7]
adcs @acc[4], @acc[4], @t[3]
adcs @acc[5], @acc[5] ,@t[4]
adc @t[5+$j], @t[5+$j], xzr
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
adcs @acc[4], @acc[4], @acc[10]
adcs @acc[5], @acc[5], @acc[11]
adc @acc[6], @t[5], @t[6]
extr @acc[0], @acc[1], @acc[0], #62
extr @acc[1], @acc[2], @acc[1], #62
extr @acc[2], @acc[3], @acc[2], #62
asr @t[6], @acc[6], #63
extr @acc[3], @acc[4], @acc[3], #62
extr @acc[4], @acc[5], @acc[4], #62
extr @acc[5], @acc[6], @acc[5], #62
eor @acc[0], @acc[0], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
adcs @acc[3], @acc[3], xzr
eor @acc[5], @acc[5], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adc @acc[5], @acc[5], xzr
stp @acc[4], @acc[5], [$out_ptr,#8*4]
eor $f0, $f0, @t[6]
eor $g0, $g0, @t[6]
sub $f0, $f0, @t[6]
sub $g0, $g0, @t[6]
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
___
{
my @a = @acc[0..5];
my @b = @acc[6..11];
$code.=<<___;
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp @a[4], @a[5], [$in_ptr,#8*4]
ldp @b[4], @b[5], [$in_ptr,#8*10]
ldp @a[2], @a[3], [$in_ptr,#8*2]
ldp @b[2], @b[3], [$in_ptr,#8*8]
.Lab_approximation_62_loaded:
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[3], ne
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
csel @b[4], @b[4], @b[3], ne
ldp @a[0], @a[1], [$in_ptr,#8*0]
ldp @b[0], @b[1], [$in_ptr,#8*6]
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[2], ne
orr @t[0], @a[5], @b[5] // ... and ones before that ...
csel @b[4], @b[4], @b[2], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[1], ne
orr @t[0], @a[5], @b[5]
csel @b[4], @b[4], @b[1], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
neg @t[1], @t[0]
lslv @a[5], @a[5], @t[0] // align high limbs to the left
lslv @b[5], @b[5], @t[0]
lsrv @a[4], @a[4], @t[1]
lsrv @b[4], @b[4], @t[1]
and @a[4], @a[4], @t[1], asr#6
and @b[4], @b[4], @t[1], asr#6
orr @a[5], @a[5], @a[4]
orr @b[5], @b[5], @b[4]
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
___
}
$code.=<<___;
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov $f0, #1 // |f0|=1
mov $g0, #0 // |g0|=0
mov $f1, #0 // |f1|=0
mov $g1, #1 // |g1|=1
.Loop_62:
sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting
sub $cnt, $cnt, #1
subs @t[2], $b_lo, $a_lo // |b_|-|a_|
and @t[0], $b_lo, @t[6]
sbc @t[3], $b_hi, $a_hi
and @t[1], $b_hi, @t[6]
subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov @t[0], $f0
sbcs @t[5], $a_hi, @t[1]
mov @t[1], $g0
csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_|
csel $b_hi, $b_hi, $a_hi, hs
csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $a_hi, @t[5], @t[3], hs
csel $f0, $f0, $f1, hs // exchange |f0| and |f1|
csel $f1, $f1, @t[0], hs
csel $g0, $g0, $g1, hs // exchange |g0| and |g1|
csel $g1, $g1, @t[1], hs
extr $a_lo, $a_hi, $a_lo, #1
lsr $a_hi, $a_hi, #1
and @t[0], $f1, @t[6]
and @t[1], $g1, @t[6]
add $f1, $f1, $f1 // |f1|<<=1
add $g1, $g1, $g1 // |g1|<<=1
sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...)
cbnz $cnt, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
___
print $code;
close STDOUT;


@ -0,0 +1,398 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast quadratic residue test as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change
    k = 30
    w = 32
    mask = (1 << w) - 1
    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                    L += (a_ & b_) >> 1  # |a| and |b| are both odd, second bits
                                         # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1  # |b| is always odd, the second bit
                               # tells the whole story
    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                    L += (a & b) >> 1  # |a| and |b| are both odd, second bits
                                       # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
    return (L & 1) ^ 1
___
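#
# The two parity updates above are the classical Jacobi-symbol rules,
# tracked only modulo 2:
#
#   with both operands odd, (a_ & b_) >> 1 is odd exactly when
#   a_ % 4 == 3 and b_ % 4 == 3, i.e. when swapping the two operands
#   flips the symbol (quadratic reciprocity);
#
#   (b_ + 2) >> 2 is odd exactly when b_ % 8 is 3 or 5, i.e. when the
#   factor of 2 pulled out of |a| is a non-residue mod |b|.
#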
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2));
my @acc=map("x$_",(3..14));
my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20));
my @t = map("x$_",(21..28));
my ($a_, $b_) = @acc[5,11];
$frame = 2*256;
$code.=<<___;
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384, %function
.align 5
ct_is_square_mod_384:
paciasp
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #$frame
ldp @acc[0], @acc[1], [x0,#8*0] // load input
ldp @acc[2], @acc[3], [x0,#8*2]
ldp @acc[4], @acc[5], [x0,#8*4]
add $in_ptr, sp, #255 // find closest 256-byte-aligned spot
and $in_ptr, $in_ptr, #-256 // in the frame...
ldp @acc[6], @acc[7], [x1,#8*0] // load modulus
ldp @acc[8], @acc[9], [x1,#8*2]
ldp @acc[10], @acc[11], [x1,#8*4]
stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a|
stp @acc[2], @acc[3], [$in_ptr,#8*8]
stp @acc[4], @acc[5], [$in_ptr,#8*10]
stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b|
stp @acc[8], @acc[9], [$in_ptr,#8*2]
stp @acc[10], @acc[11], [$in_ptr,#8*4]
eor $L, $L, $L // init the Legendre symbol
mov $cnt, #24 // 24 is 768/30-1
b .Loop_is_square
.align 4
.Loop_is_square:
bl __ab_approximation_30
sub $cnt, $cnt, #1
eor $out_ptr, $in_ptr, #128 // pointer to dst |b|
bl __smul_384_n_shift_by_30
mov $f1, $f0 // |f0|
mov $g1, $g0 // |g0|
add $out_ptr, $out_ptr, #8*6 // pointer to dst |a|
bl __smul_384_n_shift_by_30
ldp @acc[6], @acc[7], [$out_ptr,#-8*6]
eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b|
and @t[6], @t[6], @acc[6] // if |a| was negative,
add $L, $L, @t[6], lsr#1 // adjust |L|
cbnz $cnt, .Loop_is_square
////////////////////////////////////////// last iteration
//bl __ab_approximation_30 // |a| and |b| are exact,
//ldr $a_, [$in_ptr,#8*6] // just load
mov $b_, @acc[6] // ldr $b_, [$in_ptr,#8*0]
mov $cnt, #48 // 48 is 768%30 + 30
bl __inner_loop_48
ldr x30, [x29,#8]
and x0, $L, #1
eor x0, x0, #1
add sp, sp, #$frame
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
autiasp
ret
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smul_384_n_shift_by_30, %function
.align 5
__smul_384_n_shift_by_30:
___
for($j=0; $j<2; $j++) {
my $fx = $g1; $fx = $f1 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|)
asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s)
ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k]
eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|)
ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k]
eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|)
sub $fx, $fx, @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
umulh @t[0], @acc[0], $fx
adcs @acc[3], @acc[3], xzr
umulh @t[1], @acc[1], $fx
eor @acc[5], @acc[5], @t[6]
umulh @t[2], @acc[2], $fx
adcs @acc[4], @acc[4], xzr
umulh @t[3], @acc[3], $fx
adc @acc[5], @acc[5], xzr
umulh @t[4], @acc[4], $fx
and @t[7], $fx, @t[6]
umulh @t[5+$j], @acc[5], $fx
neg @t[7], @t[7]
mul @acc[0], @acc[0], $fx
mul @acc[1], @acc[1], $fx
mul @acc[2], @acc[2], $fx
adds @acc[1], @acc[1], @t[0]
mul @acc[3], @acc[3], $fx
adcs @acc[2], @acc[2], @t[1]
mul @acc[4], @acc[4], $fx
adcs @acc[3], @acc[3], @t[2]
mul @acc[5], @acc[5], $fx
adcs @acc[4], @acc[4], @t[3]
adcs @acc[5], @acc[5] ,@t[4]
adc @t[5+$j], @t[5+$j], @t[7]
___
}
$code.=<<___;
adds @acc[0], @acc[0], @acc[6]
adcs @acc[1], @acc[1], @acc[7]
adcs @acc[2], @acc[2], @acc[8]
adcs @acc[3], @acc[3], @acc[9]
adcs @acc[4], @acc[4], @acc[10]
adcs @acc[5], @acc[5], @acc[11]
adc @acc[6], @t[5], @t[6]
extr @acc[0], @acc[1], @acc[0], #30
extr @acc[1], @acc[2], @acc[1], #30
extr @acc[2], @acc[3], @acc[2], #30
asr @t[6], @acc[6], #63
extr @acc[3], @acc[4], @acc[3], #30
extr @acc[4], @acc[5], @acc[4], #30
extr @acc[5], @acc[6], @acc[5], #30
eor @acc[0], @acc[0], @t[6]
eor @acc[1], @acc[1], @t[6]
adds @acc[0], @acc[0], @t[6], lsr#63
eor @acc[2], @acc[2], @t[6]
adcs @acc[1], @acc[1], xzr
eor @acc[3], @acc[3], @t[6]
adcs @acc[2], @acc[2], xzr
eor @acc[4], @acc[4], @t[6]
adcs @acc[3], @acc[3], xzr
eor @acc[5], @acc[5], @t[6]
stp @acc[0], @acc[1], [$out_ptr,#8*0]
adcs @acc[4], @acc[4], xzr
stp @acc[2], @acc[3], [$out_ptr,#8*2]
adc @acc[5], @acc[5], xzr
stp @acc[4], @acc[5], [$out_ptr,#8*4]
ret
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
___
{
my @a = @acc[0..5];
my @b = @acc[6..11];
my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]);
$code.=<<___;
.type __ab_approximation_30, %function
.align 4
__ab_approximation_30:
ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers
ldp @b[2], @b[3], [$in_ptr,#8*2]
orr @t[0], @a[5], @b[5] // check top-most limbs, ...
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[3], ne
orr @t[0], @a[5], @b[5] // ... ones before top-most, ...
csel @b[4], @b[4], @b[3], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[2], ne
orr @t[0], @a[5], @b[5] // ... and ones before that ...
csel @b[4], @b[4], @b[2], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[1], ne
orr @t[0], @a[5], @b[5] // and one more, ...
csel @b[4], @b[4], @b[1], ne
cmp @t[0], #0
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
csel @a[4], @a[4], @a[0], ne
orr @t[0], @a[5], @b[5]
csel @b[4], @b[4], @b[0], ne
clz @t[0], @t[0]
cmp @t[0], #64
csel @t[0], @t[0], xzr, ne
csel @a[5], @a[5], @a[4], ne
csel @b[5], @b[5], @b[4], ne
neg @t[1], @t[0]
lslv @a[5], @a[5], @t[0] // align high limbs to the left
lslv @b[5], @b[5], @t[0]
lsrv @a[4], @a[4], @t[1]
lsrv @b[4], @b[4], @t[1]
and @a[4], @a[4], @t[1], asr#6
and @b[4], @b[4], @t[1], asr#6
orr $a_, @a[5], @a[4]
orr $b_, @b[5], @b[4]
bfxil $a_, @a[0], #0, #32
bfxil $b_, @b[0], #0, #32
b __inner_loop_30
ret
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30, %function
.align 4
__inner_loop_30:
mov $cnt, #30
mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov $bias,#0x7FFFFFFF7FFFFFFF
.Loop_30:
sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting
and @t[4], $a_, $b_
sub $cnt, $cnt, #1
and @t[0], $b_, @t[3]
sub @t[1], $b_, $a_ // |b_|-|a_|
subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1
mov @t[0], $fg1
csel $b_, $b_, $a_, hs // |b_| = |a_|
csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1|
csel $fg0, $fg0, @t[0], hs
csel $L, $L, @t[4], hs
lsr $a_, $a_, #1
and @t[0], $fg1, @t[3]
and @t[1], $bias, @t[3]
add $t[2], $b_, #2
sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add $fg1, $fg1, $fg1 // |f1|<<=1
add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5
add $fg0, $fg0, @t[1]
sub $fg1, $fg1, $bias
cbnz $cnt, .Loop_30
mov $bias, #0x7FFFFFFF
ubfx $f0, $fg0, #0, #32
ubfx $g0, $fg0, #32, #32
ubfx $f1, $fg1, #0, #32
ubfx $g1, $fg1, #32, #32
sub $f0, $f0, $bias // remove the bias
sub $g0, $g0, $bias
sub $f1, $f1, $bias
sub $g1, $g1, $bias
ret
.size __inner_loop_30,.-__inner_loop_30
___
}
$code.=<<___;
.type __inner_loop_48, %function
.align 4
__inner_loop_48:
.Loop_48:
sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting
and @t[4], $a_, $b_
sub $cnt, $cnt, #1
and @t[0], $b_, @t[3]
sub @t[1], $b_, $a_ // |b_|-|a_|
subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even)
add @t[4], $L, @t[4], lsr#1
csel $b_, $b_, $a_, hs // |b_| = |a_|
csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel $L, $L, @t[4], hs
add $t[2], $b_, #2
lsr $a_, $a_, #1
add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5
cbnz $cnt, .Loop_48
ret
.size __inner_loop_48,.-__inner_loop_48
___
print $code;
close STDOUT;


@ -0,0 +1,494 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast quadratic residue test as suggested in
# https://eprint.iacr.org/2020/972. Performance is >5x better than
# modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change
    k = 30
    w = 32
    mask = (1 << w) - 1
    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                    L += (a_ & b_) >> 1  # |a| and |b| are both odd, second bits
                                         # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1  # |b| is always odd, the second bit
                               # tells the whole story
    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                    L += (a & b) >> 1  # |a| and |b| are both odd, second bits
                                       # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
    return (L & 1) ^ 1
___
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr) = ("%rdi", "%rsi");
my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx");
my @acc=map("%r$_",(8..15));
my $L = "%rbp";
$frame = 8*3+2*256;
$code.=<<___;
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384,\@function,2,"unwind"
.align 32
ct_is_square_mod_384:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot
and \$-256, %rax # in the frame...
mov 8*0(%rdi), @acc[0] # load input
mov 8*1(%rdi), @acc[1]
mov 8*2(%rdi), @acc[2]
mov 8*3(%rdi), @acc[3]
mov 8*4(%rdi), @acc[4]
mov 8*5(%rdi), @acc[5]
mov 8*0(%rsi), @acc[6] # load modulus
mov 8*1(%rsi), @acc[7]
mov 8*2(%rsi), %rbx
mov 8*3(%rsi), %rcx
mov 8*4(%rsi), %rdx
mov 8*5(%rsi), %rdi
mov %rax, $in_ptr # pointer to source |a|b|
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax)
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax) # copy modulus to |b|
mov @acc[7], 8*7(%rax)
mov %rbx, 8*8(%rax)
mov %rcx, 8*9(%rax)
mov %rdx, 8*10(%rax)
mov %rdi, 8*11(%rax)
xor $L, $L # initialize the Legendre symbol
mov \$24, %ecx # 24 is 768/30-1
jmp .Loop_is_square
.align 32
.Loop_is_square:
mov %ecx, 8*2(%rsp) # offload loop counter
call __ab_approximation_30
mov $f0, 8*0(%rsp) # offload |f0| and |g0|
mov $g0, 8*1(%rsp)
mov \$128+8*6, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |b|
call __smulq_384_n_shift_by_30
mov 8*0(%rsp), $f1 # pop |f0| and |g0|
mov 8*1(%rsp), $g1
lea -8*6($out_ptr),$out_ptr # pointer to destination |a|
call __smulq_384_n_shift_by_30
mov 8*2(%rsp), %ecx # re-load loop counter
xor \$128, $in_ptr # flip-flop pointer to source |a|b|
and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L|
shr \$1, @acc[6]
add @acc[6], $L
sub \$1, %ecx
jnz .Loop_is_square
################################# last iteration
#call __ab_approximation_30 # |a| and |b| are exact, just load
#mov 8*0($in_ptr), @acc[0] # |a_|
mov 8*6($in_ptr), @acc[1] # |b_|
call __inner_loop_48 # 48 is 768%30+30
mov \$1, %rax
and $L, %rax
xor \$1, %rax # return value
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smulq_384_n_shift_by_30,\@abi-omnipotent
.align 32
__smulq_384_n_shift_by_30:
___
for($j=0; $j<2; $j++) {
$code.=<<___;
mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov %rdx, %rbx # |f1| (or |g1|)
sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s)
xor %rax, %rax
sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s)
xor %rdx, %rbx # conditionally negate |f1| (or |g1|)
add %rax, %rbx
xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
xor %rdx, @acc[1]
xor %rdx, @acc[2]
xor %rdx, @acc[3]
xor %rdx, @acc[4]
xor %rdx, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov %rdx, @acc[6+$j]
and %rbx, @acc[6+$j]
mulq %rbx # |a|*|f1| (or |b|*|g1|)
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq %rbx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
neg @acc[6+$j]
mulq %rbx
add %rax, @acc[5]
adc %rdx, @acc[6+$j]
___
$code.=<<___ if ($j==0);
lea 8*6($in_ptr), $in_ptr # pointer to |b|
mov $g1, %rdx
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
add 8*0($out_ptr), @acc[0]
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
adc @acc[7], @acc[6]
shrd \$30, @acc[1], @acc[0]
shrd \$30, @acc[2], @acc[1]
shrd \$30, @acc[3], @acc[2]
shrd \$30, @acc[4], @acc[3]
shrd \$30, @acc[5], @acc[4]
shrd \$30, @acc[6], @acc[5]
sar \$63, @acc[6] # sign as mask
xor %rbx, %rbx
sub @acc[6], %rbx # sign as bit
xor @acc[6], @acc[0] # conditionally negate the result
xor @acc[6], @acc[1]
xor @acc[6], @acc[2]
xor @acc[6], @acc[3]
xor @acc[6], @acc[4]
xor @acc[6], @acc[5]
add %rbx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
ret
.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
___
{
my ($a_, $b_) = @acc[0..1];
my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15));
my ($fg0, $fg1, $bias) = ($g0, $g1, $t5);
my $cnt = "%edi";
{
my @a = @acc[0..5];
my @b = (@a[1..3], $t4, $t5, $g0);
$code.=<<___;
.type __ab_approximation_30,\@abi-omnipotent
.align 32
__ab_approximation_30:
mov 8*11($in_ptr), @b[5] # load |b| in reverse order
mov 8*10($in_ptr), @b[4]
mov 8*9($in_ptr), @b[3]
mov @a[5], %rax
or @b[5], %rax # check top-most limbs, ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[3], @a[4]
mov 8*8($in_ptr), @b[2]
cmovz @b[3], @b[4]
mov @a[5], %rax
or @b[5], %rax # ... ones before top-most, ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[2], @a[4]
mov 8*7($in_ptr), @b[1]
cmovz @b[2], @b[4]
mov @a[5], %rax
or @b[5], %rax # ... and ones before that ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[1], @a[4]
mov 8*6($in_ptr), @b[0]
cmovz @b[1], @b[4]
mov @a[5], %rax
or @b[5], %rax # ... and ones before that ...
cmovz @a[4], @a[5]
cmovz @b[4], @b[5]
cmovz @a[0], @a[4]
cmovz @b[0], @b[4]
mov @a[5], %rax
or @b[5], %rax
bsr %rax, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[5]
cmovz @b[0], @b[5]
cmovz %rax, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[4], @a[5] # align second limb to the left
shldq %cl, @b[4], @b[5]
mov \$0xFFFFFFFF00000000, %rax
mov @a[0]d, ${a_}d
mov @b[0]d, ${b_}d
and %rax, @a[5]
and %rax, @b[5]
or @a[5], ${a_}
or @b[5], ${b_}
jmp __inner_loop_30
ret
.size __ab_approximation_30,.-__ab_approximation_30
___
}
$code.=<<___;
.type __inner_loop_30,\@abi-omnipotent
.align 32
__inner_loop_30: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF
mov \$30, $cnt
.Loop_30:
mov $a_, %rax
and $b_, %rax
shr \$1, %rax # (a_ & b_) >> 1
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
lea (%rax,$L), %rax # pre-"negate" |L|
mov $fg0, $t2
mov $fg1, $t3
mov $L, $t4
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
cmovb %rax, $L
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
cmovz $t4, $L
lea 2($b_), %rax
shr \$1, $a_ # |a_|>>=1
shr \$2, %rax
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_30
shr \$32, $bias
mov %ebx, %eax # $fg0 -> $f0
shr \$32, $g0
mov %ecx, %edx # $fg1 -> $f1
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_30,.-__inner_loop_30
.type __inner_loop_48,\@abi-omnipotent
.align 32
__inner_loop_48:
mov \$48, $cnt # 48 is 768%30+30
.Loop_48:
mov $a_, %rax
and $b_, %rax
shr \$1, %rax # (a_ & b_) >> 1
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
lea (%rax,$L), %rax
mov $L, $t2
cmovb $b_, $a_
cmovb $t0, $b_
cmovb %rax, $L
sub $b_, $a_ # |a_|-|b_|
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $L
lea 2($b_), %rax
shr \$1, $a_ # |a_|>>=1
shr \$2, %rax
add %rax, $L # "negate" |L| if |b|%8 is 3 or 5
sub \$1, $cnt
jnz .Loop_48
ret
.size __inner_loop_48,.-__inner_loop_48
___
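# A small model of the packed |f|/|g| encoding used by __inner_loop_30 above:
# each signed factor is stored biased by 0x7FFFFFFF in one half of a 64-bit
# register, so a single add/sub updates both halves without the low half
# borrowing into the high one; the shr/sub sequence at the end of the loop is
# the unpack() below. Illustrative sketch in a hypothetical $python_demo holder.
$python_demo.=<<'___';
BIAS = 0x7FFFFFFF
BIAS64 = (BIAS << 32) | BIAS           # the loop's $bias constant
MASK64 = (1 << 64) - 1

def pack(f, g):                        # two small signed values -> one 64-bit word
    return (((g + BIAS) << 32) | (f + BIAS)) & MASK64

def unpack(fg):                        # remove the bias again
    return (fg & 0xFFFFFFFF) - BIAS, (fg >> 32) - BIAS

fg0, fg1 = pack(1, 0), pack(0, 1)      # the loop's initial |f0|,|g0| and |f1|,|g1|
assert fg0 == 0x7FFFFFFF80000000 and fg1 == 0x800000007FFFFFFF

# "sub $fg1,$fg0; add $bias,$fg0" subtracts both halves at once:
assert unpack((fg0 - fg1 + BIAS64) & MASK64) == (1 - 0, 0 - 1)
# "add $fg1,$fg1; sub $bias,$fg1" doubles both halves at once:
assert unpack((fg1 + fg1 - BIAS64) & MASK64) == (0 * 2, 1 * 2)
___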
}
print $code;
close STDOUT;

View file

@ -0,0 +1,886 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >5x better than
# modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
a, u = inp, 1
b, v = mod, 0
k = 62
w = 64
mask = (1 << w) - 1
for i in range(0, 766 // k):
# __ab_approximation_62
n = max(a.bit_length(), b.bit_length())
if n < 128:
a_, b_ = a, b
else:
a_ = (a & mask) | ((a >> (n-w)) << w)
b_ = (b & mask) | ((b >> (n-w)) << w)
# __inner_loop_62
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, k):
if a_ & 1:
if a_ < b_:
a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
a_, f0, g0 = a_-b_, f0-f1, g0-g1
a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
# __smulq_383_n_shift_by_62
a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
if a < 0:
a, f0, g0 = -a, -f0, -g0
if b < 0:
b, f1, g1 = -b, -f1, -g1
# __smulq_767x63
u, v = u*f0 + v*g0, u*f1 + v*g1
if 766 % k:
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, 766 % k):
if a & 1:
if a < b:
a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
a, f0, g0 = a-b, f0-f1, g0-g1
a, f1, g1 = a >> 1, f1 << 1, g1 << 1
v = u*f1 + v*g1
if v < 0:
v += mod << (768 - mod.bit_length()) # left aligned
return v & (2**768 - 1) # to be reduced % mod
___
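# For cross-checking small cases: the variable-time binary extended GCD that the
# fixed-iteration reference above is derived from (illustrative only, in a
# hypothetical $python_demo holder; assumes an odd modulus and gcd(inp, mod) == 1).
$python_demo.=<<'___';
def binary_eea_inverse(inp, mod):
    a, u = inp % mod, 1                # invariant: a == u*inp (mod mod)
    b, v = mod, 0                      # invariant: b == v*inp (mod mod)
    while a:
        if a & 1 == 0:
            a >>= 1
            u = u >> 1 if u & 1 == 0 else (u + mod) >> 1   # halve u mod `mod`
        elif a < b:
            a, b, u, v = b, a, v, u
        else:
            a, u = a - b, (u - v) % mod
    assert b == 1                      # gcd(inp, mod) must be 1
    return v % mod

p = 2**127 - 1                         # any odd modulus works for the check
x = 0x1234567890abcdef
assert binary_eea_inverse(x, p) * x % p == 1
___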
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edi";
$frame = 8*11+2*512;
$code.=<<___;
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383,\@function,4,"unwind"
.align 32
ct_inverse_mod_383:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*0($n_ptr), @acc[6] # load modulus
mov 8*1($n_ptr), @acc[7]
mov 8*2($n_ptr), @acc[8]
mov 8*3($n_ptr), @acc[9]
mov 8*4($n_ptr), @acc[10]
mov 8*5($n_ptr), @acc[11]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax)
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax) # copy modulus to |b|
mov @acc[7], 8*7(%rax)
mov @acc[8], 8*8(%rax)
mov @acc[9], 8*9(%rax)
mov @acc[10], 8*10(%rax)
mov %rax, $in_ptr # pointer to source |a|b|1|0|
mov @acc[11], 8*11(%rax)
################################# first iteration
mov \$62, $cnt
call __ab_approximation_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_383_n_shift_by_62
#mov $f0, 8*7(%rsp) # corrected |f0|
#mov $g0, 8*8(%rsp) # corrected |g0|
mov $f0, 8*12($out_ptr) # initialize |u| with |f0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_383_n_shift_by_62
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov $f0, 8*12($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$62, $cnt
call __ab_approximation_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_383_n_shift_by_62
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_383_n_shift_by_62
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*12($in_ptr), %rax # |u|
mov 8*18($in_ptr), @acc[3] # |v|
mov $f0, %rbx
mov %rax, @acc[2]
imulq 8*7(%rsp) # |u|*|f0|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq 8*8(%rsp) # |v|*|g0|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*6($out_ptr) # destination |u|
mov @acc[1], 8*7($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*8($out_ptr)
mov @acc[1], 8*9($out_ptr)
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor
mov @acc[2], %rax
imulq %rbx # |u|*|f1|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq %rcx # |v|*|g1|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*12($out_ptr) # destination |v|
mov @acc[1], 8*13($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*14($out_ptr)
mov @acc[1], 8*15($out_ptr)
mov @acc[1], 8*16($out_ptr)
mov @acc[1], 8*17($out_ptr)
___
for($i=2; $i<11; $i++) {
my $smul_767x63 = $i>5 ? "__smulq_767x63"
: "__smulq_383x63";
$code.=<<___;
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$62, $cnt
call __ab_approximation_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulq_383_n_shift_by_62
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulq_383_n_shift_by_62
mov $f0, 8*9(%rsp) # corrected |f1|
mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*7(%rsp), $f0 # |f0|
mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
call __smulq_383x63
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
call $smul_767x63
___
$code.=<<___ if ($i==5);
sar \$63, @acc[5] # sign extension
mov @acc[5], 8*6($out_ptr)
mov @acc[5], 8*7($out_ptr)
mov @acc[5], 8*8($out_ptr)
mov @acc[5], 8*9($out_ptr)
mov @acc[5], 8*10($out_ptr)
mov @acc[5], 8*11($out_ptr)
___
}
$code.=<<___;
################################# iteration before last
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$62, $cnt
#call __ab_approximation_62 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
mov 8*1($in_ptr), @acc[1] # |a_hi|
mov 8*6($in_ptr), @acc[2] # |b_lo|
mov 8*7($in_ptr), @acc[3] # |b_hi|
call __inner_loop_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
mov @acc[0], 8*0($out_ptr)
mov @acc[2], 8*6($out_ptr)
#mov 8*7(%rsp), $f0 # |f0|
#mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*12($out_ptr),$out_ptr # pointer to destination |u|
call __smulq_383x63
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
call __smulq_767x63
################################# last iteration
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$22, $cnt # 766 % 62
#call __ab_approximation_62 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
xor @acc[1], @acc[1] # |a_hi|
mov 8*6($in_ptr), @acc[2] # |b_lo|
xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
#mov $f1, 8*9(%rsp)
#mov $g1, 8*10(%rsp)
#mov 8*7(%rsp), $f0 # |f0|
#mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulq_383x63
#mov 8*9(%rsp), $f0 # |f1|
#mov 8*10(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original out_ptr
call __smulq_767x63
mov 8*5(%rsp), $in_ptr # original n_ptr
mov %rax, %rdx # top limb of the result
sar \$63, %rax # result's sign as mask
mov %rax, @acc[0] # mask |modulus|
mov %rax, @acc[1]
mov %rax, @acc[2]
and 8*0($in_ptr), @acc[0]
and 8*1($in_ptr), @acc[1]
mov %rax, @acc[3]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), @acc[3]
mov %rax, @acc[4]
and 8*4($in_ptr), @acc[4]
and 8*5($in_ptr), %rax
add @acc[0], @acc[6] # conditionally add |modulus|<<384
adc @acc[1], @acc[7]
adc @acc[2], @acc[8]
adc @acc[3], @acc[9]
adc @acc[4], %rcx
adc %rax, %rdx
mov @acc[6], 8*6($out_ptr) # store absolute value
mov @acc[7], 8*7($out_ptr)
mov @acc[8], 8*8($out_ptr)
mov @acc[9], 8*9($out_ptr)
mov %rcx, 8*10($out_ptr)
mov %rdx, 8*11($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ct_inverse_mod_383,.-ct_inverse_mod_383
___
########################################################################
# see corresponding commentary in ctx_inverse_mod_384-x86_64...
{
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
my $fx = @acc[9];
$code.=<<___;
.type __smulq_767x63,\@abi-omnipotent
.align 32
__smulq_767x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov $f0, $fx
sar \$63, $f0 # |f0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |f0|'s sign as bit
mov $out_ptr, 8*1(%rsp)
mov $in_ptr, 8*2(%rsp)
lea 8*6($in_ptr), $in_ptr # pointer to |v|
xor $f0, $fx # conditionally negate |f0|
add %rax, $fx
xor $f0, @acc[0] # conditionally negate |u|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulq $fx # |u|*|f0|
mov %rax, 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
mov @acc[$i], 8*$i($out_ptr)
___
}
$code.=<<___;
imulq $fx
add %rax, @acc[$i]
adc \$0, %rdx
mov @acc[5], 8*5($out_ptr)
mov %rdx, 8*6($out_ptr)
sar \$63, %rdx # sign extension
mov %rdx, 8*7($out_ptr)
___
{
my $fx=$in_ptr;
$code.=<<___;
mov $g0, $f0 # load |g0|
mov 8*0($in_ptr), @acc[0] # load |v|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*6($in_ptr), @acc[6]
mov 8*7($in_ptr), @acc[7]
mov 8*8($in_ptr), @acc[8]
mov 8*9($in_ptr), @acc[9]
mov 8*10($in_ptr), @acc[10]
mov 8*11($in_ptr), @acc[11]
mov $f0, $fx # overrides in_ptr
sar \$63, $f0 # |g0|'s sign as mask
xor %rax, %rax
sub $f0, %rax # |g0|'s sign as bit
xor $f0, $fx # conditionally negate |g0|
add %rax, $fx
xor $f0, @acc[0] # conditionally negate |v|
xor $f0, @acc[1]
xor $f0, @acc[2]
xor $f0, @acc[3]
xor $f0, @acc[4]
xor $f0, @acc[5]
xor $f0, @acc[6]
xor $f0, @acc[7]
xor $f0, @acc[8]
xor $f0, @acc[9]
xor $f0, @acc[10]
xor $f0, @acc[11]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
adc \$0, @acc[8]
adc \$0, @acc[9]
adc \$0, @acc[10]
adc \$0, @acc[11]
mulq $fx # |v|*|g0|
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<11; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
mov 8*1(%rsp), %rdx # out_ptr
imulq $fx, %rax
mov 8*2(%rsp), $in_ptr # restore original in_ptr
add @acc[11], %rax
add 8*0(%rdx), @acc[0] # accumulate |u|*|f0|
adc 8*1(%rdx), @acc[1]
adc 8*2(%rdx), @acc[2]
adc 8*3(%rdx), @acc[3]
adc 8*4(%rdx), @acc[4]
adc 8*5(%rdx), @acc[5]
adc 8*6(%rdx), @acc[6]
mov 8*7(%rdx), @acc[11] # sign extension
adc @acc[11], @acc[7]
adc @acc[11], @acc[8]
adc @acc[11], @acc[9]
adc @acc[11], @acc[10]
adc @acc[11], %rax
mov %rdx, $out_ptr # restore original out_ptr
mov @acc[0], 8*0(%rdx)
mov @acc[1], 8*1(%rdx)
mov @acc[2], 8*2(%rdx)
mov @acc[3], 8*3(%rdx)
mov @acc[4], 8*4(%rdx)
mov @acc[5], 8*5(%rdx)
mov @acc[6], 8*6(%rdx)
mov @acc[7], 8*7(%rdx)
mov @acc[8], 8*8(%rdx)
mov @acc[9], 8*9(%rdx)
mov @acc[10], 8*10(%rdx)
mov %rax, 8*11(%rdx)
ret
.size __smulq_767x63,.-__smulq_767x63
___
}
$code.=<<___;
.type __smulq_383x63,\@abi-omnipotent
.align 32
__smulq_383x63:
___
for($j=0; $j<2; $j++) {
$code.=<<___;
mov 8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov %rdx, $fx
sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)
xor %rdx, $fx # conditionally negate |f0|
add %rax, $fx
xor %rdx, @acc[0] # conditionally negate |u| (or |v|)
xor %rdx, @acc[1]
xor %rdx, @acc[2]
xor %rdx, @acc[3]
xor %rdx, @acc[4]
xor %rdx, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulq $fx # |u|*|f0| (or |v|*|g0|)
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___ if ($j==0);
imulq $fx, %rax
add %rax, @acc[$i]
lea 8*6($in_ptr), $in_ptr # pointer to |v|
mov $g0, %rdx
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
imulq $fx, %rax
add %rax, @acc[$i]
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
ret
.size __smulq_383x63,.-__smulq_383x63
___
{
$code.=<<___;
.type __smulq_383_n_shift_by_62,\@abi-omnipotent
.align 32
__smulq_383_n_shift_by_62:
mov $f0, @acc[8]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
$code.=<<___;
mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov %rdx, $fx
sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)
xor %rdx, $fx # conditionally negate |f0| (or |g0|)
add %rax, $fx
xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
xor %rdx, @acc[1]
xor %rdx, @acc[2]
xor %rdx, @acc[3]
xor %rdx, @acc[4]
xor %rdx, @acc[5]
add @acc[0], %rax
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulq $fx # |a|*|f0| (or |b|*|g0|)
mov %rax, @acc[0]
mov @acc[1], %rax
mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
mulq $fx
add %rax, @acc[$i]
mov @acc[$i+1], %rax
adc \$0, %rdx
mov %rdx, @acc[$i+1]
___
}
$code.=<<___ if ($j==0);
imulq $fx
add %rax, @acc[$i]
adc \$0, %rdx
lea 8*6($in_ptr), $in_ptr # pointer to |b|
mov %rdx, @acc[6]
mov $g0, %rdx
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
imulq $fx
add %rax, @acc[$i]
adc \$0, %rdx
lea -8*6($in_ptr), $in_ptr # restore original in_ptr
add 8*0($out_ptr), @acc[0]
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
adc %rdx, @acc[6]
mov $f0, %rdx
shrd \$62, @acc[1], @acc[0]
shrd \$62, @acc[2], @acc[1]
shrd \$62, @acc[3], @acc[2]
shrd \$62, @acc[4], @acc[3]
shrd \$62, @acc[5], @acc[4]
shrd \$62, @acc[6], @acc[5]
sar \$63, @acc[6] # sign as mask
xor $fx, $fx
sub @acc[6], $fx # sign as bit
xor @acc[6], @acc[0] # conditionally negate the result
xor @acc[6], @acc[1]
xor @acc[6], @acc[2]
xor @acc[6], @acc[3]
xor @acc[6], @acc[4]
xor @acc[6], @acc[5]
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
xor @acc[6], %rdx # conditionally negate |f0|
xor @acc[6], $g0 # conditionally negate |g0|
add $fx, %rdx
add $fx, $g0
ret
.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62
___
} }
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi");
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
$code.=<<___;
.type __ab_approximation_62,\@abi-omnipotent
.align 32
__ab_approximation_62:
mov 8*5($in_ptr), @a[2] # load |a| in reverse order
mov 8*11($in_ptr), @b[2] # load |b| in reverse order
mov 8*4($in_ptr), @a[1]
mov 8*10($in_ptr), @b[1]
mov 8*3($in_ptr), @a[0]
mov 8*9($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov 8*2($in_ptr), @a[0]
mov 8*8($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... ones before top-most, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov 8*1($in_ptr), @a[0]
mov 8*7($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov 8*0($in_ptr), @a[0]
mov 8*6($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
jmp __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
___
}
$code.=<<___;
.type __inner_loop_62,\@abi-omnipotent
.align 8
.long 0
__inner_loop_62:
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov \$1, $g1 # |g1|=1
mov $in_ptr, 8(%rsp)
.Loop_62:
xor $t0, $t0
xor $t1, $t1
test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t2
mov $b_hi, $t3
cmovnz $b_lo, $t0
cmovnz $b_hi, $t1
sub $a_lo, $t2 # |b_|-|a_|
sbb $a_hi, $t3
mov $a_lo, $t4
mov $a_hi, $t5
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
sbb $t1, $a_hi
cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t3, $a_hi
cmovc $t4, $b_lo # |b_| = |a_|
cmovc $t5, $b_hi
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shrd \$1, $a_hi, $a_lo
shr \$1, $a_hi
test \$1, $t4 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, $cnt
jnz .Loop_62
mov 8(%rsp), $in_ptr
ret
.size __inner_loop_62,.-__inner_loop_62
___
}
print $code;
close STDOUT;

View file

@ -0,0 +1,995 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >4x better than
# modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
a, u = inp, 1
b, v = mod, 0
k = 31
mask = (1 << k) - 1
for i in range(0, 766 // k):
# __ab_approximation_31
n = max(a.bit_length(), b.bit_length())
if n < 64:
a_, b_ = a, b
else:
a_ = (a & mask) | ((a >> (n-k-2)) << k)
b_ = (b & mask) | ((b >> (n-k-2)) << k)
# __inner_loop_31
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, k):
if a_ & 1:
if a_ < b_:
a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
a_, f0, g0 = a_-b_, f0-f1, g0-g1
a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
# __smulx_383_n_shift_by_31
a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
if a < 0:
a, f0, g0 = -a, -f0, -g0
if b < 0:
b, f1, g1 = -b, -f1, -g1
# __smulx_767x63
u, v = u*f0 + v*g0, u*f1 + v*g1
if 766 % k:
f0, g0, f1, g1 = 1, 0, 0, 1
for j in range(0, 766 % k):
if a & 1:
if a < b:
a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
a, f0, g0 = a-b, f0-f1, g0-g1
a, f1, g1 = a >> 1, f1 << 1, g1 << 1
v = u*f1 + v*g1
if v < 0:
v += mod << (768 - mod.bit_length()) # left aligned
return v & (2**768 - 1) # to be reduced % mod
___
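# Unlike the k=62 reference earlier, this variant approximates |a| and |b| with
# the k low bits kept exactly plus the k+2 = 33 most significant bits spliced on
# top, so one 64-bit register feeds 31 inner-loop iterations. Standalone
# restatement of that packing (illustrative, hypothetical $python_demo holder):
$python_demo.=<<'___';
def approximate_31(x, n, k=31):
    # n is max(a.bit_length(), b.bit_length()) of the pair being approximated
    if n < 64:
        return x
    mask = (1 << k) - 1
    return (x & mask) | ((x >> (n - k - 2)) << k)

a = (1 << 383) | 0x55aa55aa            # e.g. a 384-bit value
ax = approximate_31(a, a.bit_length())
assert ax.bit_length() <= 64
assert ax & ((1 << 31) - 1) == a & ((1 << 31) - 1)     # low 31 bits preserved
assert ax >> 31 == a >> (a.bit_length() - 33)          # top 33 bits preserved
___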
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edi";
$frame = 8*11+2*512;
$code.=<<___;
.text
.globl ctx_inverse_mod_383
.type ctx_inverse_mod_383,\@function,4,"unwind"
.align 32
ctx_inverse_mod_383:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue
lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot
and \$-512, %rax # in the frame...
mov $out_ptr, 8*4(%rsp)
mov $nx_ptr, 8*5(%rsp)
mov 8*0($in_ptr), @acc[0] # load input
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*0($n_ptr), @acc[6] # load modulus
mov 8*1($n_ptr), @acc[7]
mov 8*2($n_ptr), @acc[8]
mov 8*3($n_ptr), @acc[9]
mov 8*4($n_ptr), @acc[10]
mov 8*5($n_ptr), @acc[11]
mov @acc[0], 8*0(%rax) # copy input to |a|
mov @acc[1], 8*1(%rax)
mov @acc[2], 8*2(%rax)
mov @acc[3], 8*3(%rax)
mov @acc[4], 8*4(%rax)
mov @acc[5], 8*5(%rax)
mov @acc[6], 8*6(%rax) # copy modulus to |b|
mov @acc[7], 8*7(%rax)
mov @acc[8], 8*8(%rax)
mov @acc[9], 8*9(%rax)
mov @acc[10], 8*10(%rax)
mov %rax, $in_ptr
mov @acc[11], 8*11(%rax)
################################# first iteration
mov \$31, $cnt
call __ab_approximation_31
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulx_383_n_shift_by_31
#mov $f0, 8*7(%rsp) # corrected |f0|
#mov $g0, 8*8(%rsp) # corrected |g0|
mov $f0, 8*12($out_ptr) # initialize |u| with |f0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulx_383_n_shift_by_31
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov $f0, 8*12($out_ptr) # initialize |v| with |f1|
################################# second iteration
xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call __smulx_383_n_shift_by_31
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call __smulx_383_n_shift_by_31
#mov $f0, 8*9(%rsp) # corrected |f1|
#mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*12($in_ptr), %rax # |u|
mov 8*18($in_ptr), @acc[3] # |v|
mov $f0, %rbx
mov %rax, @acc[2]
imulq 8*7(%rsp) # |u|*|f0|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq 8*8(%rsp) # |v|*|g0|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*6($out_ptr) # destination |u|
mov @acc[1], 8*7($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*8($out_ptr)
mov @acc[1], 8*9($out_ptr)
mov @acc[1], 8*10($out_ptr)
mov @acc[1], 8*11($out_ptr)
lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor
mov @acc[2], %rax
imulq %rbx # |u|*|f1|
mov %rax, @acc[0]
mov @acc[3], %rax
mov %rdx, @acc[1]
imulq %rcx # |v|*|g1|
add %rax, @acc[0]
adc %rdx, @acc[1]
mov @acc[0], 8*12($out_ptr) # destination |v|
mov @acc[1], 8*13($out_ptr)
sar \$63, @acc[1] # sign extension
mov @acc[1], 8*14($out_ptr)
mov @acc[1], 8*15($out_ptr)
mov @acc[1], 8*16($out_ptr)
mov @acc[1], 8*17($out_ptr)
___
for($i=2; $i<23; $i++) {
my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31"
: "__smulx_191_n_shift_by_31";
my $smul_767x63 = $i>11 ? "__smulx_767x63"
: "__smulx_383x63";
$code.=<<___;
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$31, $cnt
call __ab_approximation_31
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
mov $f1, 8*9(%rsp)
mov $g1, 8*10(%rsp)
mov \$256, $out_ptr
xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
call $smul_n_shift
mov $f0, 8*7(%rsp) # corrected |f0|
mov $g0, 8*8(%rsp) # corrected |g0|
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
call $smul_n_shift
mov $f0, 8*9(%rsp) # corrected |f1|
mov $g0, 8*10(%rsp) # corrected |g1|
mov 8*7(%rsp), $f0 # |f0|
mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
call __smulx_383x63
mov 8*9(%rsp), $f0 # |f1|
mov 8*10(%rsp), $g0 # |g1|
lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
call $smul_767x63
___
$code.=<<___ if ($i==11);
sar \$63, @acc[5] # sign extension
mov @acc[5], 8*6($out_ptr)
mov @acc[5], 8*7($out_ptr)
mov @acc[5], 8*8($out_ptr)
mov @acc[5], 8*9($out_ptr)
mov @acc[5], 8*10($out_ptr)
mov @acc[5], 8*11($out_ptr)
___
}
$code.=<<___;
################################# two[!] last iterations in one go
xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
mov \$53, $cnt # 31 + 766 % 31
#call __ab_approximation_31 # |a| and |b| are exact, just load
mov 8*0($in_ptr), @acc[0] # |a_lo|
#xor @acc[1], @acc[1] # |a_hi|
mov 8*6($in_ptr), @acc[2] # |b_lo|
#xor @acc[3], @acc[3] # |b_hi|
call __inner_loop_62
#mov $f0, 8*7(%rsp)
#mov $g0, 8*8(%rsp)
#mov $f1, 8*9(%rsp)
#mov $g1, 8*10(%rsp)
#mov 8*7(%rsp), $f0 # |f0|
#mov 8*8(%rsp), $g0 # |g0|
lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
#call __smulx_383x63
#mov 8*9(%rsp), $f0 # |f1|
#mov 8*10(%rsp), $g0 # |g1|
mov $f1, $f0
mov $g1, $g0
mov 8*4(%rsp), $out_ptr # original out_ptr
call __smulx_767x63
mov 8*5(%rsp), $in_ptr # original n_ptr
mov %rax, %rdx # top limb of the result
sar \$63, %rax # result's sign as mask
mov %rax, @acc[0] # mask |modulus|
mov %rax, @acc[1]
mov %rax, @acc[2]
and 8*0($in_ptr), @acc[0]
and 8*1($in_ptr), @acc[1]
mov %rax, @acc[3]
and 8*2($in_ptr), @acc[2]
and 8*3($in_ptr), @acc[3]
mov %rax, @acc[4]
and 8*4($in_ptr), @acc[4]
and 8*5($in_ptr), %rax
add @acc[0], @acc[6] # conditionally add |modulus|<<384
adc @acc[1], @acc[7]
adc @acc[2], @acc[8]
adc @acc[3], @acc[9]
adc @acc[4], %rcx
adc %rax, %rdx
mov @acc[6], 8*6($out_ptr) # store absolute value
mov @acc[7], 8*7($out_ptr)
mov @acc[8], 8*8($out_ptr)
mov @acc[9], 8*9($out_ptr)
mov %rcx, 8*10($out_ptr)
mov %rdx, 8*11($out_ptr)
lea $frame(%rsp), %r8 # size optimization
mov 8*0(%r8),%r15
.cfi_restore %r15
mov 8*1(%r8),%r14
.cfi_restore %r14
mov 8*2(%r8),%r13
.cfi_restore %r13
mov 8*3(%r8),%r12
.cfi_restore %r12
mov 8*4(%r8),%rbx
.cfi_restore %rbx
mov 8*5(%r8),%rbp
.cfi_restore %rbp
lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
ret
.cfi_endproc
.size ctx_inverse_mod_383,.-ctx_inverse_mod_383
___
########################################################################
# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers
# to the maximum bit-length of the *result*, and "63" - to the maximum
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
# The latter should not be taken literally, as they are always chosen so
# that "bad things" don't happen. For example, there comes a point when
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
# because past that point |f0| is always 1 and |g0| is always 0. And,
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
# perform full-width |u|*|f1| multiplication, half-width one with sign
# extension is sufficient...
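# At word level the contract of these helpers is simply a signed u*f + v*g,
# with each factor's sign folded into its operand first (the sar/xor/add
# sequences in the subroutines that follow), so the limb-wise multiplication
# itself is unsigned. Value-level sketch (illustrative, hypothetical
# $python_demo holder):
$python_demo.=<<'___';
def smul_acc(u, f, v, g):
    if f < 0:
        u, f = -u, -f                  # fold |f|'s sign into |u|
    if g < 0:
        v, g = -v, -g                  # fold |g|'s sign into |v|
    return u*f + v*g

assert smul_acc(7, -3, 5, 2) == 7*-3 + 5*2
___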
{
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
my $fx = @acc[9];
$code.=<<___;
.type __smulx_767x63,\@abi-omnipotent
.align 32
__smulx_767x63:
mov 8*0($in_ptr), @acc[0] # load |u|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov $f0, %rax
sar \$63, %rax # |f0|'s sign as mask
xor $fx, $fx # overrides in_ptr
sub %rax, $fx # |f0|'s sign as bit
mov $out_ptr, 8*1(%rsp)
mov $in_ptr, 8*2(%rsp)
lea 8*6($in_ptr), $in_ptr # pointer to |v|
xor %rax, $f0 # conditionally negate |f0|
add $fx, $f0
xor %rax, @acc[0] # conditionally negate |u|
xor %rax, @acc[1]
xor %rax, @acc[2]
xor %rax, @acc[3]
xor %rax, @acc[4]
xor @acc[5], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, %rax
mulx @acc[0], @acc[0], $fx # |u|*|f0|
mulx @acc[1], @acc[1], @acc[5]
add $fx, @acc[1]
___
for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___;
adc \$0, $fx
imulq %rdx
add $fx, %rax
adc \$0, %rdx
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov %rax, 8*5($out_ptr)
mov %rdx, 8*6($out_ptr)
sar \$63, %rdx # sign extension
mov %rdx, 8*7($out_ptr)
___
{
my $fx=$in_ptr;
$code.=<<___;
mov $g0, $f0 # load |g0|
mov $g0, %rax
mov 8*0($in_ptr), @acc[0] # load |v|
mov 8*1($in_ptr), @acc[1]
mov 8*2($in_ptr), @acc[2]
mov 8*3($in_ptr), @acc[3]
mov 8*4($in_ptr), @acc[4]
mov 8*5($in_ptr), @acc[5]
mov 8*6($in_ptr), @acc[6]
mov 8*7($in_ptr), @acc[7]
mov 8*8($in_ptr), @acc[8]
mov 8*9($in_ptr), @acc[9]
mov 8*10($in_ptr), @acc[10]
mov 8*11($in_ptr), @acc[11]
sar \$63, %rax # |g0|'s sign as mask
xor $fx, $fx # overrides in_ptr
sub %rax, $fx # |g0|'s sign as bit
xor %rax, $f0 # conditionally negate |g0|
add $fx, $f0
xor %rax, @acc[0] # conditionally negate |v|
xor %rax, @acc[1]
xor %rax, @acc[2]
xor %rax, @acc[3]
xor %rax, @acc[4]
xor %rax, @acc[5]
xor %rax, @acc[6]
xor %rax, @acc[7]
xor %rax, @acc[8]
xor %rax, @acc[9]
xor %rax, @acc[10]
xor %rax, @acc[11]
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
adc \$0, @acc[7]
adc \$0, @acc[8]
adc \$0, @acc[9]
adc \$0, @acc[10]
adc \$0, @acc[11]
mulx @acc[0], @acc[0], %rax # |v|*|g0|
mulx @acc[1], @acc[1], $fx
add %rax, @acc[1]
___
for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___;
mulx @acc[11], @acc[11], $fx
mov 8*1(%rsp), %rdx # out_ptr
mov 8*2(%rsp), $in_ptr # restore original in_ptr
adc @acc[11], %rax
add 8*0(%rdx), @acc[0] # accumulate |u|*|f0|
adc 8*1(%rdx), @acc[1]
adc 8*2(%rdx), @acc[2]
adc 8*3(%rdx), @acc[3]
adc 8*4(%rdx), @acc[4]
adc 8*5(%rdx), @acc[5]
adc 8*6(%rdx), @acc[6]
mov 8*7(%rdx), @acc[11] # sign extension
adc @acc[11], @acc[7]
adc @acc[11], @acc[8]
adc @acc[11], @acc[9]
adc @acc[11], @acc[10]
adc @acc[11], %rax
mov %rdx, $out_ptr # restore original out_ptr
mov @acc[0], 8*0(%rdx)
mov @acc[1], 8*1(%rdx)
mov @acc[2], 8*2(%rdx)
mov @acc[3], 8*3(%rdx)
mov @acc[4], 8*4(%rdx)
mov @acc[5], 8*5(%rdx)
mov @acc[6], 8*6(%rdx)
mov @acc[7], 8*7(%rdx)
mov @acc[8], 8*8(%rdx)
mov @acc[9], 8*9(%rdx)
mov @acc[10], 8*10(%rdx)
mov %rax, 8*11(%rdx)
ret
.size __smulx_767x63,.-__smulx_767x63
___
}
$code.=<<___;
.type __smulx_383x63,\@abi-omnipotent
.align 32
__smulx_383x63:
___
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), @acc[4]
mov $k+8*5($in_ptr), @acc[5]
mov $f0, $fx
sar \$63, $fx # |f0|'s sign as mask (or |g0|'s)
xor %rax, %rax
sub $fx, %rax # |f0|'s sign as bit (or |g0|'s)
xor $fx, $f0 # conditionally negate |f0|
add %rax, $f0
xor $fx, @acc[0] # conditionally negate |u| (or |v|)
xor $fx, @acc[1]
xor $fx, @acc[2]
xor $fx, @acc[3]
xor $fx, @acc[4]
xor $fx, @acc[5]
add %rax, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|)
mulx @acc[1], @acc[1], %rax
add $fx, @acc[1]
___
for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___ if ($j==0);
mulx @acc[$i], @acc[$i], %rax
mov $g0, $f0
adc $fx, @acc[$i]
mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
mulx @acc[$i], @acc[$i], %rax
adc $fx, @acc[$i]
add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), @acc[5]
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov @acc[5], 8*5($out_ptr)
ret
.size __smulx_383x63,.-__smulx_383x63
___
########################################################################
# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of
# the names refers to maximum bit-lengths of |a| and |b|. As already
# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always
# chosen so that "bad things" don't happen. For example, so that the
# sum of the products doesn't overflow, and that the final result is
# never wider than inputs...
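# Value-level contract, matching the "a, b = (a*f0 + b*g0) >> k, ..." lines of
# the Python reference at the top of this file: shift the signed combination
# right by k, take the absolute value, and flip the returned |f|,|g| so the
# caller keeps working with non-negative |a|,|b|. Illustrative sketch
# (hypothetical $python_demo holder):
$python_demo.=<<'___';
def smul_n_shift(a, b, f, g, k=31):
    t = (a*f + b*g) >> k
    if t < 0:
        return -t, -f, -g              # negate the result and the factors together
    return t, f, g

assert smul_n_shift(100 << 31, 3 << 31, -1, 0) == (100, 1, 0)
___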
{
$code.=<<___;
.type __smulx_383_n_shift_by_31,\@abi-omnipotent
.align 32
__smulx_383_n_shift_by_31:
mov $f0, @acc[8]
xor @acc[6], @acc[6]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov $k+8*3($in_ptr), @acc[3]
mov $k+8*4($in_ptr), @acc[4]
mov $k+8*5($in_ptr), @acc[5]
mov %rdx, %rax
sar \$63, %rax # |f0|'s sign as mask (or |g0|'s)
xor $fx, $fx
sub %rax, $fx # |f0|'s sign as bit (or |g0|'s)
xor %rax, %rdx # conditionally negate |f0| (or |g0|)
add $fx, %rdx
xor %rax, @acc[0] # conditionally negate |a| (or |b|)
xor %rax, @acc[1]
xor %rax, @acc[2]
xor %rax, @acc[3]
xor %rax, @acc[4]
xor @acc[5], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, %rax
mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|)
mulx @acc[1], @acc[1], @acc[5]
add $fx, @acc[1]
___
for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) {
$code.=<<___;
mulx @acc[$i], @acc[$i], $a
adc $b, @acc[$i]
___
($a, $b) = ($b, $a);
}
$code.=<<___ if ($j==0);
adc \$0, $fx
imulq %rdx
add $fx, %rax
adc %rdx, @acc[6]
mov $g0, %rdx
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov %rax, 8*5($out_ptr)
___
}
$code.=<<___;
adc \$0, $fx
imulq %rdx
add $fx, %rax
adc \$0, %rdx
add 8*0($out_ptr), @acc[0]
adc 8*1($out_ptr), @acc[1]
adc 8*2($out_ptr), @acc[2]
adc 8*3($out_ptr), @acc[3]
adc 8*4($out_ptr), @acc[4]
adc 8*5($out_ptr), %rax
adc %rdx, @acc[6]
mov $f0, %rdx
shrd \$31, @acc[1], @acc[0]
shrd \$31, @acc[2], @acc[1]
shrd \$31, @acc[3], @acc[2]
shrd \$31, @acc[4], @acc[3]
shrd \$31, %rax, @acc[4]
shrd \$31, @acc[6], %rax
sar \$63, @acc[6] # sign as mask
xor $fx, $fx
sub @acc[6], $fx # sign as bit
xor @acc[6], @acc[0] # conditionally negate the result
xor @acc[6], @acc[1]
xor @acc[6], @acc[2]
xor @acc[6], @acc[3]
xor @acc[6], @acc[4]
xor @acc[6], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, @acc[2]
adc \$0, @acc[3]
adc \$0, @acc[4]
adc \$0, %rax
mov @acc[0], 8*0($out_ptr)
mov @acc[1], 8*1($out_ptr)
mov @acc[2], 8*2($out_ptr)
mov @acc[3], 8*3($out_ptr)
mov @acc[4], 8*4($out_ptr)
mov %rax, 8*5($out_ptr)
xor @acc[6], %rdx # conditionally negate |f0|
xor @acc[6], $g0 # conditionally negate |g0|
add $fx, %rdx
add $fx, $g0
ret
.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31
___
} {
$code.=<<___;
.type __smulx_191_n_shift_by_31,\@abi-omnipotent
.align 32
__smulx_191_n_shift_by_31:
mov $f0, @acc[8]
___
my $f0 = @acc[8];
for($j=0; $j<2; $j++) {
my $k = 8*6*$j;
my @acc=@acc;
@acc=@acc[3..5] if ($j);
$code.=<<___;
mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|)
mov $k+8*1($in_ptr), @acc[1]
mov $k+8*2($in_ptr), @acc[2]
mov %rdx, %rax
sar \$63, %rax # |f0|'s sign as mask (or |g0|'s)
xor $fx, $fx
sub %rax, $fx # |f0|'s sign as bit (or |g0|'s)
xor %rax, %rdx # conditionally negate |f0| (or |g0|)
add $fx, %rdx
xor %rax, @acc[0] # conditionally negate |a| (or |b|)
xor %rax, @acc[1]
xor @acc[2], %rax
add $fx, @acc[0]
adc \$0, @acc[1]
adc \$0, %rax
mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|)
mulx @acc[1], @acc[1], @acc[2]
add $fx, @acc[1]
adc \$0, @acc[2]
imulq %rdx
add %rax, @acc[2]
adc \$0, %rdx
___
$code.=<<___ if ($j==0);
mov %rdx, @acc[6]
mov $g0, %rdx
___
}
$code.=<<___;
add @acc[0], @acc[3]
adc @acc[1], @acc[4]
adc @acc[2], @acc[5]
adc %rdx, @acc[6]
mov $f0, %rdx
shrd \$31, @acc[4], @acc[3]
shrd \$31, @acc[5], @acc[4]
shrd \$31, @acc[6], @acc[5]
sar \$63, @acc[6] # sign as mask
xor $fx, $fx
sub @acc[6], $fx # sign as bit
xor @acc[6], @acc[3] # conditionally negate the result
xor @acc[6], @acc[4]
xor @acc[6], @acc[5]
add $fx, @acc[3]
adc \$0, @acc[4]
adc \$0, @acc[5]
mov @acc[3], 8*0($out_ptr)
mov @acc[4], 8*1($out_ptr)
mov @acc[5], 8*2($out_ptr)
xor @acc[6], %rdx # conditionally negate |f0|
xor @acc[6], $g0 # conditionally negate |g0|
add $fx, %rdx
add $fx, $g0
ret
.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31
___
} }
{
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15");
my ($fg0, $fg1, $bias) = ($g0, $g1, $t4);
my ($a_, $b_) = ($a_lo, $b_lo);
{
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);
$code.=<<___;
.type __ab_approximation_31,\@abi-omnipotent
.align 32
__ab_approximation_31:
mov 8*5($in_ptr), @a[2] # load |a| in reverse order
mov 8*11($in_ptr), @b[2] # load |b| in reverse order
mov 8*4($in_ptr), @a[1]
mov 8*10($in_ptr), @b[1]
mov 8*3($in_ptr), @a[0]
mov 8*9($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # check top-most limbs, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*2($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*8($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... ones before top-most, ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*1($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*7($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
mov 8*0($in_ptr), @a[0]
cmovz @b[0], @b[1]
mov 8*6($in_ptr), @b[0]
mov @a[2], $t0
or @b[2], $t0 # ... and ones before that ...
cmovz @a[1], @a[2]
cmovz @b[1], @b[2]
cmovz @a[0], @a[1]
cmovz @b[0], @b[1]
mov @a[2], $t0
or @b[2], $t0
bsr $t0, %rcx
lea 1(%rcx), %rcx
cmovz @a[0], @a[2]
cmovz @b[0], @b[2]
cmovz $t0, %rcx
neg %rcx
#and \$63, %rcx # debugging artefact
shldq %cl, @a[1], @a[2] # align second limb to the left
shldq %cl, @b[1], @b[2]
mov \$0x7FFFFFFF, %eax
and %rax, @a[0]
and %rax, @b[0]
andn @a[2], %rax, @a[2]
andn @b[2], %rax, @b[2]
or @a[2], @a[0]
or @b[2], @b[0]
jmp __inner_loop_31
ret
.size __ab_approximation_31,.-__ab_approximation_31
___
}
$code.=<<___;
.type __inner_loop_31,\@abi-omnipotent
.align 32
__inner_loop_31: ################# by Thomas Pornin
mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0
mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1
mov \$0x7FFFFFFF7FFFFFFF, $bias
.Loop_31:
cmp $b_, $a_ # if |a_|<|b_|, swap the variables
mov $a_, $t0
mov $b_, $t1
mov $fg0, $t2
mov $fg1, $t3
cmovb $b_, $a_
cmovb $t0, $b_
cmovb $fg1, $fg0
cmovb $t2, $fg1
sub $b_, $a_ # |a_|-|b_|
sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1|
add $bias, $fg0
test \$1, $t0 # if |a_| was even, roll back
cmovz $t0, $a_
cmovz $t1, $b_
cmovz $t2, $fg0
cmovz $t3, $fg1
shr \$1, $a_ # |a_|>>=1
add $fg1, $fg1 # |f1|<<=1, |g1|<<=1
sub $bias, $fg1
sub \$1, $cnt
jnz .Loop_31
shr \$32, $bias
mov %ecx, %edx # $fg0 -> $f0
mov ${fg1}d, ${f1}d
shr \$32, $g0
shr \$32, $g1
sub $bias, $f0 # remove the bias
sub $bias, $g0
sub $bias, $f1
sub $bias, $g1
ret
.size __inner_loop_31,.-__inner_loop_31
.type __inner_loop_62,\@abi-omnipotent
.align 32
__inner_loop_62:
mov \$1, $f0 # |f0|=1
xor $g0, $g0 # |g0|=0
xor $f1, $f1 # |f1|=0
mov \$1, $g1 # |g1|=1
.Loop_62:
xor $t0, $t0
test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
mov $b_lo, $t1
cmovnz $b_lo, $t0
sub $a_lo, $t1 # |b_|-|a_|
mov $a_lo, $t2
sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
cmovc $t2, $b_lo # |b_| = |a_|
mov $f0, $t0 # exchange |f0| and |f1|
cmovc $f1, $f0
cmovc $t0, $f1
mov $g0, $t1 # exchange |g0| and |g1|
cmovc $g1, $g0
cmovc $t1, $g1
xor $t0, $t0
xor $t1, $t1
shr \$1, $a_lo
test \$1, $t2 # if |a_| was odd, then we'll be subtracting...
cmovnz $f1, $t0
cmovnz $g1, $t1
add $f1, $f1 # |f1|<<=1
add $g1, $g1 # |g1|<<=1
sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
sub \$1, $cnt
jnz .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
___
}
print $code;
close STDOUT;

122
blst/asm/div3w-armv8.pl Executable file
View file

@ -0,0 +1,122 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$code.=<<___;
.text
.globl div_3_limbs
.type div_3_limbs,%function
.align 5
div_3_limbs:
ldp x4,x5,[x0] // load R
eor x0,x0,x0 // Q = 0
mov x3,#64 // loop counter
nop
.Loop:
subs x6,x4,x1 // R - D
add x0,x0,x0 // Q <<= 1
sbcs x7,x5,x2
add x0,x0,#1 // Q + speculative bit
csel x4,x4,x6,lo // select between R and R - D
extr x1,x2,x1,#1 // D >>= 1
csel x5,x5,x7,lo
lsr x2,x2,#1
sbc x0,x0,xzr // subtract speculative bit
sub x3,x3,#1
cbnz x3,.Loop
asr x3,x0,#63 // top bit -> mask
add x0,x0,x0 // Q <<= 1
subs x6,x4,x1 // R - D
add x0,x0,#1 // Q + speculative bit
sbcs x7,x5,x2
sbc x0,x0,xzr // subtract speculative bit
orr x0,x0,x3 // all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
___
{
my ($div_rem, $divisor, $quot) = map("x$_",(0..2));
my @div = map("x$_",(3..4));
my @acc = map("x$_",(5..7));
my @t = map("x$_",(8..11));
$code.=<<___;
.globl quot_rem_128
.type quot_rem_128,%function
.align 5
quot_rem_128:
ldp @div[0],@div[1],[$divisor]
mul @acc[0],@div[0],$quot // divisor[0:1] * quotient
umulh @acc[1],@div[0],$quot
mul @t[3], @div[1],$quot
umulh @acc[2],@div[1],$quot
ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend
ldr @t[2],[$div_rem,#16]
adds @acc[1],@acc[1],@t[3]
adc @acc[2],@acc[2],xzr
subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient
sbcs @t[1],@t[1],@acc[1]
sbcs @t[2],@t[2],@acc[2]
sbc @acc[0],xzr,xzr // borrow -> mask
add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ...
and @div[0],@div[0],@acc[0]
and @div[1],@div[1],@acc[0]
adds @t[0],@t[0],@div[0] // ... and add divisor
adc @t[1],@t[1],@div[1]
stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder
str $quot,[$div_rem,#16] // and one limb of the quotient
mov x0,$quot // return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
.globl quot_rem_64
.type quot_rem_64,%function
.align 5
quot_rem_64:
ldr @div[0],[$divisor]
ldr @t[0],[$div_rem] // load 1 limb of the dividend
mul @acc[0],@div[0],$quot // divisor * quotient
sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient
stp @t[0],$quot,[$div_rem] // save remainder and quotient
mov x0,$quot // return quotient
ret
.size quot_rem_64,.-quot_rem_64
___
}
print $code;
close STDOUT;

184
blst/asm/div3w-x86_64.pl Executable file
View file

@ -0,0 +1,184 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
$c_ref=<<'___';
/*
* |div_top| points at two most significant limbs of the dividend, |d_hi|
* and |d_lo| are two most significant limbs of the divisor. If divisor
* is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
* The divisor is required to be "bitwise left-aligned," and the dividend's
* top limbs must not be larger than the divisor's. The latter limitation
* can be problematic in the first iteration of multi-precision division,
* where in the most general case the condition would have to be "smaller."
* The subroutine considers four limbs, two of which are "overlapping,"
* hence the name... Another way to look at it is to think of the pair
* of the dividend's limbs being suffixed with a zero:
* +-------+-------+-------+
* R | | | 0 |
* +-------+-------+-------+
* +-------+-------+
* D | | |
* +-------+-------+
*/
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
limb_t Q = 0, mask;
size_t i;
for (i = 0; i < LIMB_BITS; i++) {
Q <<= 1;
mask = (R >= D);
Q |= mask;
R -= (D & ((llimb_t)0 - mask));
D >>= 1;
}
mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */
Q <<= 1;
Q |= (R >= D);
return (Q | mask);
}
___
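# A direct Python transliteration of the C reference above (64-bit limbs
# assumed; illustrative, hypothetical $python_demo holder), handy for poking at
# the estimate on the host:
$python_demo.=<<'___';
LIMB_BITS = 64
LIMB_MASK = (1 << LIMB_BITS) - 1

def div_3_limbs(div_top, d_lo, d_hi):
    R = (div_top[1] << LIMB_BITS) | div_top[0]
    D = (d_hi << LIMB_BITS) | d_lo
    Q = 0
    for _ in range(LIMB_BITS):
        Q <<= 1
        if R >= D:
            Q |= 1
            R -= D
        D >>= 1
    mask = LIMB_MASK if Q >> (LIMB_BITS - 1) else 0   # would the last shift overflow?
    Q = ((Q << 1) | (R >= D)) & LIMB_MASK
    return Q | mask                                   # all ones on overflow

# e.g. top limbs {0, 1} (R = 2^64) over the left-aligned divisor 2^127:
assert div_3_limbs([0, 1], 0, 1 << 63) == 2
___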
$code.=<<___;
.text
.globl div_3_limbs
.hidden div_3_limbs
.type div_3_limbs,\@function,3
.align 32
div_3_limbs:
mov (%rdi),%r8 # load R.lo
mov 8(%rdi),%r9 # load R.hi
xor %rax,%rax # Q = 0
mov \$64,%ecx # loop counter
.Loop:
mov %r8,%r10 # put aside R
sub %rsi,%r8 # R -= D
mov %r9,%r11
sbb %rdx,%r9
lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit
mov %rdx,%rdi
cmovc %r10,%r8 # restore R if R - D borrowed
cmovc %r11,%r9
sbb \$0,%rax # subtract speculative bit
shl \$63,%rdi
shr \$1,%rsi
shr \$1,%rdx
or %rdi,%rsi # D >>= 1
sub \$1,%ecx
jnz .Loop
lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit
sar \$63,%rax # top bit -> mask
sub %rsi,%r8 # R -= D
sbb %rdx,%r9
sbb \$0,%rcx # subtract speculative bit
or %rcx,%rax # all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
___
########################################################################
# Calculate the remainder and adjust the quotient, which can be off by one.
# Then save the quotient in the limb next to the top limb of the remainder.
# There is room for it, because the remainder (the next iteration's dividend)
# gets shorter by one limb.
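# Value-level model of quot_rem_128 (64-bit limbs assumed; illustrative,
# hypothetical $python_demo holder): exact remainder for a 3-limb chunk of the
# dividend, with the estimate corrected down by one when the subtraction
# borrows. Returned in the same layout the assembly stores back.
$python_demo.=<<'___';
def quot_rem_128(div_rem, divisor, quot):
    dividend = div_rem[0] | (div_rem[1] << 64) | (div_rem[2] << 128)
    d = divisor[0] | (divisor[1] << 64)
    r = dividend - d*quot
    if r < 0:                          # borrowed: the estimate was one too large
        quot -= 1
        r += d
    return [r & ((1 << 64) - 1), r >> 64, quot]   # 2 remainder limbs, then quotient

d = [0, 1 << 63]                       # divisor = 2^127, left-aligned
assert quot_rem_128([5, 0, 1], d, 2) == [5, 0, 2]   # 2^128 + 5 == 2*2^127 + 5
assert quot_rem_128([5, 0, 1], d, 3) == [5, 0, 2]   # off-by-one estimate corrected
___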
{
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
my @acc = ("%r8", "%r9", "%rdx");
my @tmp = ("%r10", "%r11", "%rax");
$code.=<<___;
.globl quot_rem_128
.hidden quot_rem_128
.type quot_rem_128,\@function,3
.align 32
quot_rem_128:
mov %rdx, %rax
mov %rdx, $quotient
mulq 0($divisor) # divisor[0:1] * quotient
mov %rax, @acc[0]
mov $quotient, %rax
mov %rdx, @acc[1]
mulq 8($divisor)
add %rax, @acc[1]
adc \$0, %rdx # %rdx is @acc[2]
mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend
mov 8($div_rem), @tmp[1]
mov 16($div_rem), @tmp[2]
sub @acc[0], @tmp[0] # dividend - divisor * quotient
sbb @acc[1], @tmp[1]
sbb @acc[2], @tmp[2]
sbb @acc[0], @acc[0] # borrow -> mask
add @acc[0], $quotient # if borrowed, adjust the quotient ...
mov @acc[0], @acc[1]
and 0($divisor), @acc[0]
and 8($divisor), @acc[1]
add @acc[0], @tmp[0] # ... and add divisor
adc @acc[1], @tmp[1]
mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ...
mov @tmp[1], 8($div_rem)
mov $quotient, 16($div_rem) # ... and 1 limb of the quotient
mov $quotient, %rax # return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
########################################################################
# Unlike the 128-bit case above, the quotient is exact. As a result just one
# limb of the dividend is sufficient to calculate the remainder...
.globl quot_rem_64
.hidden quot_rem_64
.type quot_rem_64,\@function,3
.align 32
quot_rem_64:
mov %rdx, %rax # return quotient
imulq 0($divisor), %rdx # divisor[0] * quotient
mov 0($div_rem), @tmp[0] # load 1 limb of the dividend
sub %rdx, @tmp[0] # dividend - divisor * quotient
mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ...
mov %rax, 8($div_rem) # ... and 1 limb of the quotient
ret
.size quot_rem_64,.-quot_rem_64
___
}
print $code;
close STDOUT;

409
blst/asm/mul_mont_256-armv8.pl Executable file

@ -0,0 +1,409 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# As for "sparse" in subroutine names, see commentary in the
# asm/mulx_mont_256-x86_64.pl module.
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4);
@mod=map("x$_",(5..8));
$bi="x9";
@a=map("x$_",(10..13));
@tmp=map("x$_",(14..17));
@acc=map("x$_",(19..24));
$m0=$n_ptr;
$code.=<<___;
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,%function
.align 5
mul_mont_sparse_256:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldp @a[0],@a[1],[$a_ptr]
ldr $bi, [$b_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
mul @acc[0],@a[0],$bi
ldp @mod[0],@mod[1],[$n_ptr]
mul @acc[1],@a[1],$bi
ldp @mod[2],@mod[3],[$n_ptr,#16]
mul @acc[2],@a[2],$bi
mul @acc[3],@a[3],$bi
umulh @tmp[0],@a[0],$bi
umulh @tmp[1],@a[1],$bi
mul $m0,$n0,@acc[0]
umulh @tmp[2],@a[2],$bi
umulh @tmp[3],@a[3],$bi
adds @acc[1],@acc[1],@tmp[0]
//mul @tmp[0],@mod[0],$m0
adcs @acc[2],@acc[2],@tmp[1]
mul @tmp[1],@mod[1],$m0
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@mod[2],$m0
adc @acc[4],xzr, @tmp[3]
mul @tmp[3],@mod[3],$m0
___
for ($i=1;$i<4;$i++) {
$code.=<<___;
ldr $bi,[$b_ptr,8*$i]
subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @acc[1],@acc[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @acc[2],@acc[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @acc[3],@acc[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @acc[4],@acc[4],xzr
adds @acc[0],@acc[1],@tmp[0]
mul @tmp[0],@a[0],$bi
adcs @acc[1],@acc[2],@tmp[1]
mul @tmp[1],@a[1],$bi
adcs @acc[2],@acc[3],@tmp[2]
mul @tmp[2],@a[2],$bi
adcs @acc[3],@acc[4],@tmp[3]
mul @tmp[3],@a[3],$bi
adc @acc[4],xzr,xzr
adds @acc[0],@acc[0],@tmp[0]
umulh @tmp[0],@a[0],$bi
adcs @acc[1],@acc[1],@tmp[1]
umulh @tmp[1],@a[1],$bi
adcs @acc[2],@acc[2],@tmp[2]
mul $m0,$n0,@acc[0]
umulh @tmp[2],@a[2],$bi
adcs @acc[3],@acc[3],@tmp[3]
umulh @tmp[3],@a[3],$bi
adc @acc[4],@acc[4],xzr
adds @acc[1],@acc[1],@tmp[0]
//mul @tmp[0],@mod[0],$m0
adcs @acc[2],@acc[2],@tmp[1]
mul @tmp[1],@mod[1],$m0
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@mod[2],$m0
adc @acc[4],@acc[4],@tmp[3]
mul @tmp[3],@mod[3],$m0
___
}
$code.=<<___;
subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @acc[1],@acc[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @acc[2],@acc[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @acc[3],@acc[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @acc[4],@acc[4],xzr
adds @acc[0],@acc[1],@tmp[0]
adcs @acc[1],@acc[2],@tmp[1]
adcs @acc[2],@acc[3],@tmp[2]
adcs @acc[3],@acc[4],@tmp[3]
adc @acc[4],xzr,xzr
subs @tmp[0],@acc[0],@mod[0]
sbcs @tmp[1],@acc[1],@mod[1]
sbcs @tmp[2],@acc[2],@mod[2]
sbcs @tmp[3],@acc[3],@mod[3]
sbcs xzr, @acc[4],xzr
csel @acc[0],@acc[0],@tmp[0],lo
csel @acc[1],@acc[1],@tmp[1],lo
csel @acc[2],@acc[2],@tmp[2],lo
csel @acc[3],@acc[3],@tmp[3],lo
stp @acc[0],@acc[1],[$r_ptr]
stp @acc[2],@acc[3],[$r_ptr,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.size mul_mont_sparse_256,.-mul_mont_sparse_256
___
{
my @acc = (@a,@acc[0..3]);
my @a = @mod;
$code.=<<___;
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,%function
.align 5
sqr_mont_sparse_256:
paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
mov $n0,$n_ptr
////////////////////////////////////////////////////////////////
// | | | | | |a1*a0| |
// | | | | |a2*a0| | |
// | |a3*a2|a3*a0| | | |
// | | | |a2*a1| | | |
// | | |a3*a1| | | | |
// *| | | | | | | | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
// |--+--+--+--+--+--+--+--|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x]
//
// "can't overflow" below mark carrying into high part of
// multiplication result, which can't overflow, because it
// can never be all ones.
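// In other words, with a = a0 + a1*2^64 + a2*2^128 + a3*2^192:
//   a^2 = sum_i a_i^2 * 2^(128*i) + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
// The doubling of the cross products is the "* 2" row above, and the
// squares a_i^2 are folded in afterwards.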
mul @acc[1],@a[1],@a[0] // a[1]*a[0]
umulh @tmp[1],@a[1],@a[0]
mul @acc[2],@a[2],@a[0] // a[2]*a[0]
umulh @tmp[2],@a[2],@a[0]
mul @acc[3],@a[3],@a[0] // a[3]*a[0]
umulh @acc[4],@a[3],@a[0]
adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication
mul @tmp[0],@a[2],@a[1] // a[2]*a[1]
umulh @tmp[1],@a[2],@a[1]
adcs @acc[3],@acc[3],@tmp[2]
mul @tmp[2],@a[3],@a[1] // a[3]*a[1]
umulh @tmp[3],@a[3],@a[1]
adc @acc[4],@acc[4],xzr // can't overflow
mul @acc[5],@a[3],@a[2] // a[3]*a[2]
umulh @acc[6],@a[3],@a[2]
adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication
mul @acc[0],@a[0],@a[0] // a[0]*a[0]
adc @tmp[2],@tmp[3],xzr // can't overflow
adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication
umulh @a[0],@a[0],@a[0]
adcs @acc[4],@acc[4],@tmp[1]
mul @tmp[1],@a[1],@a[1] // a[1]*a[1]
adcs @acc[5],@acc[5],@tmp[2]
umulh @a[1],@a[1],@a[1]
adc @acc[6],@acc[6],xzr // can't overflow
adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2
mul @tmp[2],@a[2],@a[2] // a[2]*a[2]
adcs @acc[2],@acc[2],@acc[2]
umulh @a[2],@a[2],@a[2]
adcs @acc[3],@acc[3],@acc[3]
mul @tmp[3],@a[3],@a[3] // a[3]*a[3]
adcs @acc[4],@acc[4],@acc[4]
umulh @a[3],@a[3],@a[3]
adcs @acc[5],@acc[5],@acc[5]
adcs @acc[6],@acc[6],@acc[6]
adc @acc[7],xzr,xzr
adds @acc[1],@acc[1],@a[0] // +a[i]*a[i]
adcs @acc[2],@acc[2],@tmp[1]
adcs @acc[3],@acc[3],@a[1]
adcs @acc[4],@acc[4],@tmp[2]
adcs @acc[5],@acc[5],@a[2]
adcs @acc[6],@acc[6],@tmp[3]
adc @acc[7],@acc[7],@a[3]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
adds @acc[0],@acc[0],@acc[4] // accumulate upper half
adcs @acc[1],@acc[1],@acc[5]
adcs @acc[2],@acc[2],@acc[6]
adcs @acc[3],@acc[3],@acc[7]
adc @acc[4],xzr,xzr
subs @tmp[0],@acc[0],@mod[0]
sbcs @tmp[1],@acc[1],@mod[1]
sbcs @tmp[2],@acc[2],@mod[2]
sbcs @tmp[3],@acc[3],@mod[3]
sbcs xzr, @acc[4],xzr
csel @acc[0],@acc[0],@tmp[0],lo
csel @acc[1],@acc[1],@tmp[1],lo
csel @acc[2],@acc[2],@tmp[2],lo
csel @acc[3],@acc[3],@tmp[3],lo
stp @acc[0],@acc[1],[$r_ptr]
stp @acc[2],@acc[3],[$r_ptr,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
autiasp
ret
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
___
}
{
my @a = (@a, $bi);
$code.=<<___;
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,%function
.align 5
from_mont_256:
paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov $n0,$n_ptr
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
subs @tmp[0],@a[0],@mod[0]
sbcs @tmp[1],@a[1],@mod[1]
sbcs @tmp[2],@a[2],@mod[2]
sbcs @tmp[3],@a[3],@mod[3]
csel @a[0],@a[0],@tmp[0],lo
csel @a[1],@a[1],@tmp[1],lo
csel @a[2],@a[2],@tmp[2],lo
csel @a[3],@a[3],@tmp[3],lo
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ldr x29,[sp],#16
autiasp
ret
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,%function
.align 5
redc_mont_256:
paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov $n0,$n_ptr
ldp @a[0],@a[1],[$a_ptr]
ldp @a[2],@a[3],[$a_ptr,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
ldp @tmp[0],@tmp[1],[$a_ptr,#32]
ldp @tmp[2],@tmp[3],[$a_ptr,#48]
adds @a[0],@a[0],@tmp[0]
adcs @a[1],@a[1],@tmp[1]
adcs @a[2],@a[2],@tmp[2]
adcs @a[3],@a[3],@tmp[3]
adc @a[4],xzr,xzr
subs @tmp[0],@a[0],@mod[0]
sbcs @tmp[1],@a[1],@mod[1]
sbcs @tmp[2],@a[2],@mod[2]
sbcs @tmp[3],@a[3],@mod[3]
sbcs xzr, @a[4],xzr
csel @a[0],@a[0],@tmp[0],lo
csel @a[1],@a[1],@tmp[1],lo
csel @a[2],@a[2],@tmp[2],lo
csel @a[3],@a[3],@tmp[3],lo
stp @a[0],@a[1],[$r_ptr]
stp @a[2],@a[3],[$r_ptr,#16]
ldr x29,[sp],#16
autiasp
ret
.size redc_mont_256,.-redc_mont_256
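// __mul_by_1_mont_256 below is a word-by-word Montgomery reduction: it
// returns a * 2^-256 mod n, possibly one multiple of n too high, which the
// callers settle with a final conditional subtraction.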
.type __mul_by_1_mont_256,%function
.align 5
__mul_by_1_mont_256:
mul $m0,$n0,@a[0]
ldp @mod[0],@mod[1],[$b_ptr]
ldp @mod[2],@mod[3],[$b_ptr,#16]
___
for ($i=1;$i<4;$i++) {
$code.=<<___;
//mul @tmp[0],@mod[0],$m0
mul @tmp[1],@mod[1],$m0
mul @tmp[2],@mod[2],$m0
mul @tmp[3],@mod[3],$m0
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @a[1],@a[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @a[2],@a[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @a[3],@a[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @a[4],xzr,xzr
adds @a[0],@a[1],@tmp[0]
adcs @a[1],@a[2],@tmp[1]
adcs @a[2],@a[3],@tmp[2]
mul $m0,$n0,@a[0]
adc @a[3],@a[4],@tmp[3]
___
}
$code.=<<___;
//mul @tmp[0],@mod[0],$m0
mul @tmp[1],@mod[1],$m0
mul @tmp[2],@mod[2],$m0
mul @tmp[3],@mod[3],$m0
subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0]
umulh @tmp[0],@mod[0],$m0
adcs @a[1],@a[1],@tmp[1]
umulh @tmp[1],@mod[1],$m0
adcs @a[2],@a[2],@tmp[2]
umulh @tmp[2],@mod[2],$m0
adcs @a[3],@a[3],@tmp[3]
umulh @tmp[3],@mod[3],$m0
adc @a[4],xzr,xzr
adds @a[0],@a[1],@tmp[0]
adcs @a[1],@a[2],@tmp[1]
adcs @a[2],@a[3],@tmp[2]
adc @a[3],@a[4],@tmp[3]
ret
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256
___
}
print $code;
close STDOUT;

2015
blst/asm/mul_mont_384-armv8.pl Executable file

File diff suppressed because it is too large

513
blst/asm/mulq_mont_256-x86_64.pl Executable file

@ -0,0 +1,513 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# As for "sparse" in subroutine names, see commentary in the
# asm/mulx_mont_256-x86_64.pl module.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";
{ ############################################################## 256 bits
my @acc=map("%r$_",(9..15));
{ ############################################################## mulq
my ($hi, $a0) = ("%rbp", $r_ptr);
$code.=<<___;
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,\@function,5,"unwind"
.align 32
mul_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
push $r_ptr
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($b_org), %rax
mov 8*0($a_ptr), @acc[4]
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), @acc[3]
mov 8*3($a_ptr), $hi
mov $b_org, $b_ptr # evacuate from %rdx
mov %rax, @acc[6]
mulq @acc[4] # a[0]*b[0]
mov %rax, @acc[0]
mov @acc[6], %rax
mov %rdx, @acc[1]
call __mulq_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size mul_mont_sparse_256,.-mul_mont_sparse_256
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,\@function,4,"unwind"
.align 32
sqr_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
push $r_ptr
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov 8*0($a_ptr), %rax
mov $n_ptr, $n0
mov 8*1($a_ptr), @acc[5]
mov $b_org, $n_ptr
mov 8*2($a_ptr), @acc[3]
lea ($a_ptr), $b_ptr
mov 8*3($a_ptr), $hi
mov %rax, @acc[6]
mulq %rax # a[0]*a[0]
mov %rax, @acc[0]
mov @acc[6], %rax
mov %rdx, @acc[1]
call __mulq_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulq_mont_sparse_256,\@abi-omnipotent
.align 32
__mulq_mont_sparse_256:
mulq @acc[5] # a[1]*b[0]
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, @acc[2]
mulq @acc[3] # a[2]*b[0]
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, @acc[3]
mulq $hi # a[3]*b[0]
add %rax, @acc[3]
mov 8($b_ptr), %rax
adc \$0, %rdx
xor @acc[5], @acc[5]
mov %rdx, @acc[4]
___
for (my $i=1; $i<4; $i++) {
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1];
$code.=<<___;
mov @acc[0], $a0
imulq $n0, @acc[0]
################################# Multiply by b[$i]
mov %rax, @acc[6]
mulq 8*0($a_ptr)
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
mov %rdx, $hi
mulq 8*1($a_ptr)
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($a_ptr)
add %rax, @acc[3]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($a_ptr)
add %rax, @acc[4]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[4]
adc %rdx, @acc[5] # can't overflow
xor @acc[6], @acc[6]
################################# reduction
mulq 8*0($n_ptr)
add %rax, $a0 # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, $a0
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add $a0, @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
add %rax, @acc[3]
mov $b_next, %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
add %rdx, @acc[4]
adc \$0, @acc[5]
adc \$0, @acc[6]
___
push(@acc,shift(@acc));
}
$code.=<<___;
imulq $n0, %rax
mov 8(%rsp), $a_ptr # restore $r_ptr
################################# last reduction
mov %rax, @acc[6]
mulq 8*0($n_ptr)
add %rax, @acc[0] # guaranteed to be zero
mov @acc[6], %rax
adc %rdx, @acc[0]
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[6], %rax
adc \$0, %rdx
add @acc[0], @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
add %rax, @acc[2]
mov @acc[6], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
mov @acc[2], $b_ptr
add $hi, @acc[3]
adc \$0, %rdx
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add %rdx, @acc[4]
adc \$0, @acc[5]
#################################
# Branch-less conditional subtraction of modulus
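# A hedged C view of what follows (illustrative only):
#   tmp = acc - modulus;          /* borrow lands in CF        */
#   acc = borrow ? acc : tmp;     /* selected with cmovc below */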
mov @acc[3], @acc[0]
sub 8*0($n_ptr), @acc[1]
sbb 8*1($n_ptr), @acc[2]
sbb 8*2($n_ptr), @acc[3]
mov @acc[4], $hi
sbb 8*3($n_ptr), @acc[4]
sbb \$0, @acc[5]
cmovc %rax, @acc[1]
cmovc $b_ptr, @acc[2]
cmovc @acc[0], @acc[3]
mov @acc[1], 8*0($a_ptr)
cmovc $hi, @acc[4]
mov @acc[2], 8*1($a_ptr)
mov @acc[3], 8*2($a_ptr)
mov @acc[4], 8*3($a_ptr)
ret
.cfi_endproc
.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256
___
} }
{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted"
$code.=<<___;
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,\@function,4,"unwind"
.align 32
from_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulq_by_1_mont_256
#################################
# Branch-less conditional acc[0:3] - modulus
#mov @acc[4], %rax # __mulq_by_1_mont_256 does it
mov @acc[5], @acc[1]
mov @acc[6], @acc[2]
mov @acc[0], @acc[3]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[6]
sbb 8*3($n_ptr), @acc[0]
cmovnc @acc[4], %rax
cmovnc @acc[5], @acc[1]
cmovnc @acc[6], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[0], @acc[3]
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,\@function,4,"unwind"
.align 32
redc_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulq_by_1_mont_256
add 8*4($a_ptr), @acc[4] # accumulate upper half
adc 8*5($a_ptr), @acc[5]
mov @acc[4], %rax
adc 8*6($a_ptr), @acc[6]
mov @acc[5], @acc[1]
adc 8*7($a_ptr), @acc[0]
sbb $a_ptr, $a_ptr
#################################
# Branch-less conditional acc[0:4] - modulus
mov @acc[6], @acc[2]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[6]
mov @acc[0], @acc[3]
sbb 8*3($n_ptr), @acc[0]
sbb \$0, $a_ptr
cmovnc @acc[4], %rax
cmovnc @acc[5], @acc[1]
cmovnc @acc[6], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[0], @acc[3]
mov @acc[1], 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size redc_mont_256,.-redc_mont_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulq_by_1_mont_256,\@abi-omnipotent
.align 32
__mulq_by_1_mont_256:
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov %rax, @acc[4]
imulq $n0, %rax
mov %rax, @acc[0]
___
for (my $i=0; $i<4; $i++) {
my $hi = @acc[4];
$code.=<<___;
################################# reduction $i
mulq 8*0($n_ptr)
add %rax, @acc[4] # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, @acc[4]
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add @acc[4], @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
___
$code.=<<___ if ($i<3);
mov @acc[1], @acc[5]
imulq $n0, @acc[1]
___
$code.=<<___;
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, @acc[4]
___
push(@acc,shift(@acc));
}
$code.=<<___;
ret
.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256
___
} } }
print $code;
close STDOUT;

2675
blst/asm/mulq_mont_384-x86_64.pl Executable file

File diff suppressed because it is too large

486
blst/asm/mulx_mont_256-x86_64.pl Executable file

@ -0,0 +1,486 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# "Sparse" in subroutine names refers to most significant limb of the
# modulus. Though "sparse" is a bit of misnomer, because limitation is
# just not-all-ones. Or in other words not larger than 2^256-2^192-1.
# In general Montgomery multiplication algorithm can handle one of the
# inputs being non-reduced and capped by 1<<radix_width, 1<<256 in this
# case, rather than the modulus. Whether or not mul_mont_sparse_256, a
# *tailored* implementation of the algorithm, can handle such input is
# circumstantial. For example, in the most general case it depends on
# similar "bit sparsity" of the individual limbs of the second, fully reduced
# multiplicand. If you can't make such an assumption about the limbs, then
# non-reduced value shouldn't be larger than "same old" 2^256-2^192-1.
# This requirement can be met by conditionally subtracting "bitwise
# left-aligned" modulus. For example, if modulus is 200 bits wide, you
# would need to conditionally subtract the value of modulus<<56. Common
# source of non-reduced values is redc_mont_256 treating 512-bit inputs.
# Well, more specifically ones with upper half not smaller than modulus.
# Just in case, why a limitation at all and not general-purpose 256-bit
# subroutines? Unlike the 384-bit case, accounting for the additional carry
# has a disproportionate impact on performance, especially in the adcx/adox
# implementation.
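#
# As a hedged illustration of the conditional subtraction mentioned above
# (not part of this module; the helper name and signature are made up, the
# limb_t/llimb_t/LIMB_BITS conventions follow the C reference comments used
# elsewhere in these modules):
#
#   /* mod_aligned is the modulus shifted left so that its top bit occupies
#    * bit 255, e.g. modulus<<56 for a 200-bit modulus. */
#   static void cap_non_reduced_256(limb_t val[4], const limb_t mod_aligned[4])
#   {
#       limb_t tmp[4], borrow = 0, mask;
#       size_t i;
#
#       for (i = 0; i < 4; i++) {
#           llimb_t t = (llimb_t)val[i] - mod_aligned[i] - borrow;
#           tmp[i] = (limb_t)t;
#           borrow = (limb_t)(t >> LIMB_BITS) & 1;
#       }
#       mask = borrow - 1;          /* all ones if the subtraction held */
#       for (i = 0; i < 4; i++)
#           val[i] = (tmp[i] & mask) | (val[i] & ~mask);
#   }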
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# common argument layout
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$b_ptr = "%rbx";
{ ############################################################## 255 bits
my @acc=map("%r$_",(10..15));
{ ############################################################## mulq
my ($lo,$hi)=("%rbp","%r9");
$code.=<<___;
.text
.globl mulx_mont_sparse_256
.hidden mulx_mont_sparse_256
.type mulx_mont_sparse_256,\@function,5,"unwind"
.align 32
mulx_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8,%rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $b_ptr # evacuate from %rdx
mov 8*0($b_org), %rdx
mov 8*0($a_ptr), @acc[4]
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), $lo
mov 8*3($a_ptr), $hi
lea -128($a_ptr), $a_ptr # control u-op density
lea -128($n_ptr), $n_ptr # control u-op density
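# (the -128 bias turns the 8*n+128 displacements below into 4-byte ones,
#  i.e. it pads instruction length, which is what "control u-op density"
#  appears to refer to)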
mulx @acc[4], %rax, @acc[1] # a[0]*b[0]
call __mulx_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size mulx_mont_sparse_256,.-mulx_mont_sparse_256
.globl sqrx_mont_sparse_256
.hidden sqrx_mont_sparse_256
.type sqrx_mont_sparse_256,\@function,4,"unwind"
.align 32
sqrx_mont_sparse_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8,%rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $a_ptr, $b_ptr
mov $n_ptr, $n0
mov $b_org, $n_ptr
mov 8*0($a_ptr), %rdx
mov 8*1($a_ptr), @acc[5]
mov 8*2($a_ptr), $lo
mov 8*3($a_ptr), $hi
lea -128($b_ptr), $a_ptr # control u-op density
lea -128($n_ptr), $n_ptr # control u-op density
mulx %rdx, %rax, @acc[1] # a[0]*a[0]
call __mulx_mont_sparse_256
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulx_mont_sparse_256,\@abi-omnipotent
.align 32
__mulx_mont_sparse_256:
mulx @acc[5], @acc[5], @acc[2]
mulx $lo, $lo, @acc[3]
add @acc[5], @acc[1]
mulx $hi, $hi, @acc[4]
mov 8($b_ptr), %rdx
adc $lo, @acc[2]
adc $hi, @acc[3]
adc \$0, @acc[4]
___
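# The loop below keeps two independent carry chains alive: adcx accumulates
# through CF while adox accumulates through OF, so the additions of the mulx
# low halves and high halves proceed on separate flags instead of serializing
# on a single carry chain.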
for (my $i=1; $i<4; $i++) {
my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : "%rax";
my $a5 = $i==1 ? @acc[5] : $lo;
$code.=<<___;
mov %rax, @acc[0]
imulq $n0, %rax
################################# Multiply by b[$i]
xor $a5, $a5 # [@acc[5]=0,] cf=0, of=0
mulx 8*0+128($a_ptr), $lo, $hi
adox $lo, @acc[1]
adcx $hi, @acc[2]
mulx 8*1+128($a_ptr), $lo, $hi
adox $lo, @acc[2]
adcx $hi, @acc[3]
mulx 8*2+128($a_ptr), $lo, $hi
adox $lo, @acc[3]
adcx $hi, @acc[4]
mulx 8*3+128($a_ptr), $lo, $hi
mov %rax, %rdx
adox $lo, @acc[4]
adcx @acc[5], $hi # cf=0
adox $hi, @acc[5] # of=0
################################# reduction
mulx 8*0+128($n_ptr), $lo, %rax
adcx $lo, @acc[0] # guaranteed to be zero
adox @acc[1], %rax
mulx 8*1+128($n_ptr), $lo, $hi
adcx $lo, %rax # @acc[1]
adox $hi, @acc[2]
mulx 8*2+128($n_ptr), $lo, $hi
adcx $lo, @acc[2]
adox $hi, @acc[3]
mulx 8*3+128($n_ptr), $lo, $hi
mov $b_next, %rdx
adcx $lo, @acc[3]
adox $hi, @acc[4]
adcx @acc[0], @acc[4]
adox @acc[0], @acc[5]
adcx @acc[0], @acc[5]
adox @acc[0], @acc[0] # acc[5] in next iteration
adc \$0, @acc[0] # cf=0, of=0
___
push(@acc,shift(@acc));
}
$code.=<<___;
imulq $n0, %rdx
################################# last reduction
xor $lo, $lo # cf=0, of=0
mulx 8*0+128($n_ptr), @acc[0], $hi
adcx %rax, @acc[0] # guaranteed to be zero
adox $hi, @acc[1]
mulx 8*1+128($n_ptr), $lo, $hi
adcx $lo, @acc[1]
adox $hi, @acc[2]
mulx 8*2+128($n_ptr), $lo, $hi
adcx $lo, @acc[2]
adox $hi, @acc[3]
mulx 8*3+128($n_ptr), $lo, $hi
mov @acc[1], %rdx
lea 128($n_ptr), $n_ptr
adcx $lo, @acc[3]
adox $hi, @acc[4]
mov @acc[2], %rax
adcx @acc[0], @acc[4]
adox @acc[0], @acc[5]
adc \$0, @acc[5]
#################################
# Branch-less conditional acc[1:5] - modulus
mov @acc[3], $lo
sub 8*0($n_ptr), @acc[1]
sbb 8*1($n_ptr), @acc[2]
sbb 8*2($n_ptr), @acc[3]
mov @acc[4], $hi
sbb 8*3($n_ptr), @acc[4]
sbb \$0, @acc[5]
cmovc %rdx, @acc[1]
cmovc %rax, @acc[2]
cmovc $lo, @acc[3]
mov @acc[1], 8*0($r_ptr)
cmovc $hi, @acc[4]
mov @acc[2], 8*1($r_ptr)
mov @acc[3], 8*2($r_ptr)
mov @acc[4], 8*3($r_ptr)
ret
.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256
___
} }
{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted"
$code.=<<___;
.globl fromx_mont_256
.hidden fromx_mont_256
.type fromx_mont_256,\@function,4,"unwind"
.align 32
fromx_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulx_by_1_mont_256
#################################
# Branch-less conditional acc[0:3] - modulus
#mov @acc[4], %rax # __mulq_by_1_mont_256 does it
mov @acc[5], %rdx
mov @acc[0], @acc[2]
mov @acc[1], @acc[3]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[0]
sbb 8*3($n_ptr), @acc[1]
cmovnc @acc[4], %rax
cmovnc @acc[5], %rdx
cmovnc @acc[0], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[1], @acc[3]
mov %rdx, 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size fromx_mont_256,.-fromx_mont_256
.globl redcx_mont_256
.hidden redcx_mont_256
.type redcx_mont_256,\@function,4,"unwind"
.align 32
redcx_mont_256:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$8, %rsp
.cfi_adjust_cfa_offset 8
.cfi_end_prologue
mov $b_org, $n_ptr
call __mulx_by_1_mont_256
add 8*4($a_ptr), @acc[4] # accumulate upper half
adc 8*5($a_ptr), @acc[5]
mov @acc[4], %rax
adc 8*6($a_ptr), @acc[0]
mov @acc[5], %rdx
adc 8*7($a_ptr), @acc[1]
sbb $a_ptr, $a_ptr
#################################
# Branch-less conditional acc[0:4] - modulus
mov @acc[0], @acc[2]
sub 8*0($n_ptr), @acc[4]
sbb 8*1($n_ptr), @acc[5]
sbb 8*2($n_ptr), @acc[0]
mov @acc[1], @acc[3]
sbb 8*3($n_ptr), @acc[1]
sbb \$0, $a_ptr
cmovnc @acc[4], %rax
cmovnc @acc[5], %rdx
cmovnc @acc[0], @acc[2]
mov %rax, 8*0($r_ptr)
cmovnc @acc[1], @acc[3]
mov %rdx, 8*1($r_ptr)
mov @acc[2], 8*2($r_ptr)
mov @acc[3], 8*3($r_ptr)
mov 8(%rsp),%r15
.cfi_restore %r15
mov 16(%rsp),%r14
.cfi_restore %r14
mov 24(%rsp),%r13
.cfi_restore %r13
mov 32(%rsp),%r12
.cfi_restore %r12
mov 40(%rsp),%rbx
.cfi_restore %rbx
mov 48(%rsp),%rbp
.cfi_restore %rbp
lea 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.cfi_epilogue
ret
.cfi_endproc
.size redcx_mont_256,.-redcx_mont_256
___
{
my @acc=@acc;
$code.=<<___;
.type __mulx_by_1_mont_256,\@abi-omnipotent
.align 32
__mulx_by_1_mont_256:
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), @acc[1]
mov 8*2($a_ptr), @acc[2]
mov 8*3($a_ptr), @acc[3]
mov %rax, @acc[4]
imulq $n0, %rax
mov %rax, @acc[0]
___
for (my $i=0; $i<4; $i++) {
my $hi = @acc[4];
$code.=<<___;
################################# reduction $i
mulq 8*0($n_ptr)
add %rax, @acc[4] # guaranteed to be zero
mov @acc[0], %rax
adc %rdx, @acc[4]
mulq 8*1($n_ptr)
add %rax, @acc[1]
mov @acc[0], %rax
adc \$0, %rdx
add @acc[4], @acc[1]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*2($n_ptr)
___
$code.=<<___ if ($i<3);
mov @acc[1], @acc[5]
imulq $n0, @acc[1]
___
$code.=<<___;
add %rax, @acc[2]
mov @acc[0], %rax
adc \$0, %rdx
add $hi, @acc[2]
adc \$0, %rdx
mov %rdx, $hi
mulq 8*3($n_ptr)
add %rax, @acc[3]
mov @acc[1], %rax
adc \$0, %rdx
add $hi, @acc[3]
adc \$0, %rdx
mov %rdx, @acc[4]
___
push(@acc,shift(@acc));
}
$code.=<<___;
ret
.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256
___
} } }
print $code;
close STDOUT;

2384
blst/asm/mulx_mont_384-x86_64.pl Executable file

File diff suppressed because it is too large

541
blst/asm/sha256-armv8.pl Executable file

@ -0,0 +1,541 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for ARMv8.
#
# This module is stripped of scalar code paths, with the rationale that all
# known processors are NEON-capable.
#
# See original module at CRYPTOGAMS for further details.
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
$pre="blst_";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
$code.=<<___;
.text
.align 6
.type .LK$BITS,%object
.LK$BITS:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.globl ${pre}sha256_block_armv8
.type ${pre}sha256_block_armv8,%function
.align 6
${pre}sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8
___
}
if ($SZ==4) { ######################################### NEON stuff #
# You'll surely note a lot of similarities with the sha256-armv4 module,
# and of course it's not a coincidence. sha256-armv4 was used as the
# initial template, but was adapted for the ARMv8 instruction set and
# extensively re-tuned for all-round performance.
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
my $Ktbl="x16";
my $Xfer="x17";
my @X = map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
my $j=0;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
&ushr_32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
eval(shift(@insns));
&sli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T4,$T7,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T4,$T7,32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T5,$T7,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T7,$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_u32 ($T3,$T7,32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T6,@X[0],$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T7,@X[0],$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T6,@X[0],32-$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T5,@X[0],$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T7,$T7,$T6);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T5,@X[0],32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl], #16");
eval(shift(@insns));
&eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T5);
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dhi($T5), &Dlo($T7));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
while($#insns>=1) { eval(shift(@insns)); }
&st1_32 ("{$T0}","[$Xfer], #16");
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
&ld1_8 ("{@X[0]}","[$inp],#16");
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl],#16");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&rev32 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&st1_32 ("{$T0}","[$Xfer], #16");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
'&and ($t1,$f,$e)',
'&bic ($t4,$g,$e)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&orr ($t1,$t1,$t4)', # Ch(e,f,g)
'&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ror ($t0,$t0,"#$Sigma1[0]")',
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t0)', # h+=Sigma1(e)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&ror ($t4,$t4,"#$Sigma0[0]")',
'&add ($d,$d,$h)', # d+=h
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
.globl ${pre}sha256_block_data_order
.type ${pre}sha256_block_data_order,%function
.align 4
${pre}sha256_block_data_order:
stp x29, x30, [sp, #-16]!
mov x29, sp
sub sp,sp,#16*4
adr $Ktbl,.LK256
add $num,$inp,$num,lsl#6 // len to point at the end of inp
ld1.8 {@X[0]},[$inp], #16
ld1.8 {@X[1]},[$inp], #16
ld1.8 {@X[2]},[$inp], #16
ld1.8 {@X[3]},[$inp], #16
ld1.32 {$T0},[$Ktbl], #16
ld1.32 {$T1},[$Ktbl], #16
ld1.32 {$T2},[$Ktbl], #16
ld1.32 {$T3},[$Ktbl], #16
rev32 @X[0],@X[0] // yes, even on
rev32 @X[1],@X[1] // big-endian
rev32 @X[2],@X[2]
rev32 @X[3],@X[3]
mov $Xfer,sp
add.32 $T0,$T0,@X[0]
add.32 $T1,$T1,@X[1]
add.32 $T2,$T2,@X[2]
st1.32 {$T0-$T1},[$Xfer], #32
add.32 $T3,$T3,@X[3]
st1.32 {$T2-$T3},[$Xfer]
sub $Xfer,$Xfer,#32
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldp $E,$F,[$ctx,#16]
ldp $G,$H,[$ctx,#24]
ldr $t1,[sp,#0]
mov $t2,wzr
eor $t3,$B,$C
mov $t4,wzr
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
cmp $t1,#0 // check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
cmp $inp,$num
mov $Xfer, #64
csel $Xfer, $Xfer, xzr, eq
sub $inp,$inp,$Xfer // avoid SEGV
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
add $A,$A,$t4 // h+=Sigma0(a) from the past
ldp $t0,$t1,[$ctx,#0]
add $A,$A,$t2 // h+=Maj(a,b,c) from the past
ldp $t2,$t3,[$ctx,#8]
add $A,$A,$t0 // accumulate
add $B,$B,$t1
ldp $t0,$t1,[$ctx,#16]
add $C,$C,$t2
add $D,$D,$t3
ldp $t2,$t3,[$ctx,#24]
add $E,$E,$t0
add $F,$F,$t1
ldr $t1,[sp,#0]
stp $A,$B,[$ctx,#0]
add $G,$G,$t2
mov $t2,wzr
stp $C,$D,[$ctx,#8]
add $H,$H,$t3
stp $E,$F,[$ctx,#16]
eor $t3,$B,$C
stp $G,$H,[$ctx,#24]
mov $t4,wzr
mov $Xfer,sp
b.ne .L_00_48
ldr x29,[x29]
add sp,sp,#16*4+16
ret
.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order
___
}
{
my ($out,$inp,$len) = map("x$_",(0..2));
$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,%function
.align 4
${pre}sha256_emit:
ldp x4,x5,[$inp]
ldp x6,x7,[$inp,#16]
#ifndef __AARCH64EB__
rev x4,x4
rev x5,x5
rev x6,x6
rev x7,x7
#endif
str w4,[$out,#4]
lsr x4,x4,#32
str w5,[$out,#12]
lsr x5,x5,#32
str w6,[$out,#20]
lsr x6,x6,#32
str w7,[$out,#28]
lsr x7,x7,#32
str w4,[$out,#0]
str w5,[$out,#8]
str w6,[$out,#16]
str w7,[$out,#24]
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit
.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,%function
.align 4
${pre}sha256_bcopy:
.Loop_bcopy:
ldrb w3,[$inp],#1
sub $len,$len,#1
strb w3,[$out],#1
cbnz $len,.Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy
.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,%function
.align 4
${pre}sha256_hcopy:
ldp x4,x5,[$inp]
ldp x6,x7,[$inp,#16]
stp x4,x5,[$out]
stp x6,x7,[$out,#16]
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
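# For instance, "sha256h v0.16b,v1.16b,v16.4s" (the first round above) is
# rewritten by the loop below as ".inst 0x5e104020 //sha256h ...", i.e.
# 0x5e004000 with rd=0, rn=1 placed in bits 5-9 and rm=16 in bits 16-20,
# so the module does not depend on assembler support for the SHA-2 mnemonics.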
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
s/\.[ui]?8(\s)/$1/;
s/\.\w?64\b// and s/\.16b/\.2d/g or
s/\.\w?32\b// and s/\.16b/\.4s/g;
m/\bext\b/ and s/\.2d/\.16b/g or
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,337 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for x86_64.
#
# Scalar-only version with a minor twist minimizing 'lea' instructions.
$flavour = shift;
$output = pop;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
$pre="blst_";
$func="${pre}sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$ctx="%rdi"; # 1st arg, zapped by $a3
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$framesz="16*$SZ+3*8";
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
my $STRIDE=$SZ;
# $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
$code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
xor $e,$a0
ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $g,$a2 # f^g
mov $T1,`$SZ*($i&0xf)`(%rsp)
xor $a,$a1
and $e,$a2 # (f^g)&e
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
add $h,$T1 # T1+=h
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
xor $e,$a0
add $a2,$T1 # T1+=Ch(e,f,g)
mov $a,$a2
add `$SZ*$i`($Tbl),$T1 # T1+=K[round]
xor $a,$a1
xor $b,$a2 # a^b, b^c in next round
ror \$$Sigma1[0],$a0 # Sigma1(e)
mov $b,$h
and $a2,$a3
ror \$$Sigma0[0],$a1 # Sigma0(a)
add $a0,$T1 # T1+=Sigma1(e)
xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
add $T1,$d # d+=T1
add $T1,$h # h+=T1
___
$code.=<<___ if ($i==31);
lea `16*$SZ`($Tbl),$Tbl # round+=16
___
$code.=<<___ if ($i<15);
add $a1,$h # h+=Sigma0(a)
___
($a2,$a3) = ($a3,$a2);
}
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
mov $a0,$T1
ror \$`$sigma0[1]-$sigma0[0]`,$a0
add $a1,$a # modulo-scheduled h+=Sigma0(a)
mov $a2,$a1
ror \$`$sigma1[1]-$sigma1[0]`,$a2
xor $T1,$a0
shr \$$sigma0[2],$T1
ror \$$sigma0[0],$a0
xor $a1,$a2
shr \$$sigma1[2],$a1
ror \$$sigma1[0],$a2
xor $a0,$T1 # sigma0(X[(i+1)&0xf])
xor $a1,$a2 # sigma1(X[(i+14)&0xf])
add `$SZ*(($i+9)&0xf)`(%rsp),$T1
add `$SZ*($i&0xf)`(%rsp),$T1
mov $e,$a0
add $a2,$T1
mov $a,$a1
___
&ROUND_00_15(@_);
}
$code=<<___;
.text
.globl $func
.type $func,\@function,3,"unwind"
.align 16
$func:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
.cfi_adjust_cfa_offset $framesz
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
.cfi_end_prologue
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
jmp .Lloop
.align 16
.Lloop:
mov $B,$a3
lea $TABLE(%rip),$Tbl
xor $C,$a3 # magic
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
$code.=" mov @ROT[4],$a0\n";
$code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT));
}
$code.=<<___;
jmp .Lrounds_16_xx
.align 16
.Lrounds_16_xx:
___
for(;$i<32;$i++) {
&ROUND_16_XX($i,@ROT);
unshift(@ROT,pop(@ROT));
}
$code.=<<___;
cmpb \$0x19,`$SZ-1`($Tbl)
jnz .Lrounds_16_xx
mov $_ctx,$ctx
add $a1,$A # modulo-scheduled h+=Sigma0(a)
lea 16*$SZ($inp),$inp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop
lea $framesz+6*8(%rsp),%r11
.cfi_def_cfa %r11,8
mov $framesz(%rsp),%r15
.cfi_restore %r15
mov -40(%r11),%r14
.cfi_restore %r14
mov -32(%r11),%r13
.cfi_restore %r13
mov -24(%r11),%r12
.cfi_restore %r12
mov -16(%r11),%rbp
.cfi_restore %rbp
mov -8(%r11),%rbx
.cfi_restore %rbx
.cfi_epilogue
lea (%r11),%rsp
ret
.cfi_endproc
.size $func,.-$func
.align 64
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") : # Win64 order
("%rdi","%rsi","%rdx"); # Unix order
$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,\@abi-omnipotent
.align 16
${pre}sha256_emit:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
bswap %r8
mov 24($inp), %r11
bswap %r9
mov %r8d, 4($out)
bswap %r10
mov %r9d, 12($out)
bswap %r11
mov %r10d, 20($out)
shr \$32, %r8
mov %r11d, 28($out)
shr \$32, %r9
mov %r8d, 0($out)
shr \$32, %r10
mov %r9d, 8($out)
shr \$32, %r11
mov %r10d, 16($out)
mov %r11d, 24($out)
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit
.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,\@abi-omnipotent
.align 16
${pre}sha256_bcopy:
sub $inp, $out
.Loop_bcopy:
movzb ($inp), %eax
lea 1($inp), $inp
mov %al, -1($out,$inp)
dec $len
jnz .Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy
.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,\@abi-omnipotent
.align 16
${pre}sha256_hcopy:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
mov 24($inp), %r11
mov %r8, 0($out)
mov %r9, 8($out)
mov %r10, 16($out)
mov %r11, 24($out)
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
print $_,"\n";
}
close STDOUT;

789
blst/asm/sha256-x86_64.pl Executable file

@ -0,0 +1,789 @@
#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for x86_64.
#
# This module is stripped of AVX and even scalar code paths, with
# rationale that
#
# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one*
# processor, venerable Sandy Bridge;
# b) AVX2 incurs costly power transitions, which would be justifiable
# if AVX2 code was executing most of the time, which is not the
# case in this context;
# c) all contemporary processors support SSSE3, so that nobody would
# actually use scalar code path anyway;
#
# See original module at CRYPTOGAMS for further details.
$flavour = shift;
$output = pop;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
$pre="blst_";
$func="${pre}sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$ctx="%rdi"; # 1st arg, zapped by $a3
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$framesz="16*$SZ+3*8";
$code=<<___;
.text
.align 64
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___
######################################################################
# SIMD code paths
#
{{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));
$code.=<<___;
.globl ${pre}sha256_block_data_order_shaext
.hidden ${pre}sha256_block_data_order_shaext
.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind"
.align 64
${pre}sha256_block_data_order_shaext:
.cfi_startproc
___
$code.=<<___ if ($win64);
sub \$0x58,%rsp
.cfi_adjust_cfa_offset 0x58
movaps %xmm6,-0x58(%r11)
.cfi_offset %xmm6,-0x60
movaps %xmm7,-0x48(%r11)
.cfi_offset %xmm7,-0x50
movaps %xmm8,-0x38(%r11)
.cfi_offset %xmm8,-0x40
movaps %xmm9,-0x28(%r11)
.cfi_offset %xmm9,-0x30
movaps %xmm10,-0x18(%r11)
.cfi_offset %xmm10,-0x20
.cfi_end_prologue
___
$code.=<<___;
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x100-0x80($Tbl),$TMP # byte swap mask
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
jmp .Loop_shaext
.align 16
.Loop_shaext:
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
pshufb $TMP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
movdqa 0*16-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
pshufb $TMP,@MSG[1]
movdqa $CDGH,$CDGH_SAVE # offload
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
nop
movdqa $ABEF,$ABEF_SAVE # offload
sha256rnds2 $CDGH,$ABEF
movdqa 1*16-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
pshufb $TMP,@MSG[2]
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
lea 0x40($inp),$inp
sha256msg1 @MSG[1],@MSG[0]
sha256rnds2 $CDGH,$ABEF
movdqa 2*16-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
pshufb $TMP,@MSG[3]
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[3],$TMP
palignr \$4,@MSG[2],$TMP
nop
paddd $TMP,@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
sha256rnds2 $CDGH,$ABEF
movdqa 3*16-0x80($Tbl),$Wi
paddd @MSG[3],$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[0],$TMP
palignr \$4,@MSG[3],$TMP
nop
paddd $TMP,@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
movdqa $i*16-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
nop
paddd $TMP,@MSG[2]
sha256msg1 @MSG[0],@MSG[3]
sha256rnds2 $CDGH,$ABEF
___
push(@MSG,shift(@MSG));
}
$code.=<<___;
movdqa 13*16-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
sha256rnds2 $CDGH,$ABEF
paddd $TMP,@MSG[2]
movdqa 14*16-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
sha256msg2 @MSG[1],@MSG[2]
movdqa $BSWAP,$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 15*16-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
nop
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
dec $num
nop
sha256rnds2 $CDGH,$ABEF
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
jnz .Loop_shaext
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movdqu $ABEF,($ctx)
movdqu $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
movaps -0x58(%r11),%xmm6
movaps -0x48(%r11),%xmm7
movaps -0x38(%r11),%xmm8
movaps -0x28(%r11),%xmm9
movaps -0x18(%r11),%xmm10
mov %r11,%rsp
.cfi_def_cfa %r11,8
.cfi_epilogue
___
$code.=<<___;
ret
.cfi_endproc
.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext
___
}}}
{{{
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
my $arg = pop;
$arg = "\$$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
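# Any &mnemonic() call without an explicit sub (the &ror/&mov/&xor/... calls
# in body_00_15 below) lands in AUTOLOAD, which appends one line of assembly
# to $code: arguments are written destination-first on the perl side, emitted
# in reversed (AT&T) operand order, and bare numeric arguments are turned
# into immediates.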
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
'&mov ($a,$a1)',
'&mov ($a4,$f)',
'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
'&xor ($a0,$e)',
'&xor ($a4,$g)', # f^g
'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
'&xor ($a1,$a)',
'&and ($a4,$e)', # (f^g)&e
'&xor ($a0,$e)',
'&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
'&mov ($a2,$a)',
'&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
'&xor ($a2,$b)', # a^b, b^c in next round
'&add ($h,$a4)', # h+=Ch(e,f,g)
'&ror ($a0,$Sigma1[0])', # Sigma1(e)
'&and ($a3,$a2)', # (b^c)&(a^b)
'&xor ($a1,$a)',
'&add ($h,$a0)', # h+=Sigma1(e)
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&ror ($a1,$Sigma0[0])', # Sigma0(a)
'&add ($d,$h)', # d+=h
'&add ($h,$a3)', # h+=Maj(a,b,c)
'&mov ($a0,$d)',
'&add ($a1,$h);'. # h+=Sigma0(a)
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
);
}
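# body_00_15() returns one SHA-256 round as a list of strings, one
# instruction per element. The SIMD code path below eval()s these strings one
# at a time so that scalar round instructions can be interleaved with the
# vector message-schedule instructions; roughly (illustrative pattern only,
# not the exact driver code):
#
#	my @insns = (&body_00_15(),&body_00_15());	# two rounds' worth
#	&palignr	($t0,@X[0],$SZ);		# one vector insn
#	eval(shift(@insns));				# one scalar insn
#	eval(shift(@insns));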
######################################################################
# SSSE3 code path
#
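# The sixteen message words of the current block live in four XMM registers
# (@X[0..3], four words each). Each pass over .Lssse3_00_47 emits sixteen
# rounds while producing the schedule for the following sixteen: sigma0 and
# sigma1 are synthesized from psrld/pslld/psrlq/pxor sequences, since SSSE3
# has no vector rotate, and the X[i]+K[i] sums for the upcoming rounds are
# pre-added with paddd and parked on the stack so the scalar rounds only need
# a single memory add (the "h+=X[i]+K[i]" step in body_00_15 above).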
{
my $Tbl = $inp;
my $_ctx="0(%rbp)";
my $_inp="8(%rbp)";
my $_end="16(%rbp)";
my $framesz=4*8+$win64*16*4+8;
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___;
.globl ${func}
.hidden ${func}
.type ${func},\@function,3,"unwind"
.align 64
${func}:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
.cfi_adjust_cfa_offset $framesz
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
mov $ctx,0(%rsp) # save ctx, 1st arg
#mov $inp,8(%rsp) # save inp, 2nd arg
mov %rdx,16(%rsp) # save end pointer, "3rd" arg
___
$code.=<<___ if ($win64);
movaps %xmm6,0x20(%rsp)
.cfi_offset %xmm6,-0x78
movaps %xmm7,0x30(%rsp)
.cfi_offset %xmm7,-0x68
movaps %xmm8,0x40(%rsp)
.cfi_offset %xmm8,-0x58
movaps %xmm9,0x50(%rsp)
.cfi_offset %xmm9,-0x48
___
$code.=<<___;
mov %rsp,%rbp
.cfi_def_cfa_register %rbp
.cfi_end_prologue
lea -16*$SZ(%rsp),%rsp
mov $SZ*0($ctx),$A
and \$-64,%rsp # align stack
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
$code.=<<___;
#movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4
#movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5
jmp .Lloop_ssse3
.align 16
.Lloop_ssse3:
movdqa $TABLE+`$SZ*$rounds`(%rip),$t3
mov $inp,$_inp # offload $inp
movdqu 0x00($inp),@X[0]
movdqu 0x10($inp),@X[1]
movdqu 0x20($inp),@X[2]
pshufb $t3,@X[0]
movdqu 0x30($inp),@X[3]
lea $TABLE(%rip),$Tbl
pshufb $t3,@X[1]
movdqa 0x00($Tbl),$t0
movdqa 0x10($Tbl),$t1
pshufb $t3,@X[2]
paddd @X[0],$t0
movdqa 0x20($Tbl),$t2
pshufb $t3,@X[3]
movdqa 0x30($Tbl),$t3
paddd @X[1],$t1
paddd @X[2],$t2
paddd @X[3],$t3
movdqa $t0,0x00(%rsp)
mov $A,$a1
movdqa $t1,0x10(%rsp)
mov $B,$a3
movdqa $t2,0x20(%rsp)
xor $C,$a3 # magic
movdqa $t3,0x30(%rsp)
mov $E,$a0
jmp .Lssse3_00_47
.align 16
.Lssse3_00_47:
sub \$`-16*$SZ`,$Tbl # size optimization
___
sub Xupdate_256_SSSE3 () {
(
'&movdqa ($t0,@X[1]);',
'&movdqa ($t3,@X[3])',
'&palignr ($t0,@X[0],$SZ)', # X[1..4]
'&palignr ($t3,@X[2],$SZ);', # X[9..12]
'&movdqa ($t1,$t0)',
'&movdqa ($t2,$t0);',
'&psrld ($t0,$sigma0[2])',
'&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
'&psrld ($t2,$sigma0[0])',
'&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
'&pslld ($t1,8*$SZ-$sigma0[1]);'.
'&pxor ($t0,$t2)',
'&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
'&pxor ($t0,$t1)',
'&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
'&pxor ($t0,$t2);',
'&movdqa ($t2,$t3)',
'&pxor ($t0,$t1);', # sigma0(X[1..4])
'&psrld ($t3,$sigma1[2])',
'&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
'&psrlq ($t2,$sigma1[0])',
'&pxor ($t3,$t2);',
'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
'&pxor ($t3,$t2)',
'&pshufb ($t3,$t4)', # sigma1(X[14..15])
'&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
'&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
'&movdqa ($t2,$t3);',
'&psrld ($t3,$sigma1[2])',
'&psrlq ($t2,$sigma1[0])',
'&pxor ($t3,$t2);',
'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
'&pxor ($t3,$t2);',
'&movdqa ($t2,16*$j."($Tbl)")',
'&pshufb ($t3,$t5)',
'&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
);
}
sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
if (0) {
foreach (Xupdate_256_SSSE3()) { # 36 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
} else { # squeeze extra 4% on Westmere and 19% on Atom
eval(shift(@insns)); #@
&movdqa ($t0,@X[1]);
eval(shift(@insns));
eval(shift(@insns));
&movdqa ($t3,@X[3]);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
&palignr ($t0,@X[0],$SZ); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
&palignr ($t3,@X[2],$SZ); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&movdqa ($t1,$t0);
eval(shift(@insns));
eval(shift(@insns));
&movdqa ($t2,$t0);
eval(shift(@insns)); #@
eval(shift(@insns));
&psrld ($t0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[0],$t3); # X[0..3] += X[9..12]
eval(shift(@insns)); #@
eval(shift(@insns));
&psrld ($t2,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
eval(shift(@insns));
eval(shift(@insns)); #@
&pslld ($t1,8*$SZ-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t0,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&psrld ($t2,$sigma0[1]-$sigma0[0]);
eval(shift(@insns));
&pxor ($t0,$t1);
eval(shift(@insns));
eval(shift(@insns));
&pslld ($t1,$sigma0[1]-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t0,$t2);
eval(shift(@insns));
eval(shift(@insns)); #@
&movdqa ($t2,$t3);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t0,$t1); # sigma0(X[1..4])
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
&psrld ($t3,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns)); #@
eval(shift(@insns));
&psrlq ($t2,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&psrlq ($t2,$sigma1[1]-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
#&pshufb ($t3,$t4); # sigma1(X[14..15])
&pshufd ($t3,$t3,0b10000000);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&psrldq ($t3,8);
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
&paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pshufd ($t3,@X[0],0b01010000); # X[16..17]
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
&movdqa ($t2,$t3);
eval(shift(@insns));
eval(shift(@insns));
&psrld ($t3,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns)); #@
&psrlq ($t2,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
eval(shift(@insns));
&psrlq ($t2,$sigma1[1]-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&pxor ($t3,$t2);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); #@
#&pshufb ($t3,$t5);
&pshufd ($t3,$t3,0b00001000);
eval(shift(@insns));
eval(shift(@insns));
&movdqa ($t2,16*$j."($Tbl)");
eval(shift(@insns)); #@
eval(shift(@insns));
&pslldq ($t3,8);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns)); #@
eval(shift(@insns));
eval(shift(@insns));
}
&paddd ($t2,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&movdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
&SSSE3_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
&jne (".Lssse3_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
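# The .Lssse3_00_47 loop above covers the rounds for which fresh schedule
# words still have to be produced; the sixteen rounds emitted here consume
# the X[i]+K[i] values already sitting on the stack, so no further vector
# work is needed before the state words are folded back into the context.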
$code.=<<___;
mov $_ctx,$ctx
mov $a1,$A
mov $_inp,$inp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
lea 16*$SZ($inp),$inp
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop_ssse3
xorps %xmm0, %xmm0
lea $framesz+6*8(%rbp),%r11
.cfi_def_cfa %r11,8
movaps %xmm0, 0x00(%rsp) # scrub the stack
movaps %xmm0, 0x10(%rsp)
movaps %xmm0, 0x20(%rsp)
movaps %xmm0, 0x30(%rsp)
___
$code.=<<___ if ($win64);
movaps 0x20(%rbp),%xmm6
movaps 0x30(%rbp),%xmm7
movaps 0x40(%rbp),%xmm8
movaps 0x50(%rbp),%xmm9
___
$code.=<<___;
mov $framesz(%rbp),%r15
.cfi_restore %r15
mov -40(%r11),%r14
.cfi_restore %r14
mov -32(%r11),%r13
.cfi_restore %r13
mov -24(%r11),%r12
.cfi_restore %r12
mov -16(%r11),%rbx
.cfi_restore %rbx
mov -8(%r11),%rbp
.cfi_restore %rbp
.cfi_epilogue
lea (%r11),%rsp
ret
.cfi_endproc
.size ${func},.-${func}
___
}
}}}
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") : # Win64 order
("%rdi","%rsi","%rdx"); # Unix order
$code.=<<___;
.globl ${pre}sha256_emit
.hidden ${pre}sha256_emit
.type ${pre}sha256_emit,\@abi-omnipotent
.align 16
${pre}sha256_emit:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
bswap %r8
mov 24($inp), %r11
bswap %r9
mov %r8d, 4($out)
bswap %r10
mov %r9d, 12($out)
bswap %r11
mov %r10d, 20($out)
shr \$32, %r8
mov %r11d, 28($out)
shr \$32, %r9
mov %r8d, 0($out)
shr \$32, %r10
mov %r9d, 8($out)
shr \$32, %r11
mov %r10d, 16($out)
mov %r11d, 24($out)
ret
.size ${pre}sha256_emit,.-${pre}sha256_emit
.globl ${pre}sha256_bcopy
.hidden ${pre}sha256_bcopy
.type ${pre}sha256_bcopy,\@abi-omnipotent
.align 16
${pre}sha256_bcopy:
sub $inp, $out
.Loop_bcopy:
movzb ($inp), %eax
lea 1($inp), $inp
mov %al, -1($out,$inp)
dec $len
jnz .Loop_bcopy
ret
.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy
.globl ${pre}sha256_hcopy
.hidden ${pre}sha256_hcopy
.type ${pre}sha256_hcopy,\@abi-omnipotent
.align 16
${pre}sha256_hcopy:
mov 0($inp), %r8
mov 8($inp), %r9
mov 16($inp), %r10
mov 24($inp), %r11
mov %r8, 0($out)
mov %r9, 8($out)
mov %r10, 16($out)
mov %r11, 24($out)
ret
.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}
sub sha256op38 {
my $instr = shift;
my %opcodelet = (
"sha256rnds2" => 0xcb,
"sha256msg1" => 0xcc,
"sha256msg2" => 0xcd );
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
my @opcode=(0x0f,0x38);
push @opcode,$opcodelet{$instr};
push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
} else {
	return $instr."\t".$_[0];
}
}
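# sha256op38() rewrites the three SHA-NI mnemonics into explicit ".byte"
# sequences (0x0f,0x38 followed by the opcode byte and a register-register
# ModR/M byte) so the output assembles even with toolchains that predate the
# SHA extensions; anything that does not match the %xmm0-%xmm7,%xmm0-%xmm7
# form is passed through verbatim.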
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
print $_,"\n";
}
close STDOUT;
