#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# This is a "perlasm" script: it emits x86_64 assembly for 256-bit
# modular arithmetic and pipes it through x86_64-xlate.pl, which
# translates the perlasm dialect into the requested assembler flavour.

# Command-line handling: first argument is the flavour (elf, macosx,
# mingw64, nasm, ...), second is the output file name.  If the first
# argument contains a dot it is actually the output file name and no
# flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is requested either by flavour or by the
# .asm output extension.
$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the translator next to this script, or in the canonical
# ../../perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed below goes through the translator.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# Common argument layout (System V AMD64 order: rdi, rsi, rdx, rcx).
($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx");
$b_ptr = "%rbx";
{ ############################################################## 256 bits add
# Accumulator register pool.  Note that entries alias %rax, %rsi (the
# a-pointer), and callee-saved %rbx/%rbp/%r12 — which is why the
# "unwind"-annotated routines below save and restore those registers.
my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12));

$code.=<<___;
.text

########################################################################
# void add_mod_256(ret, a, b, n): ret = (a + b) mod n, four 64-bit limbs.
.globl	add_mod_256
.hidden	add_mod_256
.type	add_mod_256,\@function,4,"unwind"
.align	32
add_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	sub	\$8, %rsp
.cfi_adjust_cfa_offset	8
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[0]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	8*3($a_ptr), @acc[3]

.Loaded_a_add_mod_256:			# also entered from mul_by_3_mod_256
	add	8*0($b_org), @acc[0]
	adc	8*1($b_org), @acc[1]
	mov	@acc[0], @acc[4]
	adc	8*2($b_org), @acc[2]
	mov	@acc[1], @acc[5]
	adc	8*3($b_org), @acc[3]
	sbb	$b_org, $b_org		# capture the addition carry-out

	mov	@acc[2], @acc[6]
	sub	8*0($n_ptr), @acc[0]	# speculatively subtract the modulus
	sbb	8*1($n_ptr), @acc[1]
	sbb	8*2($n_ptr), @acc[2]
	mov	@acc[3], @acc[7]
	sbb	8*3($n_ptr), @acc[3]
	sbb	\$0, $b_org		# borrow minus the earlier carry

	cmovc	@acc[4], @acc[0]	# on borrow keep the raw sum
	cmovc	@acc[5], @acc[1]
	mov	@acc[0], 8*0($r_ptr)
	cmovc	@acc[6], @acc[2]
	mov	@acc[1], 8*1($r_ptr)
	cmovc	@acc[7], @acc[3]
	mov	@acc[2], 8*2($r_ptr)
	mov	@acc[3], 8*3($r_ptr)

	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	add_mod_256,.-add_mod_256

########################################################################
# void mul_by_3_mod_256(ret, a, n): ret = 3*a mod n, computed as
# (2*a mod n) + a mod n.  Modulus is the 3rd argument here.
.globl	mul_by_3_mod_256
.hidden	mul_by_3_mod_256
.type	mul_by_3_mod_256,\@function,3,"unwind"
.align	32
mul_by_3_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
.cfi_end_prologue

	mov	$b_org,$n_ptr		# move modulus to the expected register
	mov	8*0($a_ptr), @acc[0]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	$a_ptr,$b_org		# the doubled value gets a added back
	mov	8*3($a_ptr), @acc[3]

	call	__lshift_mod_256	# acc = 2*a mod n
	mov	0(%rsp),%r12
.cfi_restore	%r12
	jmp	.Loaded_a_add_mod_256	# tail into add_mod_256: acc + a mod n

	mov	8(%rsp),%rbx		# never reached: the jump above is
.cfi_restore	%rbx			# unconditional and add_mod_256 runs
	mov	16(%rsp),%rbp		# the matching epilogue instead
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	mul_by_3_mod_256,.-mul_by_3_mod_256

########################################################################
# Internal: doubles the value held in the first four accumulators and
# reduces it by the modulus at n_ptr.  Clobbers the remaining
# accumulators, including %r12.
.type	__lshift_mod_256,\@abi-omnipotent
.align	32
__lshift_mod_256:
	add	@acc[0], @acc[0]
	adc	@acc[1], @acc[1]
	mov	@acc[0], @acc[4]
	adc	@acc[2], @acc[2]
	mov	@acc[1], @acc[5]
	adc	@acc[3], @acc[3]
	sbb	@acc[8], @acc[8]	# capture the doubling carry-out

	mov	@acc[2], @acc[6]
	sub	8*0($n_ptr), @acc[0]	# speculatively subtract the modulus
	sbb	8*1($n_ptr), @acc[1]
	sbb	8*2($n_ptr), @acc[2]
	mov	@acc[3], @acc[7]
	sbb	8*3($n_ptr), @acc[3]
	sbb	\$0, @acc[8]

	cmovc	@acc[4], @acc[0]	# on borrow keep the raw doubled value
	cmovc	@acc[5], @acc[1]
	cmovc	@acc[6], @acc[2]
	cmovc	@acc[7], @acc[3]

	ret
.size	__lshift_mod_256,.-__lshift_mod_256

########################################################################
# void lshift_mod_256(ret, a, count, n): ret = a << count mod n,
# reducing after every single-bit shift.
.globl	lshift_mod_256
.hidden	lshift_mod_256
.type	lshift_mod_256,\@function,4,"unwind"
.align	32
lshift_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[0]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	8*3($a_ptr), @acc[3]

.Loop_lshift_mod_256:
	call	__lshift_mod_256
	dec	%edx			# count
	jnz	.Loop_lshift_mod_256

	mov	@acc[0], 8*0($r_ptr)
	mov	@acc[1], 8*1($r_ptr)
	mov	@acc[2], 8*2($r_ptr)
	mov	@acc[3], 8*3($r_ptr)

	mov	0(%rsp),%r12
.cfi_restore	%r12
	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	lshift_mod_256,.-lshift_mod_256

########################################################################
# void rshift_mod_256(ret, a, count, n): count modular halvings.  Odd
# values have the modulus added before the shift so that a bit is not
# lost (assumes an odd modulus, as the parity trick requires).
.globl	rshift_mod_256
.hidden	rshift_mod_256
.type	rshift_mod_256,\@function,4,"unwind"
.align	32
rshift_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	sub	\$8, %rsp
.cfi_adjust_cfa_offset	8
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[7]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	8*3($a_ptr), @acc[3]

.Loop_rshift_mod_256:
	mov	@acc[7], @acc[0]
	and	\$1, @acc[7]		# least significant bit
	mov	8*0($n_ptr), @acc[4]
	neg	@acc[7]			# parity -> 0 or all-ones mask
	mov	8*1($n_ptr), @acc[5]
	mov	8*2($n_ptr), @acc[6]

	and	@acc[7], @acc[4]	# modulus masked by the parity
	and	@acc[7], @acc[5]
	and	@acc[7], @acc[6]
	and	8*3($n_ptr), @acc[7]

	add	@acc[4], @acc[0]	# add modulus if the value was odd
	adc	@acc[5], @acc[1]
	adc	@acc[6], @acc[2]
	adc	@acc[7], @acc[3]
	sbb	@acc[4], @acc[4]	# 257th bit of the sum

	shr	\$1, @acc[0]		# shift right across all limbs
	mov	@acc[1], @acc[7]
	shr	\$1, @acc[1]
	mov	@acc[2], @acc[6]
	shr	\$1, @acc[2]
	mov	@acc[3], @acc[5]
	shr	\$1, @acc[3]

	shl	\$63, @acc[7]		# carry shifted-out bits into the
	shl	\$63, @acc[6]		# limb below
	or	@acc[0], @acc[7]
	shl	\$63, @acc[5]
	or	@acc[6], @acc[1]
	shl	\$63, @acc[4]
	or	@acc[5], @acc[2]
	or	@acc[4], @acc[3]

	dec	%edx			# count
	jnz	.Loop_rshift_mod_256

	mov	@acc[7], 8*0($r_ptr)
	mov	@acc[1], 8*1($r_ptr)
	mov	@acc[2], 8*2($r_ptr)
	mov	@acc[3], 8*3($r_ptr)

	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	rshift_mod_256,.-rshift_mod_256

########################################################################
# void cneg_mod_256(ret, a, flag, n): ret = flag ? n - a : a.  A zero
# input stays zero in either case: the a==0 mask below zeroes the
# modulus before the subtraction.
.globl	cneg_mod_256
.hidden	cneg_mod_256
.type	cneg_mod_256,\@function,4,"unwind"
.align	32
cneg_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[8]	# load a[0:3]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	@acc[8], @acc[0]
	mov	8*3($a_ptr), @acc[3]
	or	@acc[1], @acc[8]
	or	@acc[2], @acc[8]
	or	@acc[3], @acc[8]
	mov	\$-1, @acc[7]

	mov	8*0($n_ptr), @acc[4]	# load n[0:3]
	cmovnz	@acc[7], @acc[8]	# mask = a[0:3] ? -1 : 0
	mov	8*1($n_ptr), @acc[5]
	mov	8*2($n_ptr), @acc[6]
	and	@acc[8], @acc[4]	# n[0:3] &= mask
	mov	8*3($n_ptr), @acc[7]
	and	@acc[8], @acc[5]
	and	@acc[8], @acc[6]
	and	@acc[8], @acc[7]

	sub	@acc[0], @acc[4]	# a[0:3] ? n[0:3]-a[0:3] : 0-0
	sbb	@acc[1], @acc[5]
	sbb	@acc[2], @acc[6]
	sbb	@acc[3], @acc[7]

	or	$b_org, $b_org		# check condition flag

	cmovz	@acc[0], @acc[4]	# flag ? n[0:3]-a[0:3] : a[0:3]
	cmovz	@acc[1], @acc[5]
	mov	@acc[4], 8*0($r_ptr)
	cmovz	@acc[2], @acc[6]
	mov	@acc[5], 8*1($r_ptr)
	cmovz	@acc[3], @acc[7]
	mov	@acc[6], 8*2($r_ptr)
	mov	@acc[7], 8*3($r_ptr)

	mov	0(%rsp),%r12
.cfi_restore	%r12
	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	cneg_mod_256,.-cneg_mod_256

########################################################################
# void sub_mod_256(ret, a, b, n): ret = (a - b) mod n.  The borrow
# mask selects whether the modulus is added back.
.globl	sub_mod_256
.hidden	sub_mod_256
.type	sub_mod_256,\@function,4,"unwind"
.align	32
sub_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	sub	\$8, %rsp
.cfi_adjust_cfa_offset	8
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[0]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	8*3($a_ptr), @acc[3]

	sub	8*0($b_org), @acc[0]
	mov	8*0($n_ptr), @acc[4]
	sbb	8*1($b_org), @acc[1]
	mov	8*1($n_ptr), @acc[5]
	sbb	8*2($b_org), @acc[2]
	mov	8*2($n_ptr), @acc[6]
	sbb	8*3($b_org), @acc[3]
	mov	8*3($n_ptr), @acc[7]
	sbb	$b_org, $b_org		# borrow -> 0 or all-ones mask

	and	$b_org, @acc[4]		# modulus masked by the borrow
	and	$b_org, @acc[5]
	and	$b_org, @acc[6]
	and	$b_org, @acc[7]

	add	@acc[4], @acc[0]	# add modulus back on borrow
	adc	@acc[5], @acc[1]
	mov	@acc[0], 8*0($r_ptr)
	adc	@acc[6], @acc[2]
	mov	@acc[1], 8*1($r_ptr)
	adc	@acc[7], @acc[3]
	mov	@acc[2], 8*2($r_ptr)
	mov	@acc[3], 8*3($r_ptr)

	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	sub_mod_256,.-sub_mod_256

########################################################################
# bool check_mod_256(a, n): returns 1 if a is non-zero and less than
# the modulus, 0 otherwise.  Note the value is the 1st argument and
# the modulus the 2nd.
.globl	check_mod_256
.hidden	check_mod_256
.type	check_mod_256,\@function,2,"unwind"
.align	32
check_mod_256:
.cfi_startproc
	mov	8*0($r_ptr), %rax
	mov	8*1($r_ptr), @acc[1]
	mov	8*2($r_ptr), @acc[2]
	mov	8*3($r_ptr), @acc[3]

	mov	%rax, @acc[0]		# see if it's zero
	or	@acc[1], %rax
	or	@acc[2], %rax
	or	@acc[3], %rax

	sub	8*0($a_ptr), @acc[0]	# does subtracting modulus borrow?
	sbb	8*1($a_ptr), @acc[1]
	sbb	8*2($a_ptr), @acc[2]
	sbb	8*3($a_ptr), @acc[3]
	sbb	$a_ptr, $a_ptr		# borrow -> all-ones mask

	mov	\$1, %rdx
	cmp	\$0, %rax
	cmovne	%rdx, %rax		# rax = value non-zero
	and	$a_ptr, %rax		# ... and below the modulus
.cfi_epilogue
	ret
.cfi_endproc
.size	check_mod_256,.-check_mod_256

########################################################################
# bool add_n_check_mod_256(ret, a, b, n): same as add_mod_256, but
# additionally returns 1 if the result is non-zero, 0 otherwise.
.globl	add_n_check_mod_256
.hidden	add_n_check_mod_256
.type	add_n_check_mod_256,\@function,4,"unwind"
.align	32
add_n_check_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	sub	\$8, %rsp
.cfi_adjust_cfa_offset	8
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[0]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	8*3($a_ptr), @acc[3]

	add	8*0($b_org), @acc[0]
	adc	8*1($b_org), @acc[1]
	mov	@acc[0], @acc[4]
	adc	8*2($b_org), @acc[2]
	mov	@acc[1], @acc[5]
	adc	8*3($b_org), @acc[3]
	sbb	$b_org, $b_org		# capture the addition carry-out

	mov	@acc[2], @acc[6]
	sub	8*0($n_ptr), @acc[0]	# speculatively subtract the modulus
	sbb	8*1($n_ptr), @acc[1]
	sbb	8*2($n_ptr), @acc[2]
	mov	@acc[3], @acc[7]
	sbb	8*3($n_ptr), @acc[3]
	sbb	\$0, $b_org

	cmovc	@acc[4], @acc[0]	# on borrow keep the raw sum
	cmovc	@acc[5], @acc[1]
	mov	@acc[0], 8*0($r_ptr)
	cmovc	@acc[6], @acc[2]
	mov	@acc[1], 8*1($r_ptr)
	cmovc	@acc[7], @acc[3]
	mov	@acc[2], 8*2($r_ptr)
	mov	@acc[3], 8*3($r_ptr)

	or	@acc[1], @acc[0]	# fold the result limbs together
	or	@acc[3], @acc[2]
	or	@acc[2], @acc[0]
	mov	\$1, %rax
	cmovz	@acc[0], %rax		# 0 if the result is zero, else 1

	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	add_n_check_mod_256,.-add_n_check_mod_256

########################################################################
# bool sub_n_check_mod_256(ret, a, b, n): same as sub_mod_256, but
# additionally returns 1 if the result is non-zero, 0 otherwise.
.globl	sub_n_check_mod_256
.hidden	sub_n_check_mod_256
.type	sub_n_check_mod_256,\@function,4,"unwind"
.align	32
sub_n_check_mod_256:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	sub	\$8, %rsp
.cfi_adjust_cfa_offset	8
.cfi_end_prologue

	mov	8*0($a_ptr), @acc[0]
	mov	8*1($a_ptr), @acc[1]
	mov	8*2($a_ptr), @acc[2]
	mov	8*3($a_ptr), @acc[3]

	sub	8*0($b_org), @acc[0]
	mov	8*0($n_ptr), @acc[4]
	sbb	8*1($b_org), @acc[1]
	mov	8*1($n_ptr), @acc[5]
	sbb	8*2($b_org), @acc[2]
	mov	8*2($n_ptr), @acc[6]
	sbb	8*3($b_org), @acc[3]
	mov	8*3($n_ptr), @acc[7]
	sbb	$b_org, $b_org		# borrow -> 0 or all-ones mask

	and	$b_org, @acc[4]		# modulus masked by the borrow
	and	$b_org, @acc[5]
	and	$b_org, @acc[6]
	and	$b_org, @acc[7]

	add	@acc[4], @acc[0]	# add modulus back on borrow
	adc	@acc[5], @acc[1]
	mov	@acc[0], 8*0($r_ptr)
	adc	@acc[6], @acc[2]
	mov	@acc[1], 8*1($r_ptr)
	adc	@acc[7], @acc[3]
	mov	@acc[2], 8*2($r_ptr)
	mov	@acc[3], 8*3($r_ptr)

	or	@acc[1], @acc[0]	# fold the result limbs together
	or	@acc[3], @acc[2]
	or	@acc[2], @acc[0]
	mov	\$1, %rax
	cmovz	@acc[0], %rax		# 0 if the result is zero, else 1

	mov	8(%rsp),%rbx
.cfi_restore	%rbx
	mov	16(%rsp),%rbp
.cfi_restore	%rbp
	lea	24(%rsp),%rsp
.cfi_adjust_cfa_offset	-24
.cfi_epilogue
	ret
.cfi_endproc
.size	sub_n_check_mod_256,.-sub_n_check_mod_256
___
}

# Emit the accumulated assembly through the translator pipe; a failed
# close surfaces translator errors as a non-zero exit.
print $code;
close STDOUT or die "error closing STDOUT: $!";