1431 lines
28 KiB
Perl
Executable file
1431 lines
28 KiB
Perl
Executable file
#!/usr/bin/env perl
|
|
#
|
|
# Copyright Supranational LLC
|
|
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
$flavour = shift;
|
|
$output = shift;
|
|
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
|
|
|
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
|
die "can't locate x86_64-xlate.pl";
|
|
|
|
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
|
|
or die "can't call $xlate: $!";
|
|
|
|
# common argument layout
|
|
($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
|
|
$b_ptr = "%rbx";
|
|
|
|
{ ############################################################## 384 bits add
|
|
my @acc=map("%r$_",(8..15, "ax", "bx", "bp"));
|
|
push(@acc, $a_ptr);
|
|
|
|
$code.=<<___;
|
|
.text
|
|
|
|
.globl add_mod_384
|
|
.hidden add_mod_384
|
|
.type add_mod_384,\@function,4,"unwind"
|
|
.align 32
|
|
add_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
sub \$8, %rsp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
call __add_mod_384
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size add_mod_384,.-add_mod_384
|
|
|
|
.type __add_mod_384,\@abi-omnipotent
|
|
.align 32
|
|
__add_mod_384:
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
|
|
__add_mod_384_a_is_loaded:
|
|
add 8*0($b_org), @acc[0]
|
|
adc 8*1($b_org), @acc[1]
|
|
adc 8*2($b_org), @acc[2]
|
|
mov @acc[0], @acc[6]
|
|
adc 8*3($b_org), @acc[3]
|
|
mov @acc[1], @acc[7]
|
|
adc 8*4($b_org), @acc[4]
|
|
mov @acc[2], @acc[8]
|
|
adc 8*5($b_org), @acc[5]
|
|
mov @acc[3], @acc[9]
|
|
sbb $b_org, $b_org
|
|
|
|
sub 8*0($n_ptr), @acc[0]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
mov @acc[4], @acc[10]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
mov @acc[5], @acc[11]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, $b_org
|
|
|
|
cmovc @acc[6], @acc[0]
|
|
cmovc @acc[7], @acc[1]
|
|
cmovc @acc[8], @acc[2]
|
|
mov @acc[0], 8*0($r_ptr)
|
|
cmovc @acc[9], @acc[3]
|
|
mov @acc[1], 8*1($r_ptr)
|
|
cmovc @acc[10], @acc[4]
|
|
mov @acc[2], 8*2($r_ptr)
|
|
cmovc @acc[11], @acc[5]
|
|
mov @acc[3], 8*3($r_ptr)
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
ret
|
|
.size __add_mod_384,.-__add_mod_384
|
|
|
|
.globl add_mod_384x
|
|
.hidden add_mod_384x
|
|
.type add_mod_384x,\@function,4,"unwind"
|
|
.align 32
|
|
add_mod_384x:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
sub \$24, %rsp
|
|
.cfi_adjust_cfa_offset 24
|
|
.cfi_end_prologue
|
|
|
|
mov $a_ptr, 8*0(%rsp)
|
|
mov $b_org, 8*1(%rsp)
|
|
lea 48($a_ptr), $a_ptr # a->im
|
|
lea 48($b_org), $b_org # b->im
|
|
lea 48($r_ptr), $r_ptr # ret->im
|
|
call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod);
|
|
|
|
mov 8*0(%rsp), $a_ptr # a->re
|
|
mov 8*1(%rsp), $b_org # b->re
|
|
lea -48($r_ptr), $r_ptr # ret->re
|
|
call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod);
|
|
|
|
mov 24+8*0(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 24+8*1(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24+8*2(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 24+8*3(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 24+8*4(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 24+8*5(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 24+8*6(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -24-8*6
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size add_mod_384x,.-add_mod_384x
|
|
|
|
########################################################################
|
|
.globl rshift_mod_384
|
|
.hidden rshift_mod_384
|
|
.type rshift_mod_384,\@function,4,"unwind"
|
|
.align 32
|
|
rshift_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $r_ptr
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
|
|
.Loop_rshift_mod_384:
|
|
call __rshift_mod_384
|
|
dec %edx
|
|
jnz .Loop_rshift_mod_384
|
|
|
|
mov @acc[0], 8*0($r_ptr)
|
|
mov @acc[1], 8*1($r_ptr)
|
|
mov @acc[2], 8*2($r_ptr)
|
|
mov @acc[3], 8*3($r_ptr)
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size rshift_mod_384,.-rshift_mod_384
|
|
|
|
.type __rshift_mod_384,\@abi-omnipotent
|
|
.align 32
|
|
__rshift_mod_384:
|
|
mov \$1, @acc[11]
|
|
mov 8*0($n_ptr), @acc[6]
|
|
and @acc[0], @acc[11]
|
|
mov 8*1($n_ptr), @acc[7]
|
|
neg @acc[11]
|
|
mov 8*2($n_ptr), @acc[8]
|
|
and @acc[11], @acc[6]
|
|
mov 8*3($n_ptr), @acc[9]
|
|
and @acc[11], @acc[7]
|
|
mov 8*4($n_ptr), @acc[10]
|
|
and @acc[11], @acc[8]
|
|
and @acc[11], @acc[9]
|
|
and @acc[11], @acc[10]
|
|
and 8*5($n_ptr), @acc[11]
|
|
|
|
add @acc[0], @acc[6]
|
|
adc @acc[1], @acc[7]
|
|
adc @acc[2], @acc[8]
|
|
adc @acc[3], @acc[9]
|
|
adc @acc[4], @acc[10]
|
|
adc @acc[5], @acc[11]
|
|
sbb @acc[5], @acc[5]
|
|
|
|
shr \$1, @acc[6]
|
|
mov @acc[7], @acc[0]
|
|
shr \$1, @acc[7]
|
|
mov @acc[8], @acc[1]
|
|
shr \$1, @acc[8]
|
|
mov @acc[9], @acc[2]
|
|
shr \$1, @acc[9]
|
|
mov @acc[10], @acc[3]
|
|
shr \$1, @acc[10]
|
|
mov @acc[11], @acc[4]
|
|
shr \$1, @acc[11]
|
|
shl \$63, @acc[0]
|
|
shl \$63, @acc[1]
|
|
or @acc[6], @acc[0]
|
|
shl \$63, @acc[2]
|
|
or @acc[7], @acc[1]
|
|
shl \$63, @acc[3]
|
|
or @acc[8], @acc[2]
|
|
shl \$63, @acc[4]
|
|
or @acc[9], @acc[3]
|
|
shl \$63, @acc[5]
|
|
or @acc[10], @acc[4]
|
|
or @acc[11], @acc[5]
|
|
|
|
ret
|
|
.size __rshift_mod_384,.-__rshift_mod_384
|
|
|
|
.globl div_by_2_mod_384
|
|
.hidden div_by_2_mod_384
|
|
.type div_by_2_mod_384,\@function,3,"unwind"
|
|
.align 32
|
|
div_by_2_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $r_ptr
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov $b_org, $n_ptr
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
|
|
call __rshift_mod_384
|
|
|
|
mov @acc[0], 8*0($r_ptr)
|
|
mov @acc[1], 8*1($r_ptr)
|
|
mov @acc[2], 8*2($r_ptr)
|
|
mov @acc[3], 8*3($r_ptr)
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size div_by_2_mod_384,.-div_by_2_mod_384
|
|
|
|
########################################################################
|
|
.globl lshift_mod_384
|
|
.hidden lshift_mod_384
|
|
.type lshift_mod_384,\@function,4,"unwind"
|
|
.align 32
|
|
lshift_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $r_ptr
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
|
|
.Loop_lshift_mod_384:
|
|
add @acc[0], @acc[0]
|
|
adc @acc[1], @acc[1]
|
|
adc @acc[2], @acc[2]
|
|
mov @acc[0], @acc[6]
|
|
adc @acc[3], @acc[3]
|
|
mov @acc[1], @acc[7]
|
|
adc @acc[4], @acc[4]
|
|
mov @acc[2], @acc[8]
|
|
adc @acc[5], @acc[5]
|
|
mov @acc[3], @acc[9]
|
|
sbb $r_ptr, $r_ptr
|
|
|
|
sub 8*0($n_ptr), @acc[0]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
mov @acc[4], @acc[10]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
mov @acc[5], @acc[11]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, $r_ptr
|
|
|
|
mov (%rsp), $r_ptr
|
|
cmovc @acc[6], @acc[0]
|
|
cmovc @acc[7], @acc[1]
|
|
cmovc @acc[8], @acc[2]
|
|
cmovc @acc[9], @acc[3]
|
|
cmovc @acc[10], @acc[4]
|
|
cmovc @acc[11], @acc[5]
|
|
|
|
dec %edx
|
|
jnz .Loop_lshift_mod_384
|
|
|
|
mov @acc[0], 8*0($r_ptr)
|
|
mov @acc[1], 8*1($r_ptr)
|
|
mov @acc[2], 8*2($r_ptr)
|
|
mov @acc[3], 8*3($r_ptr)
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size lshift_mod_384,.-lshift_mod_384
|
|
|
|
.type __lshift_mod_384,\@abi-omnipotent
|
|
.align 32
|
|
__lshift_mod_384:
|
|
add @acc[0], @acc[0]
|
|
adc @acc[1], @acc[1]
|
|
adc @acc[2], @acc[2]
|
|
mov @acc[0], @acc[6]
|
|
adc @acc[3], @acc[3]
|
|
mov @acc[1], @acc[7]
|
|
adc @acc[4], @acc[4]
|
|
mov @acc[2], @acc[8]
|
|
adc @acc[5], @acc[5]
|
|
mov @acc[3], @acc[9]
|
|
sbb $b_org, $b_org
|
|
|
|
sub 8*0($n_ptr), @acc[0]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
mov @acc[4], @acc[10]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
mov @acc[5], @acc[11]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, $b_org
|
|
|
|
cmovc @acc[6], @acc[0]
|
|
cmovc @acc[7], @acc[1]
|
|
cmovc @acc[8], @acc[2]
|
|
cmovc @acc[9], @acc[3]
|
|
cmovc @acc[10], @acc[4]
|
|
cmovc @acc[11], @acc[5]
|
|
|
|
ret
|
|
.size __lshift_mod_384,.-__lshift_mod_384
|
|
|
|
########################################################################
|
|
.globl mul_by_3_mod_384
|
|
.hidden mul_by_3_mod_384
|
|
.type mul_by_3_mod_384,\@function,3,"unwind"
|
|
.align 32
|
|
mul_by_3_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $a_ptr
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
mov $b_org, $n_ptr
|
|
|
|
call __lshift_mod_384
|
|
|
|
mov (%rsp), $b_org
|
|
call __add_mod_384_a_is_loaded
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size mul_by_3_mod_384,.-mul_by_3_mod_384
|
|
|
|
.globl mul_by_8_mod_384
|
|
.hidden mul_by_8_mod_384
|
|
.type mul_by_8_mod_384,\@function,3,"unwind"
|
|
.align 32
|
|
mul_by_8_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
sub \$8, %rsp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
mov $b_org, $n_ptr
|
|
|
|
call __lshift_mod_384
|
|
call __lshift_mod_384
|
|
call __lshift_mod_384
|
|
|
|
mov @acc[0], 8*0($r_ptr)
|
|
mov @acc[1], 8*1($r_ptr)
|
|
mov @acc[2], 8*2($r_ptr)
|
|
mov @acc[3], 8*3($r_ptr)
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size mul_by_8_mod_384,.-mul_by_8_mod_384
|
|
|
|
########################################################################
|
|
.globl mul_by_3_mod_384x
|
|
.hidden mul_by_3_mod_384x
|
|
.type mul_by_3_mod_384x,\@function,3,"unwind"
|
|
.align 32
|
|
mul_by_3_mod_384x:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $a_ptr
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
mov $b_org, $n_ptr
|
|
|
|
call __lshift_mod_384
|
|
|
|
mov (%rsp), $b_org
|
|
call __add_mod_384_a_is_loaded
|
|
|
|
mov (%rsp), $a_ptr
|
|
lea 8*6($r_ptr), $r_ptr
|
|
|
|
mov 8*6($a_ptr), @acc[0]
|
|
mov 8*7($a_ptr), @acc[1]
|
|
mov 8*8($a_ptr), @acc[2]
|
|
mov 8*9($a_ptr), @acc[3]
|
|
mov 8*10($a_ptr), @acc[4]
|
|
mov 8*11($a_ptr), @acc[5]
|
|
|
|
call __lshift_mod_384
|
|
|
|
mov \$8*6, $b_org
|
|
add (%rsp), $b_org
|
|
call __add_mod_384_a_is_loaded
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x
|
|
|
|
.globl mul_by_8_mod_384x
|
|
.hidden mul_by_8_mod_384x
|
|
.type mul_by_8_mod_384x,\@function,3,"unwind"
|
|
.align 32
|
|
mul_by_8_mod_384x:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $a_ptr
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
mov $b_org, $n_ptr
|
|
|
|
call __lshift_mod_384
|
|
call __lshift_mod_384
|
|
call __lshift_mod_384
|
|
|
|
mov (%rsp), $a_ptr
|
|
mov @acc[0], 8*0($r_ptr)
|
|
mov @acc[1], 8*1($r_ptr)
|
|
mov @acc[2], 8*2($r_ptr)
|
|
mov @acc[3], 8*3($r_ptr)
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
mov 48+8*0($a_ptr), @acc[0]
|
|
mov 48+8*1($a_ptr), @acc[1]
|
|
mov 48+8*2($a_ptr), @acc[2]
|
|
mov 48+8*3($a_ptr), @acc[3]
|
|
mov 48+8*4($a_ptr), @acc[4]
|
|
mov 48+8*5($a_ptr), @acc[5]
|
|
|
|
call __lshift_mod_384
|
|
call __lshift_mod_384
|
|
call __lshift_mod_384
|
|
|
|
mov @acc[0], 48+8*0($r_ptr)
|
|
mov @acc[1], 48+8*1($r_ptr)
|
|
mov @acc[2], 48+8*2($r_ptr)
|
|
mov @acc[3], 48+8*3($r_ptr)
|
|
mov @acc[4], 48+8*4($r_ptr)
|
|
mov @acc[5], 48+8*5($r_ptr)
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x
|
|
|
|
########################################################################
|
|
.globl cneg_mod_384
|
|
.hidden cneg_mod_384
|
|
.type cneg_mod_384,\@function,4,"unwind"
|
|
.align 32
|
|
cneg_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
push $b_org # condition flag
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), $b_org # load a[0:5]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov $b_org, @acc[0]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
or @acc[1], $b_org
|
|
mov 8*4($a_ptr), @acc[4]
|
|
or @acc[2], $b_org
|
|
mov 8*5($a_ptr), @acc[5]
|
|
or @acc[3], $b_org
|
|
mov \$-1, @acc[11]
|
|
or @acc[4], $b_org
|
|
or @acc[5], $b_org
|
|
|
|
mov 8*0($n_ptr), @acc[6] # load n[0:5]
|
|
cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0
|
|
mov 8*1($n_ptr), @acc[7]
|
|
mov 8*2($n_ptr), @acc[8]
|
|
and $b_org, @acc[6] # n[0:5] &= mask
|
|
mov 8*3($n_ptr), @acc[9]
|
|
and $b_org, @acc[7]
|
|
mov 8*4($n_ptr), @acc[10]
|
|
and $b_org, @acc[8]
|
|
mov 8*5($n_ptr), @acc[11]
|
|
and $b_org, @acc[9]
|
|
mov 0(%rsp), $n_ptr # restore condition flag
|
|
and $b_org, @acc[10]
|
|
and $b_org, @acc[11]
|
|
|
|
sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0
|
|
sbb @acc[1], @acc[7]
|
|
sbb @acc[2], @acc[8]
|
|
sbb @acc[3], @acc[9]
|
|
sbb @acc[4], @acc[10]
|
|
sbb @acc[5], @acc[11]
|
|
|
|
or $n_ptr, $n_ptr # check condition flag
|
|
|
|
cmovz @acc[0], @acc[6] # flag ? n[0:5]-a[0:5] : a[0:5]
|
|
cmovz @acc[1], @acc[7]
|
|
cmovz @acc[2], @acc[8]
|
|
mov @acc[6], 8*0($r_ptr)
|
|
cmovz @acc[3], @acc[9]
|
|
mov @acc[7], 8*1($r_ptr)
|
|
cmovz @acc[4], @acc[10]
|
|
mov @acc[8], 8*2($r_ptr)
|
|
cmovz @acc[5], @acc[11]
|
|
mov @acc[9], 8*3($r_ptr)
|
|
mov @acc[10], 8*4($r_ptr)
|
|
mov @acc[11], 8*5($r_ptr)
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size cneg_mod_384,.-cneg_mod_384
|
|
|
|
########################################################################
|
|
.globl sub_mod_384
|
|
.hidden sub_mod_384
|
|
.type sub_mod_384,\@function,4,"unwind"
|
|
.align 32
|
|
sub_mod_384:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
sub \$8, %rsp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
call __sub_mod_384
|
|
|
|
mov 8(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 16(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 32(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 40(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 48(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size sub_mod_384,.-sub_mod_384
|
|
|
|
.type __sub_mod_384,\@abi-omnipotent
|
|
.align 32
|
|
__sub_mod_384:
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
|
|
sub 8*0($b_org), @acc[0]
|
|
mov 8*0($n_ptr), @acc[6]
|
|
sbb 8*1($b_org), @acc[1]
|
|
mov 8*1($n_ptr), @acc[7]
|
|
sbb 8*2($b_org), @acc[2]
|
|
mov 8*2($n_ptr), @acc[8]
|
|
sbb 8*3($b_org), @acc[3]
|
|
mov 8*3($n_ptr), @acc[9]
|
|
sbb 8*4($b_org), @acc[4]
|
|
mov 8*4($n_ptr), @acc[10]
|
|
sbb 8*5($b_org), @acc[5]
|
|
mov 8*5($n_ptr), @acc[11]
|
|
sbb $b_org, $b_org
|
|
|
|
and $b_org, @acc[6]
|
|
and $b_org, @acc[7]
|
|
and $b_org, @acc[8]
|
|
and $b_org, @acc[9]
|
|
and $b_org, @acc[10]
|
|
and $b_org, @acc[11]
|
|
|
|
add @acc[6], @acc[0]
|
|
adc @acc[7], @acc[1]
|
|
mov @acc[0], 8*0($r_ptr)
|
|
adc @acc[8], @acc[2]
|
|
mov @acc[1], 8*1($r_ptr)
|
|
adc @acc[9], @acc[3]
|
|
mov @acc[2], 8*2($r_ptr)
|
|
adc @acc[10], @acc[4]
|
|
mov @acc[3], 8*3($r_ptr)
|
|
adc @acc[11], @acc[5]
|
|
mov @acc[4], 8*4($r_ptr)
|
|
mov @acc[5], 8*5($r_ptr)
|
|
|
|
ret
|
|
.size __sub_mod_384,.-__sub_mod_384
|
|
|
|
.globl sub_mod_384x
|
|
.hidden sub_mod_384x
|
|
.type sub_mod_384x,\@function,4,"unwind"
|
|
.align 32
|
|
sub_mod_384x:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
sub \$24, %rsp
|
|
.cfi_adjust_cfa_offset 24
|
|
.cfi_end_prologue
|
|
|
|
mov $a_ptr, 8*0(%rsp)
|
|
mov $b_org, 8*1(%rsp)
|
|
lea 48($a_ptr), $a_ptr # a->im
|
|
lea 48($b_org), $b_org # b->im
|
|
lea 48($r_ptr), $r_ptr # ret->im
|
|
call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod);
|
|
|
|
mov 8*0(%rsp), $a_ptr # a->re
|
|
mov 8*1(%rsp), $b_org # b->re
|
|
lea -48($r_ptr), $r_ptr # ret->re
|
|
call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod);
|
|
|
|
mov 24+8*0(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 24+8*1(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 24+8*2(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 24+8*3(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 24+8*4(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 24+8*5(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 24+8*6(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -24-8*6
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size sub_mod_384x,.-sub_mod_384x
|
|
___
|
|
}
|
|
{ ###################################################### ret = a * (1 + i)
|
|
my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx");
|
|
my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp"));
|
|
|
|
$code.=<<___;
|
|
.globl mul_by_1_plus_i_mod_384x
|
|
.hidden mul_by_1_plus_i_mod_384x
|
|
.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind"
|
|
.align 32
|
|
mul_by_1_plus_i_mod_384x:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
push %r12
|
|
.cfi_push %r12
|
|
push %r13
|
|
.cfi_push %r13
|
|
push %r14
|
|
.cfi_push %r14
|
|
push %r15
|
|
.cfi_push %r15
|
|
sub \$56, %rsp
|
|
.cfi_adjust_cfa_offset 56
|
|
.cfi_end_prologue
|
|
|
|
mov 8*0($a_ptr), @acc[0]
|
|
mov 8*1($a_ptr), @acc[1]
|
|
mov 8*2($a_ptr), @acc[2]
|
|
mov 8*3($a_ptr), @acc[3]
|
|
mov 8*4($a_ptr), @acc[4]
|
|
mov 8*5($a_ptr), @acc[5]
|
|
|
|
mov @acc[0], @acc[6]
|
|
add 8*6($a_ptr), @acc[0] # a->re + a->im
|
|
mov @acc[1], @acc[7]
|
|
adc 8*7($a_ptr), @acc[1]
|
|
mov @acc[2], @acc[8]
|
|
adc 8*8($a_ptr), @acc[2]
|
|
mov @acc[3], @acc[9]
|
|
adc 8*9($a_ptr), @acc[3]
|
|
mov @acc[4], @acc[10]
|
|
adc 8*10($a_ptr), @acc[4]
|
|
mov @acc[5], @acc[11]
|
|
adc 8*11($a_ptr), @acc[5]
|
|
mov $r_ptr, 8*6(%rsp) # offload r_ptr
|
|
sbb $r_ptr, $r_ptr
|
|
|
|
sub 8*6($a_ptr), @acc[6] # a->re - a->im
|
|
sbb 8*7($a_ptr), @acc[7]
|
|
sbb 8*8($a_ptr), @acc[8]
|
|
sbb 8*9($a_ptr), @acc[9]
|
|
sbb 8*10($a_ptr), @acc[10]
|
|
sbb 8*11($a_ptr), @acc[11]
|
|
sbb $a_ptr, $a_ptr
|
|
|
|
mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry]
|
|
mov 8*0($n_ptr), @acc[0]
|
|
mov @acc[1], 8*1(%rsp)
|
|
mov 8*1($n_ptr), @acc[1]
|
|
mov @acc[2], 8*2(%rsp)
|
|
mov 8*2($n_ptr), @acc[2]
|
|
mov @acc[3], 8*3(%rsp)
|
|
mov 8*3($n_ptr), @acc[3]
|
|
mov @acc[4], 8*4(%rsp)
|
|
and $a_ptr, @acc[0]
|
|
mov 8*4($n_ptr), @acc[4]
|
|
mov @acc[5], 8*5(%rsp)
|
|
and $a_ptr, @acc[1]
|
|
mov 8*5($n_ptr), @acc[5]
|
|
and $a_ptr, @acc[2]
|
|
and $a_ptr, @acc[3]
|
|
and $a_ptr, @acc[4]
|
|
and $a_ptr, @acc[5]
|
|
mov 8*6(%rsp), $a_ptr # restore r_ptr
|
|
|
|
add @acc[0], @acc[6]
|
|
mov 8*0(%rsp), @acc[0] # restore a->re + a->im
|
|
adc @acc[1], @acc[7]
|
|
mov 8*1(%rsp), @acc[1]
|
|
adc @acc[2], @acc[8]
|
|
mov 8*2(%rsp), @acc[2]
|
|
adc @acc[3], @acc[9]
|
|
mov 8*3(%rsp), @acc[3]
|
|
adc @acc[4], @acc[10]
|
|
mov 8*4(%rsp), @acc[4]
|
|
adc @acc[5], @acc[11]
|
|
mov 8*5(%rsp), @acc[5]
|
|
|
|
mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im
|
|
mov @acc[0], @acc[6]
|
|
mov @acc[7], 8*1($a_ptr)
|
|
mov @acc[8], 8*2($a_ptr)
|
|
mov @acc[1], @acc[7]
|
|
mov @acc[9], 8*3($a_ptr)
|
|
mov @acc[10], 8*4($a_ptr)
|
|
mov @acc[2], @acc[8]
|
|
mov @acc[11], 8*5($a_ptr)
|
|
|
|
sub 8*0($n_ptr), @acc[0]
|
|
mov @acc[3], @acc[9]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
mov @acc[4], @acc[10]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
mov @acc[5], @acc[11]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, $r_ptr
|
|
|
|
cmovc @acc[6], @acc[0]
|
|
cmovc @acc[7], @acc[1]
|
|
cmovc @acc[8], @acc[2]
|
|
mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im
|
|
cmovc @acc[9], @acc[3]
|
|
mov @acc[1], 8*7($a_ptr)
|
|
cmovc @acc[10], @acc[4]
|
|
mov @acc[2], 8*8($a_ptr)
|
|
cmovc @acc[11], @acc[5]
|
|
mov @acc[3], 8*9($a_ptr)
|
|
mov @acc[4], 8*10($a_ptr)
|
|
mov @acc[5], 8*11($a_ptr)
|
|
|
|
mov 56+8*0(%rsp),%r15
|
|
.cfi_restore %r15
|
|
mov 56+8*1(%rsp),%r14
|
|
.cfi_restore %r14
|
|
mov 56+8*2(%rsp),%r13
|
|
.cfi_restore %r13
|
|
mov 56+8*3(%rsp),%r12
|
|
.cfi_restore %r12
|
|
mov 56+8*4(%rsp),%rbx
|
|
.cfi_restore %rbx
|
|
mov 56+8*5(%rsp),%rbp
|
|
.cfi_restore %rbp
|
|
lea 56+8*6(%rsp),%rsp
|
|
.cfi_adjust_cfa_offset -56-8*6
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
|
|
___
|
|
}
|
|
{ ######################################################
|
|
my ($r_ptr,$n_ptr) = ("%rdi","%rsi");
|
|
my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp"));
|
|
|
|
$code.=<<___;
|
|
.globl sgn0_pty_mod_384
|
|
.hidden sgn0_pty_mod_384
|
|
.type sgn0_pty_mod_384,\@function,2,"unwind"
|
|
.align 32
|
|
sgn0_pty_mod_384:
|
|
.cfi_startproc
|
|
.cfi_end_prologue
|
|
mov 8*0($r_ptr), @acc[0]
|
|
mov 8*1($r_ptr), @acc[1]
|
|
mov 8*2($r_ptr), @acc[2]
|
|
mov 8*3($r_ptr), @acc[3]
|
|
mov 8*4($r_ptr), @acc[4]
|
|
mov 8*5($r_ptr), @acc[5]
|
|
|
|
xor %rax, %rax
|
|
mov @acc[0], $r_ptr
|
|
add @acc[0], @acc[0]
|
|
adc @acc[1], @acc[1]
|
|
adc @acc[2], @acc[2]
|
|
adc @acc[3], @acc[3]
|
|
adc @acc[4], @acc[4]
|
|
adc @acc[5], @acc[5]
|
|
adc \$0, %rax
|
|
|
|
sub 8*0($n_ptr), @acc[0]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, %rax
|
|
|
|
not %rax # 2*x > p, which means "negative"
|
|
and \$1, $r_ptr
|
|
and \$2, %rax
|
|
or $r_ptr, %rax # pack sign and parity
|
|
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384
|
|
|
|
.globl sgn0_pty_mod_384x
|
|
.hidden sgn0_pty_mod_384x
|
|
.type sgn0_pty_mod_384x,\@function,2,"unwind"
|
|
.align 32
|
|
sgn0_pty_mod_384x:
|
|
.cfi_startproc
|
|
push %rbp
|
|
.cfi_push %rbp
|
|
push %rbx
|
|
.cfi_push %rbx
|
|
sub \$8, %rsp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_end_prologue
|
|
|
|
mov 8*6($r_ptr), @acc[0] # sgn0(a->im)
|
|
mov 8*7($r_ptr), @acc[1]
|
|
mov 8*8($r_ptr), @acc[2]
|
|
mov 8*9($r_ptr), @acc[3]
|
|
mov 8*10($r_ptr), @acc[4]
|
|
mov 8*11($r_ptr), @acc[5]
|
|
|
|
mov @acc[0], @acc[6]
|
|
or @acc[1], @acc[0]
|
|
or @acc[2], @acc[0]
|
|
or @acc[3], @acc[0]
|
|
or @acc[4], @acc[0]
|
|
or @acc[5], @acc[0]
|
|
|
|
lea 0($r_ptr), %rax # sgn0(a->re)
|
|
xor $r_ptr, $r_ptr
|
|
mov @acc[6], @acc[7]
|
|
add @acc[6], @acc[6]
|
|
adc @acc[1], @acc[1]
|
|
adc @acc[2], @acc[2]
|
|
adc @acc[3], @acc[3]
|
|
adc @acc[4], @acc[4]
|
|
adc @acc[5], @acc[5]
|
|
adc \$0, $r_ptr
|
|
|
|
sub 8*0($n_ptr), @acc[6]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, $r_ptr
|
|
|
|
mov @acc[0], 0(%rsp) # a->im is zero or not
|
|
not $r_ptr # 2*x > p, which means "negative"
|
|
and \$1, @acc[7]
|
|
and \$2, $r_ptr
|
|
or @acc[7], $r_ptr # pack sign and parity
|
|
|
|
mov 8*0(%rax), @acc[0]
|
|
mov 8*1(%rax), @acc[1]
|
|
mov 8*2(%rax), @acc[2]
|
|
mov 8*3(%rax), @acc[3]
|
|
mov 8*4(%rax), @acc[4]
|
|
mov 8*5(%rax), @acc[5]
|
|
|
|
mov @acc[0], @acc[6]
|
|
or @acc[1], @acc[0]
|
|
or @acc[2], @acc[0]
|
|
or @acc[3], @acc[0]
|
|
or @acc[4], @acc[0]
|
|
or @acc[5], @acc[0]
|
|
|
|
xor %rax, %rax
|
|
mov @acc[6], @acc[7]
|
|
add @acc[6], @acc[6]
|
|
adc @acc[1], @acc[1]
|
|
adc @acc[2], @acc[2]
|
|
adc @acc[3], @acc[3]
|
|
adc @acc[4], @acc[4]
|
|
adc @acc[5], @acc[5]
|
|
adc \$0, %rax
|
|
|
|
sub 8*0($n_ptr), @acc[6]
|
|
sbb 8*1($n_ptr), @acc[1]
|
|
sbb 8*2($n_ptr), @acc[2]
|
|
sbb 8*3($n_ptr), @acc[3]
|
|
sbb 8*4($n_ptr), @acc[4]
|
|
sbb 8*5($n_ptr), @acc[5]
|
|
sbb \$0, %rax
|
|
|
|
mov 0(%rsp), @acc[6]
|
|
|
|
not %rax # 2*x > p, which means "negative"
|
|
|
|
test @acc[0], @acc[0]
|
|
cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re)
|
|
|
|
test @acc[6], @acc[6]
|
|
cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re)
|
|
|
|
and \$1, @acc[7]
|
|
and \$2, %rax
|
|
or @acc[7], %rax # pack sign and parity
|
|
|
|
mov 8(%rsp), %rbx
|
|
.cfi_restore %rbx
|
|
mov 16(%rsp), %rbp
|
|
.cfi_restore %rbp
|
|
lea 24(%rsp), %rsp
|
|
.cfi_adjust_cfa_offset -24
|
|
.cfi_epilogue
|
|
ret
|
|
.cfi_endproc
|
|
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
|
|
___
|
|
}
|
|
if (0) {
|
|
my $inp = $win64 ? "%rcx" : "%rdi";
|
|
$code.=<<___;
|
|
.globl nbits_384
|
|
.hidden nbits_384
|
|
.type nbits_384,\@abi-omnipotent
|
|
.align 32
|
|
nbits_384:
|
|
mov 8*5($inp), %r8
|
|
mov 8*4($inp), %r9
|
|
mov 8*3($inp), %r10
|
|
mov 8*2($inp), %r11
|
|
mov \$-1, %rdx
|
|
mov \$127, %eax
|
|
bsr %r8, %r8
|
|
cmovnz %rdx,%r9
|
|
cmovz %rax,%r8
|
|
bsr %r9, %r9
|
|
cmovnz %rdx,%r10
|
|
cmovz %rax,%r9
|
|
xor \$63,%r8
|
|
bsr %r10, %r10
|
|
cmovnz %rdx, %r11
|
|
cmovz %rax, %r10
|
|
xor \$63,%r9
|
|
add %r8, %r9
|
|
mov 8*1($inp), %r8
|
|
bsr %r11, %r11
|
|
cmovnz %rdx, %r8
|
|
cmovz %rax, %r11
|
|
xor \$63, %r10
|
|
add %r9, %r10
|
|
mov 8*0($inp), %r9
|
|
bsr %r8, %r8
|
|
cmovnz %rdx, %r9
|
|
cmovz %rax, %r8
|
|
xor \$63, %r11
|
|
add %r10, %r11
|
|
bsr %r9, %r9
|
|
cmovz %rax, %r9
|
|
xor \$63, %r8
|
|
add %r11, %r8
|
|
xor \$63, %r9
|
|
add %r8, %r9
|
|
mov \$384, %eax
|
|
sub %r9, %rax
|
|
ret
|
|
.size nbits_384,.-nbits_384
|
|
___
|
|
}
|
|
|
|
if (1) {
|
|
my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d")
|
|
: ("%rdi", "%rsi", "%rdx", "%ecx");
|
|
|
|
sub vec_select {
|
|
my $sz = shift;
|
|
my $half = $sz/2;
|
|
my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3));
|
|
|
|
$code.=<<___;
|
|
.globl vec_select_$sz
|
|
.hidden vec_select_$sz
|
|
.type vec_select_$sz,\@abi-omnipotent
|
|
.align 32
|
|
vec_select_$sz:
|
|
movd $select, %xmm5
|
|
pxor %xmm4,%xmm4
|
|
pshufd \$0,%xmm5,%xmm5 # broadcast
|
|
movdqu ($inp1),$xmm0
|
|
lea $half($inp1),$inp1
|
|
pcmpeqd %xmm4,%xmm5
|
|
movdqu ($inp2),$xmm1
|
|
lea $half($inp2),$inp2
|
|
pcmpeqd %xmm5,%xmm4
|
|
lea $half($out),$out
|
|
___
|
|
for($i=0; $i<$sz-16; $i+=16) {
|
|
$code.=<<___;
|
|
pand %xmm4,$xmm0
|
|
movdqu $i+16-$half($inp1),$xmm2
|
|
pand %xmm5,$xmm1
|
|
movdqu $i+16-$half($inp2),$xmm3
|
|
por $xmm1,$xmm0
|
|
movdqu $xmm0,$i-$half($out)
|
|
___
|
|
($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1);
|
|
}
|
|
$code.=<<___;
|
|
pand %xmm4,$xmm0
|
|
pand %xmm5,$xmm1
|
|
por $xmm1,$xmm0
|
|
movdqu $xmm0,$i-$half($out)
|
|
ret
|
|
.size vec_select_$sz,.-vec_select_$sz
|
|
___
|
|
}
|
|
vec_select(48);
|
|
vec_select(96);
|
|
vec_select(192);
|
|
vec_select(144);
|
|
vec_select(288);
|
|
}
|
|
|
|
{
|
|
my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi");
|
|
|
|
$code.=<<___;
|
|
.globl vec_prefetch
|
|
.hidden vec_prefetch
|
|
.type vec_prefetch,\@abi-omnipotent
|
|
.align 32
|
|
vec_prefetch:
|
|
leaq -1($inp,$end), $end
|
|
mov \$64, %rax
|
|
xor %r8, %r8
|
|
prefetchnta ($inp)
|
|
lea ($inp,%rax), $inp
|
|
cmp $end, $inp
|
|
cmova $end, $inp
|
|
cmova %r8, %rax
|
|
prefetchnta ($inp)
|
|
lea ($inp,%rax), $inp
|
|
cmp $end, $inp
|
|
cmova $end, $inp
|
|
cmova %r8, %rax
|
|
prefetchnta ($inp)
|
|
lea ($inp,%rax), $inp
|
|
cmp $end, $inp
|
|
cmova $end, $inp
|
|
cmova %r8, %rax
|
|
prefetchnta ($inp)
|
|
lea ($inp,%rax), $inp
|
|
cmp $end, $inp
|
|
cmova $end, $inp
|
|
cmova %r8, %rax
|
|
prefetchnta ($inp)
|
|
lea ($inp,%rax), $inp
|
|
cmp $end, $inp
|
|
cmova $end, $inp
|
|
cmova %r8, %rax
|
|
prefetchnta ($inp)
|
|
lea ($inp,%rax), $inp
|
|
cmp $end, $inp
|
|
cmova $end, $inp
|
|
prefetchnta ($inp)
|
|
ret
|
|
.size vec_prefetch,.-vec_prefetch
|
|
___
|
|
}
|
|
print $code;
|
|
close STDOUT;
|