#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast Euclidean inversion as suggested in
# https://eprint.iacr.org/2020/972. Performance is >5x better than
# modulus-specific FLT addition chain...
#
# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod,
#                                                       const vec384 modx);
#
# The routine is declared with four arguments. The fourth pointer (kept as
# $nx_ptr below) is read as six limbs which are conditionally added to the
# upper half of the result when the final value is negative; compare the
# "left aligned" modulus in the Python reference that follows.
#
# Python reference model of the algorithm implemented by the assembly below.
# It is only accumulated into $python_ref as documentation; nothing in the
# visible part of this script prints or evaluates it.  The comments inside
# name the assembly subroutine that corresponds to each step.  Indentation is
# significant here (it is Python), so keep it intact when editing.
$python_ref.=<<'___';
def ct_inverse_mod_383(inp, mod):
    a, u = inp, 1
    b, v = mod, 0

    k = 62
    w = 64
    mask = (1 << w) - 1

    for i in range(0, 766 // k):
        # __ab_approximation_62
        n = max(a.bit_length(), b.bit_length())
        if n < 128:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_62
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1

        # __smulq_383_n_shift_by_62
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if a < 0:
            a, f0, g0 = -a, -f0, -g0
        if b < 0:
            b, f1, g1 = -b, -f1, -g1

        # __smulq_767x63
        u, v = u*f0 + v*g0, u*f1 + v*g1

    if 766 % k:
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, 766 % k):
            if a & 1:
                if a < b:
                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
                a, f0, g0 = a-b, f0-f1, g0-g1
            a, f1, g1 = a >> 1, f1 << 1, g1 << 1

        v = u*f1 + v*g1

    if v < 0:
        v += mod << (768 - mod.bit_length())    # left aligned

    return v & (2**768 - 1) # to be reduced % mod
___

# Command line: [flavour] [output].  If the first argument contains a dot it
# is taken to be the output file name and no flavour is passed on.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows-style output is selected by an nasm/masm/mingw64 flavour or by an
# output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator,
# which is run with the same perl binary ($^X) as this script.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# Register assignments for ct_inverse_mod_383.  The generated code reads its
# four arguments from %rdi, %rsi, %rdx and %rcx.  Note that $f0/$g0 reuse
# %rdx/%rcx, $cnt is the low half of %rdi, and @acc[10]/@acc[11] are
# $in_ptr/$out_ptr themselves; $out_ptr and $nx_ptr are therefore spilled to
# 8*4(%rsp) and 8*5(%rsp) right after the prologue.
my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr);
my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
my $cnt = "%edi";

# Stack frame: 8*11 bytes of scratch slots at the bottom plus 2*512 bytes
# from which a 512-byte-aligned work area is carved (see the lea/and pair
# below).  Source and destination |a|b|u|v| buffers are 256 bytes apart
# inside that area and are swapped by xor-ing the pointer with 256.
$frame = 8*11+2*512;

# Prologue, input copy, and the first two iterations.  These are written out
# by hand because |u| and |v| are still a single limb (first iteration) or
# two limbs (second iteration) wide.
$code.=<<___;
.text

.globl	ct_inverse_mod_383
.type	ct_inverse_mod_383,\@function,4,"unwind"
.align	32
ct_inverse_mod_383:
.cfi_startproc
	push %rbp
.cfi_push %rbp
	push %rbx
.cfi_push %rbx
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
	sub \$$frame, %rsp
.cfi_adjust_cfa_offset $frame
.cfi_end_prologue

	lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot
	and \$-512, %rax # in the frame...
	mov $out_ptr, 8*4(%rsp)
	mov $nx_ptr, 8*5(%rsp)

	mov 8*0($in_ptr), @acc[0] # load input
	mov 8*1($in_ptr), @acc[1]
	mov 8*2($in_ptr), @acc[2]
	mov 8*3($in_ptr), @acc[3]
	mov 8*4($in_ptr), @acc[4]
	mov 8*5($in_ptr), @acc[5]

	mov 8*0($n_ptr), @acc[6] # load modulus
	mov 8*1($n_ptr), @acc[7]
	mov 8*2($n_ptr), @acc[8]
	mov 8*3($n_ptr), @acc[9]
	mov 8*4($n_ptr), @acc[10]
	mov 8*5($n_ptr), @acc[11]

	mov @acc[0], 8*0(%rax) # copy input to |a|
	mov @acc[1], 8*1(%rax)
	mov @acc[2], 8*2(%rax)
	mov @acc[3], 8*3(%rax)
	mov @acc[4], 8*4(%rax)
	mov @acc[5], 8*5(%rax)

	mov @acc[6], 8*6(%rax) # copy modulus to |b|
	mov @acc[7], 8*7(%rax)
	mov @acc[8], 8*8(%rax)
	mov @acc[9], 8*9(%rax)
	mov @acc[10], 8*10(%rax)
	mov %rax, $in_ptr # pointer to source |a|b|1|0|
	mov @acc[11], 8*11(%rax)

	################################# first iteration
	mov \$62, $cnt
	call __ab_approximation_62
	#mov $f0, 8*7(%rsp)
	#mov $g0, 8*8(%rsp)
	mov $f1, 8*9(%rsp)
	mov $g1, 8*10(%rsp)

	mov \$256, $out_ptr
	xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
	call __smulq_383_n_shift_by_62
	#mov $f0, 8*7(%rsp) # corrected |f0|
	#mov $g0, 8*8(%rsp) # corrected |g0|
	mov $f0, 8*12($out_ptr) # initialize |u| with |f0|

	mov 8*9(%rsp), $f0 # |f1|
	mov 8*10(%rsp), $g0 # |g1|
	lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
	call __smulq_383_n_shift_by_62
	#mov $f0, 8*9(%rsp) # corrected |f1|
	#mov $g0, 8*10(%rsp) # corrected |g1|
	mov $f0, 8*12($out_ptr) # initialize |v| with |f1|

	################################# second iteration
	xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v|
	mov \$62, $cnt
	call __ab_approximation_62
	#mov $f0, 8*7(%rsp)
	#mov $g0, 8*8(%rsp)
	mov $f1, 8*9(%rsp)
	mov $g1, 8*10(%rsp)

	mov \$256, $out_ptr
	xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
	call __smulq_383_n_shift_by_62
	mov $f0, 8*7(%rsp) # corrected |f0|
	mov $g0, 8*8(%rsp) # corrected |g0|

	mov 8*9(%rsp), $f0 # |f1|
	mov 8*10(%rsp), $g0 # |g1|
	lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
	call __smulq_383_n_shift_by_62
	#mov $f0, 8*9(%rsp) # corrected |f1|
	#mov $g0, 8*10(%rsp) # corrected |g1|

	mov 8*12($in_ptr), %rax # |u|
	mov 8*18($in_ptr), @acc[3] # |v|
	mov $f0, %rbx
	mov %rax, @acc[2]
	imulq 8*7(%rsp) # |u|*|f0|
	mov %rax, @acc[0]
	mov @acc[3], %rax
	mov %rdx, @acc[1]
	imulq 8*8(%rsp) # |v|*|g0|
	add %rax, @acc[0]
	adc %rdx, @acc[1]
	mov @acc[0], 8*6($out_ptr) # destination |u|
	mov @acc[1], 8*7($out_ptr)
	sar \$63, @acc[1] # sign extension
	mov @acc[1], 8*8($out_ptr)
	mov @acc[1], 8*9($out_ptr)
	mov @acc[1], 8*10($out_ptr)
	mov @acc[1], 8*11($out_ptr)
	lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor

	mov @acc[2], %rax
	imulq %rbx # |u|*|f1|
	mov %rax, @acc[0]
	mov @acc[3], %rax
	mov %rdx, @acc[1]
	imulq %rcx # |v|*|g1|
	add %rax, @acc[0]
	adc %rdx, @acc[1]
	mov @acc[0], 8*12($out_ptr) # destination |v|
	mov @acc[1], 8*13($out_ptr)
	sar \$63, @acc[1] # sign extension
	mov @acc[1], 8*14($out_ptr)
	mov @acc[1], 8*15($out_ptr)
	mov @acc[1], 8*16($out_ptr)
	mov @acc[1], 8*17($out_ptr)
___
# Iterations 3..11 share one template.  |u| is always updated with the
# 6-limb __smulq_383x63; |v| uses __smulq_383x63 up to and including $i==5
# and the 12-limb __smulq_767x63 afterwards.  At $i==5 the just-computed |v|
# is sign-extended to 12 limbs so that the wider routine can consume it in
# the next iteration.
for($i=2; $i<11; $i++) {
my $smul_767x63 = $i>5 ? "__smulq_767x63"
                       : "__smulq_383x63";
$code.=<<___;
	xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
	mov \$62, $cnt
	call __ab_approximation_62
	#mov $f0, 8*7(%rsp)
	#mov $g0, 8*8(%rsp)
	mov $f1, 8*9(%rsp)
	mov $g1, 8*10(%rsp)

	mov \$256, $out_ptr
	xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
	call __smulq_383_n_shift_by_62
	mov $f0, 8*7(%rsp) # corrected |f0|
	mov $g0, 8*8(%rsp) # corrected |g0|

	mov 8*9(%rsp), $f0 # |f1|
	mov 8*10(%rsp), $g0 # |g1|
	lea 8*6($out_ptr), $out_ptr # pointer to destination |b|
	call __smulq_383_n_shift_by_62
	mov $f0, 8*9(%rsp) # corrected |f1|
	mov $g0, 8*10(%rsp) # corrected |g1|

	mov 8*7(%rsp), $f0 # |f0|
	mov 8*8(%rsp), $g0 # |g0|
	lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
	lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
	call __smulq_383x63

	mov 8*9(%rsp), $f0 # |f1|
	mov 8*10(%rsp), $g0 # |g1|
	lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
	call $smul_767x63
___
$code.=<<___	if ($i==5);
	sar \$63, @acc[5] # sign extension
	mov @acc[5], 8*6($out_ptr)
	mov @acc[5], 8*7($out_ptr)
	mov @acc[5], 8*8($out_ptr)
	mov @acc[5], 8*9($out_ptr)
	mov @acc[5], 8*10($out_ptr)
	mov @acc[5], 8*11($out_ptr)
___
}
# Last two iterations (62 and 766 % 62 == 22 steps), the conditional
# addition of the value behind the fourth argument, and the epilogue.  Both
# iterations call __inner_loop_62 directly on the low limbs of |a| and |b|
# instead of going through __ab_approximation_62.
$code.=<<___;
	################################# iteration before last
	xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
	mov \$62, $cnt
	#call __ab_approximation_62 # |a| and |b| are exact, just load
	mov 8*0($in_ptr), @acc[0] # |a_lo|
	mov 8*1($in_ptr), @acc[1] # |a_hi|
	mov 8*6($in_ptr), @acc[2] # |b_lo|
	mov 8*7($in_ptr), @acc[3] # |b_hi|
	call __inner_loop_62
	#mov $f0, 8*7(%rsp)
	#mov $g0, 8*8(%rsp)
	mov $f1, 8*9(%rsp)
	mov $g1, 8*10(%rsp)

	mov \$256, $out_ptr
	xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v|
	mov @acc[0], 8*0($out_ptr)
	mov @acc[2], 8*6($out_ptr)

	#mov 8*7(%rsp), $f0 # |f0|
	#mov 8*8(%rsp), $g0 # |g0|
	lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
	lea 8*12($out_ptr),$out_ptr # pointer to destination |u|
	call __smulq_383x63

	mov 8*9(%rsp), $f0 # |f1|
	mov 8*10(%rsp), $g0 # |g1|
	lea 8*6($out_ptr),$out_ptr # pointer to destination |v|
	call __smulq_767x63

	################################# last iteration
	xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v|
	mov \$22, $cnt # 766 % 62
	#call __ab_approximation_62 # |a| and |b| are exact, just load
	mov 8*0($in_ptr), @acc[0] # |a_lo|
	xor @acc[1], @acc[1] # |a_hi|
	mov 8*6($in_ptr), @acc[2] # |b_lo|
	xor @acc[3], @acc[3] # |b_hi|
	call __inner_loop_62
	#mov $f0, 8*7(%rsp)
	#mov $g0, 8*8(%rsp)
	#mov $f1, 8*9(%rsp)
	#mov $g1, 8*10(%rsp)

	#mov 8*7(%rsp), $f0 # |f0|
	#mov 8*8(%rsp), $g0 # |g0|
	lea 8*12($in_ptr), $in_ptr # pointer to source |u|v|
	#lea 8*6($out_ptr), $out_ptr # pointer to destination |u|
	#call __smulq_383x63

	#mov 8*9(%rsp), $f0 # |f1|
	#mov 8*10(%rsp), $g0 # |g1|
	mov $f1, $f0
	mov $g1, $g0
	mov 8*4(%rsp), $out_ptr # original out_ptr
	call __smulq_767x63

	mov 8*5(%rsp), $in_ptr # original nx_ptr
	mov %rax, %rdx # top limb of the result
	sar \$63, %rax # result's sign as mask

	mov %rax, @acc[0] # mask |modulus|
	mov %rax, @acc[1]
	mov %rax, @acc[2]
	and 8*0($in_ptr), @acc[0]
	and 8*1($in_ptr), @acc[1]
	mov %rax, @acc[3]
	and 8*2($in_ptr), @acc[2]
	and 8*3($in_ptr), @acc[3]
	mov %rax, @acc[4]
	and 8*4($in_ptr), @acc[4]
	and 8*5($in_ptr), %rax

	add @acc[0], @acc[6] # conditionally add |modulus|<<384
	adc @acc[1], @acc[7]
	adc @acc[2], @acc[8]
	adc @acc[3], @acc[9]
	adc @acc[4], %rcx
	adc %rax, %rdx

	mov @acc[6], 8*6($out_ptr) # store absolute value
	mov @acc[7], 8*7($out_ptr)
	mov @acc[8], 8*8($out_ptr)
	mov @acc[9], 8*9($out_ptr)
	mov %rcx, 8*10($out_ptr)
	mov %rdx, 8*11($out_ptr)

	lea $frame(%rsp), %r8 # size optimization
	mov 8*0(%r8),%r15
.cfi_restore %r15
	mov 8*1(%r8),%r14
.cfi_restore %r14
	mov 8*2(%r8),%r13
.cfi_restore %r13
	mov 8*3(%r8),%r12
.cfi_restore %r12
	mov 8*4(%r8),%rbx
.cfi_restore %rbx
	mov 8*5(%r8),%rbp
.cfi_restore %rbp
	lea 8*6(%r8),%rsp
.cfi_adjust_cfa_offset -$frame-8*6
.cfi_epilogue
	ret
.cfi_endproc
.size	ct_inverse_mod_383,.-ct_inverse_mod_383
___
########################################################################
# see corresponding commentary in ctx_inverse_mod_384-x86_64...
{
# Private register map for the three multiplication helpers below.  Unlike
# the map used by ct_inverse_mod_383, the last two accumulators here are
# %rcx and %rdi.  $fx holds the absolute value of the current multiplier and
# starts out as %rbp.
my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15),"bx","bp","cx","di");
my $fx = $acc[9];

# __smulq_767x63: |u|*|f0| + |v|*|g0|, where |u| is the 6 limbs at in_ptr and
# |v| the 12 limbs at in_ptr+8*6.  Each operand is conditionally negated
# together with its multiplier (by the multiplier's sign) so that the
# limb-by-limb products can use unsigned mulq.  The 12-limb sum is stored at
# out_ptr and its top limb is also left in %rax.
# First half: |u|*|f0|, parked in the output buffer and sign-extended by one
# limb.
$code.=<<___;
.type	__smulq_767x63,\@abi-omnipotent
.align	32
__smulq_767x63:
	mov 8*0($in_ptr), @acc[0] # load |u|
	mov 8*1($in_ptr), @acc[1]
	mov 8*2($in_ptr), @acc[2]
	mov 8*3($in_ptr), @acc[3]
	mov 8*4($in_ptr), @acc[4]
	mov 8*5($in_ptr), @acc[5]

	mov $f0, $fx
	sar \$63, $f0 # |f0|'s sign as mask
	xor %rax, %rax
	sub $f0, %rax # |f0|'s sign as bit

	mov $out_ptr, 8*1(%rsp)
	mov $in_ptr, 8*2(%rsp)
	lea 8*6($in_ptr), $in_ptr # pointer to |v|

	xor $f0, $fx # conditionally negate |f0|
	add %rax, $fx

	xor $f0, @acc[0] # conditionally negate |u|
	xor $f0, @acc[1]
	xor $f0, @acc[2]
	xor $f0, @acc[3]
	xor $f0, @acc[4]
	xor $f0, @acc[5]
	add @acc[0], %rax
	adc \$0, @acc[1]
	adc \$0, @acc[2]
	adc \$0, @acc[3]
	adc \$0, @acc[4]
	adc \$0, @acc[5]

	mulq $fx # |u|*|f0|
	mov %rax, 8*0($out_ptr) # offload |u|*|f0|
	mov @acc[1], %rax
	mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
	mulq $fx
	add %rax, @acc[$i]
	mov @acc[$i+1], %rax
	adc \$0, %rdx
	mov %rdx, @acc[$i+1]
	mov @acc[$i], 8*$i($out_ptr)
___
}
# $i is 5 here: the top limb uses the signed one-operand imulq.
$code.=<<___;
	imulq $fx
	add %rax, @acc[$i]
	adc \$0, %rdx

	mov @acc[5], 8*5($out_ptr)
	mov %rdx, 8*6($out_ptr)
	sar \$63, %rdx # sign extension
	mov %rdx, 8*7($out_ptr)
___
{
# Second half: |v|*|g0|.  All twelve accumulators are needed (@acc[11] is
# %rdi, the output pointer), so the multiplier moves into %rsi, i.e.
# $in_ptr; both pointers were saved at 8*1(%rsp) and 8*2(%rsp) above and are
# reloaded before returning.
my $fx=$in_ptr;
$code.=<<___;
	mov $g0, $f0 # load |g0|

	mov 8*0($in_ptr), @acc[0] # load |v|
	mov 8*1($in_ptr), @acc[1]
	mov 8*2($in_ptr), @acc[2]
	mov 8*3($in_ptr), @acc[3]
	mov 8*4($in_ptr), @acc[4]
	mov 8*5($in_ptr), @acc[5]
	mov 8*6($in_ptr), @acc[6]
	mov 8*7($in_ptr), @acc[7]
	mov 8*8($in_ptr), @acc[8]
	mov 8*9($in_ptr), @acc[9]
	mov 8*10($in_ptr), @acc[10]
	mov 8*11($in_ptr), @acc[11]

	mov $f0, $fx # overrides in_ptr
	sar \$63, $f0 # |g0|'s sign as mask
	xor %rax, %rax
	sub $f0, %rax # |g0|'s sign as bit

	xor $f0, $fx # conditionally negate |g0|
	add %rax, $fx

	xor $f0, @acc[0] # conditionally negate |v|
	xor $f0, @acc[1]
	xor $f0, @acc[2]
	xor $f0, @acc[3]
	xor $f0, @acc[4]
	xor $f0, @acc[5]
	xor $f0, @acc[6]
	xor $f0, @acc[7]
	xor $f0, @acc[8]
	xor $f0, @acc[9]
	xor $f0, @acc[10]
	xor $f0, @acc[11]
	add @acc[0], %rax
	adc \$0, @acc[1]
	adc \$0, @acc[2]
	adc \$0, @acc[3]
	adc \$0, @acc[4]
	adc \$0, @acc[5]
	adc \$0, @acc[6]
	adc \$0, @acc[7]
	adc \$0, @acc[8]
	adc \$0, @acc[9]
	adc \$0, @acc[10]
	adc \$0, @acc[11]

	mulq $fx # |v|*|g0|
	mov %rax, @acc[0]
	mov @acc[1], %rax
	mov %rdx, @acc[1]
___
for($i=1; $i<11; $i++) {
$code.=<<___;
	mulq $fx
	add %rax, @acc[$i]
	mov @acc[$i+1], %rax
	adc \$0, %rdx
	mov %rdx, @acc[$i+1]
___
}
$code.=<<___;
	mov 8*1(%rsp), %rdx # out_ptr
	imulq $fx, %rax
	mov 8*2(%rsp), $in_ptr # restore original in_ptr
	add @acc[11], %rax

	add 8*0(%rdx), @acc[0] # accumulate |u|*|f0|
	adc 8*1(%rdx), @acc[1]
	adc 8*2(%rdx), @acc[2]
	adc 8*3(%rdx), @acc[3]
	adc 8*4(%rdx), @acc[4]
	adc 8*5(%rdx), @acc[5]
	adc 8*6(%rdx), @acc[6]
	mov 8*7(%rdx), @acc[11] # sign extension
	adc @acc[11], @acc[7]
	adc @acc[11], @acc[8]
	adc @acc[11], @acc[9]
	adc @acc[11], @acc[10]
	adc @acc[11], %rax

	mov %rdx, $out_ptr # restore original out_ptr

	mov @acc[0], 8*0(%rdx)
	mov @acc[1], 8*1(%rdx)
	mov @acc[2], 8*2(%rdx)
	mov @acc[3], 8*3(%rdx)
	mov @acc[4], 8*4(%rdx)
	mov @acc[5], 8*5(%rdx)
	mov @acc[6], 8*6(%rdx)
	mov @acc[7], 8*7(%rdx)
	mov @acc[8], 8*8(%rdx)
	mov @acc[9], 8*9(%rdx)
	mov @acc[10], 8*10(%rdx)
	mov %rax, 8*11(%rdx)

	ret
.size	__smulq_767x63,.-__smulq_767x63
___
}
# __smulq_383x63: 6-limb variant of the above.  The template is emitted
# twice: pass $j==0 multiplies |u| (at in_ptr) by the value passed in %rdx
# and parks the product at out_ptr; pass $j==1 multiplies |v| (at
# in_ptr+8*6) by $g0.  The sum is stored as 6 limbs at out_ptr and in_ptr is
# restored before returning.
$code.=<<___;
.type	__smulq_383x63,\@abi-omnipotent
.align	32
__smulq_383x63:
___
for($j=0; $j<2; $j++) {
$code.=<<___;
	mov 8*0($in_ptr), @acc[0] # load |u| (or |v|)
	mov 8*1($in_ptr), @acc[1]
	mov 8*2($in_ptr), @acc[2]
	mov 8*3($in_ptr), @acc[3]
	mov 8*4($in_ptr), @acc[4]
	mov 8*5($in_ptr), @acc[5]

	mov %rdx, $fx
	sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
	xor %rax, %rax
	sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)

	xor %rdx, $fx # conditionally negate |f0|
	add %rax, $fx

	xor %rdx, @acc[0] # conditionally negate |u| (or |v|)
	xor %rdx, @acc[1]
	xor %rdx, @acc[2]
	xor %rdx, @acc[3]
	xor %rdx, @acc[4]
	xor %rdx, @acc[5]
	add @acc[0], %rax
	adc \$0, @acc[1]
	adc \$0, @acc[2]
	adc \$0, @acc[3]
	adc \$0, @acc[4]
	adc \$0, @acc[5]

	mulq $fx # |u|*|f0| (or |v|*|g0|)
	mov %rax, @acc[0]
	mov @acc[1], %rax
	mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
	mulq $fx
	add %rax, @acc[$i]
	mov @acc[$i+1], %rax
	adc \$0, %rdx
	mov %rdx, @acc[$i+1]
___
}
$code.=<<___	if ($j==0);
	imulq $fx, %rax
	add %rax, @acc[$i]

	lea 8*6($in_ptr), $in_ptr # pointer to |v|
	mov $g0, %rdx

	mov @acc[0], 8*0($out_ptr) # offload |u|*|f0|
	mov @acc[1], 8*1($out_ptr)
	mov @acc[2], 8*2($out_ptr)
	mov @acc[3], 8*3($out_ptr)
	mov @acc[4], 8*4($out_ptr)
	mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
	imulq $fx, %rax
	add %rax, @acc[$i]

	lea -8*6($in_ptr), $in_ptr # restore original in_ptr

	add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0|
	adc 8*1($out_ptr), @acc[1]
	adc 8*2($out_ptr), @acc[2]
	adc 8*3($out_ptr), @acc[3]
	adc 8*4($out_ptr), @acc[4]
	adc 8*5($out_ptr), @acc[5]

	mov @acc[0], 8*0($out_ptr)
	mov @acc[1], 8*1($out_ptr)
	mov @acc[2], 8*2($out_ptr)
	mov @acc[3], 8*3($out_ptr)
	mov @acc[4], 8*4($out_ptr)
	mov @acc[5], 8*5($out_ptr)

	ret
.size	__smulq_383x63,.-__smulq_383x63
___
{
# __smulq_383_n_shift_by_62: (|a|*|f0| + |b|*|g0|) >> 62 over the 6-limb
# values at in_ptr and in_ptr+8*6.  The result is replaced by its absolute
# value before being stored at out_ptr, and the same conditional negation is
# applied to the multipliers handed back in %rdx and $g0 (the "corrected"
# |f0|/|g0| at the call sites).  The incoming %rdx is stashed in %rbx first.
$code.=<<___;
.type	__smulq_383_n_shift_by_62,\@abi-omnipotent
.align	32
__smulq_383_n_shift_by_62:
	mov $f0, @acc[8]
___
# From here on $f0 names the stashed copy (%rbx), not %rdx.
my $f0 = $acc[8];
for($j=0; $j<2; $j++) {
$code.=<<___;
	mov 8*0($in_ptr), @acc[0] # load |a| (or |b|)
	mov 8*1($in_ptr), @acc[1]
	mov 8*2($in_ptr), @acc[2]
	mov 8*3($in_ptr), @acc[3]
	mov 8*4($in_ptr), @acc[4]
	mov 8*5($in_ptr), @acc[5]

	mov %rdx, $fx
	sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s)
	xor %rax, %rax
	sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s)

	xor %rdx, $fx # conditionally negate |f0| (or |g0|)
	add %rax, $fx

	xor %rdx, @acc[0] # conditionally negate |a| (or |b|)
	xor %rdx, @acc[1]
	xor %rdx, @acc[2]
	xor %rdx, @acc[3]
	xor %rdx, @acc[4]
	xor %rdx, @acc[5]
	add @acc[0], %rax
	adc \$0, @acc[1]
	adc \$0, @acc[2]
	adc \$0, @acc[3]
	adc \$0, @acc[4]
	adc \$0, @acc[5]

	mulq $fx # |a|*|f0| (or |b|*|g0|)
	mov %rax, @acc[0]
	mov @acc[1], %rax
	mov %rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
	mulq $fx
	add %rax, @acc[$i]
	mov @acc[$i+1], %rax
	adc \$0, %rdx
	mov %rdx, @acc[$i+1]
___
}
$code.=<<___	if ($j==0);
	imulq $fx
	add %rax, @acc[$i]
	adc \$0, %rdx

	lea 8*6($in_ptr), $in_ptr # pointer to |b|
	mov %rdx, @acc[6]
	mov $g0, %rdx

	mov @acc[0], 8*0($out_ptr)
	mov @acc[1], 8*1($out_ptr)
	mov @acc[2], 8*2($out_ptr)
	mov @acc[3], 8*3($out_ptr)
	mov @acc[4], 8*4($out_ptr)
	mov @acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
	imulq $fx
	add %rax, @acc[$i]
	adc \$0, %rdx

	lea -8*6($in_ptr), $in_ptr # restore original in_ptr

	add 8*0($out_ptr), @acc[0]
	adc 8*1($out_ptr), @acc[1]
	adc 8*2($out_ptr), @acc[2]
	adc 8*3($out_ptr), @acc[3]
	adc 8*4($out_ptr), @acc[4]
	adc 8*5($out_ptr), @acc[5]
	adc %rdx, @acc[6]
	mov $f0, %rdx

	shrd \$62, @acc[1], @acc[0]
	shrd \$62, @acc[2], @acc[1]
	shrd \$62, @acc[3], @acc[2]
	shrd \$62, @acc[4], @acc[3]
	shrd \$62, @acc[5], @acc[4]
	shrd \$62, @acc[6], @acc[5]

	sar \$63, @acc[6] # sign as mask
	xor $fx, $fx
	sub @acc[6], $fx # sign as bit

	xor @acc[6], @acc[0] # conditionally negate the result
	xor @acc[6], @acc[1]
	xor @acc[6], @acc[2]
	xor @acc[6], @acc[3]
	xor @acc[6], @acc[4]
	xor @acc[6], @acc[5]
	add $fx, @acc[0]
	adc \$0, @acc[1]
	adc \$0, @acc[2]
	adc \$0, @acc[3]
	adc \$0, @acc[4]
	adc \$0, @acc[5]

	mov @acc[0], 8*0($out_ptr)
	mov @acc[1], 8*1($out_ptr)
	mov @acc[2], 8*2($out_ptr)
	mov @acc[3], 8*3($out_ptr)
	mov @acc[4], 8*4($out_ptr)
	mov @acc[5], 8*5($out_ptr)

	xor @acc[6], %rdx # conditionally negate |f0|
	xor @acc[6], $g0 # conditionally negate |g0|
	add $fx, %rdx
	add $fx, $g0

	ret
.size	__smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62
___
} }

{
# Registers for the approximation and inner-loop routines.  |a_| and |b_|
# are two limbs each.  $t5 is %rsi, i.e. the caller's input pointer, which
# is why __inner_loop_62 saves and restores it around the loop.
my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11));
my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi");
{
# Three-limb sliding windows used while searching for the top-most limbs;
# the middle element is a temporary, the outer ones are the lo/hi registers
# consumed by the inner loop.
my @a = ($a_lo, $t1, $a_hi);
my @b = ($b_lo, $t2, $b_hi);

# __ab_approximation_62: scan |a| and |b| (6 limbs each at in_ptr and
# in_ptr+8*6) from the top with cmovz cascades, so no branch depends on the
# data, until the limb where a|b is non-zero is found.  The limb below it is
# then shifted in with shld so the high word is left-aligned, while the low
# word is always limb 0.  Control continues in __inner_loop_62 via jmp; the
# ret after the jmp is never reached.
$code.=<<___;
.type	__ab_approximation_62,\@abi-omnipotent
.align	32
__ab_approximation_62:
	mov 8*5($in_ptr), @a[2] # load |a| in reverse order
	mov 8*11($in_ptr), @b[2] # load |b| in reverse order
	mov 8*4($in_ptr), @a[1]
	mov 8*10($in_ptr), @b[1]
	mov 8*3($in_ptr), @a[0]
	mov 8*9($in_ptr), @b[0]

	mov @a[2], $t0
	or @b[2], $t0 # check top-most limbs, ...
	cmovz @a[1], @a[2]
	cmovz @b[1], @b[2]
	cmovz @a[0], @a[1]
	cmovz @b[0], @b[1]
	mov 8*2($in_ptr), @a[0]
	mov 8*8($in_ptr), @b[0]

	mov @a[2], $t0
	or @b[2], $t0 # ... ones before top-most, ...
	cmovz @a[1], @a[2]
	cmovz @b[1], @b[2]
	cmovz @a[0], @a[1]
	cmovz @b[0], @b[1]
	mov 8*1($in_ptr), @a[0]
	mov 8*7($in_ptr), @b[0]

	mov @a[2], $t0
	or @b[2], $t0 # ... and ones before that ...
	cmovz @a[1], @a[2]
	cmovz @b[1], @b[2]
	cmovz @a[0], @a[1]
	cmovz @b[0], @b[1]
	mov 8*0($in_ptr), @a[0]
	mov 8*6($in_ptr), @b[0]

	mov @a[2], $t0
	or @b[2], $t0
	bsr $t0, %rcx
	lea 1(%rcx), %rcx
	cmovz @a[1], @a[2]
	cmovz @b[1], @b[2]
	cmovz $t0, %rcx
	neg %rcx
	#and \$63, %rcx # debugging artefact

	shldq %cl, @a[1], @a[2] # align second limb to the left
	shldq %cl, @b[1], @b[2]

	jmp __inner_loop_62

	ret
.size	__ab_approximation_62,.-__ab_approximation_62
___
}
# __inner_loop_62: runs $cnt branch-free steps of the loop labelled
# "__inner_loop_62" in the Python reference on the two-limb |a_|, |b_| held
# in $a_lo/$a_hi and $b_lo/$b_hi, building up |f0|,|g0|,|f1|,|g1| starting
# from the identity (1,0,0,1).  All data-dependent choices are made with
# cmov; the only branch is the loop counter.  On return $a_lo and $b_lo hold
# the updated low limbs (the final iterations in ct_inverse_mod_383 store
# them directly).
# NOTE(review): the ".align 8" + ".long 0" pair ahead of the label looks like
# deliberate padding to place the loop at a particular offset -- confirm
# before changing it.
$code.=<<___;
.type	__inner_loop_62,\@abi-omnipotent
.align	8
.long	0
__inner_loop_62:
	mov \$1, $f0 # |f0|=1
	xor $g0, $g0 # |g0|=0
	xor $f1, $f1 # |f1|=0
	mov \$1, $g1 # |g1|=1
	mov $in_ptr, 8(%rsp)

.Loop_62:
	xor $t0, $t0
	xor $t1, $t1
	test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_|
	mov $b_lo, $t2
	mov $b_hi, $t3
	cmovnz $b_lo, $t0
	cmovnz $b_hi, $t1
	sub $a_lo, $t2 # |b_|-|a_|
	sbb $a_hi, $t3
	mov $a_lo, $t4
	mov $a_hi, $t5
	sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even)
	sbb $t1, $a_hi
	cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_|
	cmovc $t3, $a_hi
	cmovc $t4, $b_lo # |b_| = |a_|
	cmovc $t5, $b_hi
	mov $f0, $t0 # exchange |f0| and |f1|
	cmovc $f1, $f0
	cmovc $t0, $f1
	mov $g0, $t1 # exchange |g0| and |g1|
	cmovc $g1, $g0
	cmovc $t1, $g1
	xor $t0, $t0
	xor $t1, $t1
	shrd \$1, $a_hi, $a_lo
	shr \$1, $a_hi
	test \$1, $t4 # if |a_| was odd, then we'll be subtracting...
	cmovnz $f1, $t0
	cmovnz $g1, $t1
	add $f1, $f1 # |f1|<<=1
	add $g1, $g1 # |g1|<<=1
	sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even)
	sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...)
	sub \$1, $cnt
	jnz .Loop_62

	mov 8(%rsp), $in_ptr
	ret
.size	__inner_loop_62,.-__inner_loop_62
___
}

# Hand the accumulated assembly to the translator.  STDOUT is a pipe to
# x86_64-xlate.pl (opened above), so close() also waits for that process and
# returns false if it exited with a non-zero status; without the check a
# failed translation would go unnoticed and leave a truncated output file.
print $code;
close STDOUT
    or die $! ? "error closing STDOUT: $!"
              : "$xlate exited with status $?";