ftu/blst/asm/div3w-x86_64.pl
2022-09-09 02:47:49 -04:00

185 lines
4.9 KiB
Raku
Executable file

#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
# Command line: [flavour] output. When the first argument contains a
# dot it is taken to be the output file name and the flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) {
    $output = $flavour;
    undef $flavour;
}

# Flag is 1 for nasm/masm/mingw64 flavours or when the output file name
# ends in ".asm", otherwise 0.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Directory part of this script's path (including the trailing
# separator), used as the base for locating the translator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;

# Look for x86_64-xlate.pl next to this script first, then under
# ../../perlasm/ relative to it.
$xlate = "${dir}x86_64-xlate.pl";
unless (-f $xlate) {
    $xlate = "${dir}../../perlasm/x86_64-xlate.pl";
    -f $xlate or die "can't locate x86_64-xlate.pl";
}

# Everything printed to STDOUT from here on is piped through the
# translator, which is run with the same perl binary ($^X).
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
# C reference implementation of div_3_limbs, kept as a non-interpolating
# heredoc. $c_ref is not referenced again in the visible part of this
# script (only $code is printed), so this text serves as documentation
# for the assembly that follows.
$c_ref=<<'___';
/*
* |div_top| points at two most significant limbs of the dividend, |d_hi|
* and |d_lo| are two most significant limbs of the divisor. If divisor
* is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
* The divisor is required to be "bitwise left-aligned," and dividend's
* top limbs to be not larger than the divisor's. The latter limitation
* can be problematic in the first iteration of multi-precision division,
* where in most general case the condition would have to be "smaller."
* The subroutine considers four limbs, two of which are "overlapping,"
* hence the name... Another way to look at it is to think of the pair
* of the dividend's limbs being suffixed with a zero:
* +-------+-------+-------+
* R | | | 0 |
* +-------+-------+-------+
* +-------+-------+
* D | | |
* +-------+-------+
*/
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
limb_t Q = 0, mask;
size_t i;
for (i = 0; i < LIMB_BITS; i++) {
Q <<= 1;
mask = (R >= D);
Q |= mask;
R -= (D & ((llimb_t)0 - mask));
D >>= 1;
}
mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */
Q <<= 1;
Q |= (R >= D);
return (Q | mask);
}
___
########################################################################
# div_3_limbs -- assembly counterpart of the C reference in $c_ref,
# declared as a three-argument function.
#
# Register usage, as visible in the code below:
#   %rdi       pointer to two limbs: (%rdi) is loaded as R.lo and
#              8(%rdi) as R.hi. After the loads %rdi is reused as
#              scratch for shifting D.
#   %rsi       low limb of D
#   %rdx       high limb of D
#   %r8:%r9    running R (lo:hi)
#   %r10:%r11  copy of R taken before each trial subtraction
#   %ecx       loop counter, 64 iterations
#   %rax       Q, and the return value
#
# Each loop pass subtracts D from R and shifts Q left with a speculative
# 1 in the low bit. mov, lea and cmovc leave the flags untouched, so the
# borrow produced by "sbb %rdx,%r9" is still live when cmovc restores the
# saved R and when "sbb 0,%rax" removes the speculative bit. D is then
# shifted right by one bit across both limbs, with bit 0 of the high limb
# moved into bit 63 of the low limb. The quotient bit is selected with
# cmovc/sbb; the only branch in the routine is on the loop counter.
#
# After the loop, %rcx receives Q shifted left with one more speculative
# bit, which a final R - D borrow clears, while "sar 63" turns %rax into
# all ones if the top bit of Q was set. OR-ing the two gives the result,
# which is all ones when Q overflowed.
$code.=<<___;
.text
.globl div_3_limbs
.hidden div_3_limbs
.type div_3_limbs,\@function,3
.align 32
div_3_limbs:
mov (%rdi),%r8 # load R.lo
mov 8(%rdi),%r9 # load R.hi
xor %rax,%rax # Q = 0
mov \$64,%ecx # loop counter
.Loop:
mov %r8,%r10 # put aside R
sub %rsi,%r8 # R -= D
mov %r9,%r11
sbb %rdx,%r9
lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit
mov %rdx,%rdi
cmovc %r10,%r8 # restore R if R - D borrowed
cmovc %r11,%r9
sbb \$0,%rax # subtract speculative bit
shl \$63,%rdi
shr \$1,%rsi
shr \$1,%rdx
or %rdi,%rsi # D >>= 1
sub \$1,%ecx
jnz .Loop
lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit
sar \$63,%rax # top bit -> mask
sub %rsi,%r8 # R -= D
sbb %rdx,%r9
sbb \$0,%rcx # subtract speculative bit
or %rcx,%rax # all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
___
########################################################################
# Calculate remainder and adjust the quotient, which can be off-by-one.
# Then save quotient in limb next to top limb of the remainder. There is
# place, because the remainder/next-iteration-dividend gets shorter by
# one limb.
#
# Both routines below are declared as three-argument functions and
# return the quotient in %rax.
#
# quot_rem_128, as visible in the code:
#   - the quotient arrives in %rdx and is copied to %rax and %rcx;
#   - divisor[0] and divisor[1] (at 0 and 8 from %rsi) are multiplied by
#     the quotient with mulq, giving a three-limb product in @acc;
#   - the product is subtracted from the three limbs at 0, 8 and 16 from
#     %rdi; the final borrow becomes an all-ones or zero mask;
#   - the mask is added to the quotient (subtracting one on borrow) and
#     the divisor, AND-ed with the mask, is added back to the low two
#     limbs of the difference;
#   - two remainder limbs are stored at 0 and 8 from %rdi and the
#     adjusted quotient at 16.
#
# quot_rem_64, as visible in the code:
#   - the quotient arrives in %rdx and is returned unchanged;
#   - divisor[0] * quotient (imulq, low 64 bits) is subtracted from the
#     limb at 0 from %rdi, and the result is stored back there;
#   - the quotient is stored at 8 from %rdi.
{
# Symbolic register names interpolated into the heredoc below. Elements
# are written as one-element slices (@acc[0]) so that they interpolate
# inside the heredoc.
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
# Three-limb product accumulator; @acc[2] is %rdx because mulq leaves
# the high half of its result there.
my @acc = ("%r8", "%r9", "%rdx");
# Holds the dividend limbs while the product is subtracted from them.
my @tmp = ("%r10", "%r11", "%rax");
$code.=<<___;
.globl quot_rem_128
.hidden quot_rem_128
.type quot_rem_128,\@function,3
.align 32
quot_rem_128:
mov %rdx, %rax
mov %rdx, $quotient
mulq 0($divisor) # divisor[0:1] * quotient
mov %rax, @acc[0]
mov $quotient, %rax
mov %rdx, @acc[1]
mulq 8($divisor)
add %rax, @acc[1]
adc \$0, %rdx # %rdx is @acc[2]
mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend
mov 8($div_rem), @tmp[1]
mov 16($div_rem), @tmp[2]
sub @acc[0], @tmp[0] # dividend - divisor * quotient
sbb @acc[1], @tmp[1]
sbb @acc[2], @tmp[2]
sbb @acc[0], @acc[0] # borrow -> mask
add @acc[0], $quotient # if borrowed, adjust the quotient ...
mov @acc[0], @acc[1]
and 0($divisor), @acc[0]
and 8($divisor), @acc[1]
add @acc[0], @tmp[0] # ... and add divisor
adc @acc[1], @tmp[1]
mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ...
mov @tmp[1], 8($div_rem)
mov $quotient, 16($div_rem) # ... and 1 limb of the quotient
mov $quotient, %rax # return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
########################################################################
# Unlike 128-bit case above, quotient is exact. As result just one limb
# of the dividend is sufficient to calculate the remainder...
.globl quot_rem_64
.hidden quot_rem_64
.type quot_rem_64,\@function,3
.align 32
quot_rem_64:
mov %rdx, %rax # return quotient
imulq 0($divisor), %rdx # divisor[0] * quotient
mov 0($div_rem), @tmp[0] # load 1 limb of the dividend
sub %rdx, @tmp[0] # dividend - divisor * quotient
mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ...
mov %rax, 8($div_rem) # ... and 1 limb of the quotient
ret
.size quot_rem_64,.-quot_rem_64
___
}
# Send the generated assembly through the translator pipe that was
# opened on STDOUT earlier in the script.
print $code;
# Closing a piped handle waits for the child process. close returns
# false if a system call failed ($! is set) or if the child exited with
# non-zero status ($! is 0 and the status is in $?). Without this check
# a failed translation would go unnoticed and could leave a truncated
# or empty output file.
close STDOUT
    or die($! ? "error closing STDOUT: $!"
              : "x86_64-xlate.pl exited with status $?");