#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command-line arguments: an optional "flavour" followed by the output file
# name; both are handed to x86_64-xlate.pl below. If the first argument
# contains a dot it is taken to be the output file name and the flavour is
# left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Flag a Win64 target: nasm/masm/mingw64 flavour, or an output ending in .asm.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Directory part of this script's path, trailing separator included. The
# capture is taken from the match itself (list context), so $dir is simply
# undefined when $0 has no directory part instead of depending on whatever
# $1 happened to hold before.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;

# Look for the translator next to this script first, then in the perlasm
# directory two levels up.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator,
# which is run under the same perl interpreter ($^X).
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# C reference implementation of div_3_limbs, kept as documentation for the
# assembly below. The heredoc is non-interpolating and $c_ref is not
# referenced anywhere else in this script.
$c_ref=<<'___';
/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *         +-------+-------+-------+
 * R       |       |       |   0   |
 *         +-------+-------+-------+
 *                 +-------+-------+
 * D               |       |       |
 *                 +-------+-------+
 */
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
    llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
    llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
    limb_t Q = 0, mask;
    size_t i;

    for (i = 0; i < LIMB_BITS; i++) {
        Q <<= 1;
        mask = (R >= D);
        Q |= mask;
        R -= (D & ((llimb_t)0 - mask));
        D >>= 1;
    }

    mask = 0 - (Q >> (LIMB_BITS - 1));   /* does it overflow? */

    Q <<= 1;
    Q |= (R >= D);

    return (Q | mask);
}
___

# div_3_limbs: assembly counterpart of the C reference in $c_ref.
#
# Declared as a 3-argument function. As used below:
#   %rdi - pointer to the two dividend limbs (R.lo at 0, R.hi at 8)
#   %rsi - low limb of the divisor D
#   %rdx - high limb of the divisor D
# The result is returned in %rax.
#
# The 64-iteration loop is branch-free apart from the loop counter: each
# pass subtracts D from R, shifts a speculative 1 into Q, and then uses the
# borrow both to restore R (cmovc) and to retract the speculative bit (sbb).
# D is shifted right by one bit per pass. After the loop one more quotient
# bit is produced, and the result is forced to all ones if the top bit of
# the 64-bit Q was set.
$code.=<<___;
.text

.globl	div_3_limbs
.hidden	div_3_limbs
.type	div_3_limbs,\@function,3
.align	32
div_3_limbs:
	mov	(%rdi),%r8		# load R.lo
	mov	8(%rdi),%r9		# load R.hi
	xor	%rax,%rax		# Q = 0
	mov	\$64,%ecx		# loop counter

.Loop:
	mov	%r8,%r10		# put aside R
	sub	%rsi,%r8		# R -= D
	mov	%r9,%r11
	sbb	%rdx,%r9
	lea	1(%rax,%rax),%rax	# Q <<= 1 + speculative bit
	mov	%rdx,%rdi
	cmovc	%r10,%r8		# restore R if R - D borrowed
	cmovc	%r11,%r9
	sbb	\$0,%rax		# subtract speculative bit
	shl	\$63,%rdi
	shr	\$1,%rsi
	shr	\$1,%rdx
	or	%rdi,%rsi		# D >>= 1
	sub	\$1,%ecx
	jnz	.Loop

	lea	1(%rax,%rax),%rcx	# Q <<= 1 + speculative bit
	sar	\$63,%rax		# top bit -> mask

	sub	%rsi,%r8		# R -= D
	sbb	%rdx,%r9
	sbb	\$0,%rcx		# subtract speculative bit

	or	%rcx,%rax		# all ones if overflow

	ret
.size	div_3_limbs,.-div_3_limbs
___

########################################################################
# Calculate remainder and adjust the quotient, which can be off-by-one.
# Then save quotient in limb next to top limb of the remainder. There is
# place, because the remainder/next-iteration-dividend gets shorter by
# one limb.
#
# Both routines are declared as 3-argument functions. As used below:
#   %rdi ($div_rem) - pointer to the dividend limbs, overwritten in place
#                     with the remainder followed by the quotient limb
#   %rsi ($divisor) - pointer to the divisor limbs
#   %rdx            - the quotient estimate
# The quotient is returned in %rax (after adjustment in quot_rem_128).
{
# Register aliases, interpolated into the assembly text below. %rdx is
# listed as the third accumulator limb because mulq leaves the high half of
# its product there.
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
my @acc = ("%r8", "%r9", "%rdx");
my @tmp = ("%r10", "%r11", "%rax");

$code.=<<___;
.globl	quot_rem_128
.hidden	quot_rem_128
.type	quot_rem_128,\@function,3
.align	32
quot_rem_128:
	mov	%rdx, %rax
	mov	%rdx, $quotient

	mulq	0($divisor)		# divisor[0:1] * quotient
	mov	%rax, @acc[0]
	mov	$quotient, %rax
	mov	%rdx, @acc[1]

	mulq	8($divisor)
	add	%rax, @acc[1]
	adc	\$0, %rdx		# %rdx is @acc[2]

	mov	0($div_rem), @tmp[0]	# load 3 limbs of the dividend
	mov	8($div_rem), @tmp[1]
	mov	16($div_rem), @tmp[2]

	sub	@acc[0], @tmp[0]	# dividend - divisor * quotient
	sbb	@acc[1], @tmp[1]
	sbb	@acc[2], @tmp[2]
	sbb	@acc[0], @acc[0]	# borrow -> mask

	add	@acc[0], $quotient	# if borrowed, adjust the quotient ...
	mov	@acc[0], @acc[1]
	and	0($divisor), @acc[0]
	and	8($divisor), @acc[1]
	add	@acc[0], @tmp[0]	# ... and add divisor
	adc	@acc[1], @tmp[1]

	mov	@tmp[0], 0($div_rem)	# save 2 limbs of the remainder ...
	mov	@tmp[1], 8($div_rem)
	mov	$quotient, 16($div_rem)	# ... and 1 limb of the quotient

	mov	$quotient, %rax		# return adjusted quotient

	ret
.size	quot_rem_128,.-quot_rem_128

########################################################################
# Unlike 128-bit case above, quotient is exact. As result just one limb
# of the dividend is sufficient to calculate the remainder...

.globl	quot_rem_64
.hidden	quot_rem_64
.type	quot_rem_64,\@function,3
.align	32
quot_rem_64:
	mov	%rdx, %rax		# return quotient
	imulq	0($divisor), %rdx	# divisor[0] * quotient

	mov	0($div_rem), @tmp[0]	# load 1 limb of the dividend

	sub	%rdx, @tmp[0]		# dividend - divisor * quotient

	mov	@tmp[0], 0($div_rem)	# save 1 limb of the remainder ...
	mov	%rax, 8($div_rem)	# ... and 1 limb of the quotient

	ret
.size	quot_rem_64,.-quot_rem_64
___
}

# Send the generated assembly down the translator pipe opened on STDOUT.
# Closing a piped handle waits for the child process and returns false if
# the child exited with a non-zero status or if flushing failed, so the
# result is checked rather than silently leaving a truncated or empty
# output file behind. When only the child's exit status is at fault, $! is
# zero and $? carries the status.
print $code;
close STDOUT
    or die $! ? "error closing STDOUT: $!"
              : "$xlate exited with status $?";