ftu/blst/asm/div3w-x86_64.pl

#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command line: [flavour] output. A lone argument containing a dot is taken
# to be the output file name, in which case no flavour is passed on.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Flag Windows targets: nasm/masm/mingw64 flavours, or an output name ending
# in .asm. NOTE(review): $win64 is not referenced again in the visible part
# of this script.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first in the directory this script
# was invoked from ($0), then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Re-open STDOUT as a pipe into the translator, run with the same perl
# interpreter ($^X); everything printed to STDOUT from here on is its input.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# Reference implementation of div_3_limbs in C, kept in a here-document for
# the reader's benefit. The quoted delimiter ('___') disables interpolation,
# and $c_ref is never appended to $code, so none of this text reaches the
# generated assembly.
$c_ref=<<'___';
/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *   +-------+-------+-------+
 * R |       |       |   0   |
 *   +-------+-------+-------+
 *           +-------+-------+
 * D         |       |       |
 *           +-------+-------+
 */
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
    llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
    llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
    limb_t Q = 0, mask;
    size_t i;

    for (i = 0; i < LIMB_BITS; i++) {
        Q <<= 1;
        mask = (R >= D);
        Q |= mask;
        R -= (D & ((llimb_t)0 - mask));
        D >>= 1;
    }

    mask = 0 - (Q >> (LIMB_BITS - 1));   /* does it overflow? */

    Q <<= 1;
    Q |= (R >= D);

    return (Q | mask);
}
___

########################################################################
# limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi);
#
# Assembly counterpart of the C reference kept in $c_ref.
#
# Register use, as read from the code below:
#   %rdi       div_top on entry; reused inside the loop as scratch for the
#              bit that moves from D.hi into D.lo
#   %rsi:%rdx  D.lo:D.hi (d_lo, d_hi), shifted right one bit per iteration
#   %r8:%r9    R.lo:R.hi; %r10:%r11 hold the copy taken before the trial
#              subtraction
#   %rax       Q; %ecx is the loop counter (64 iterations)
# NOTE(review): the `\@function,3` annotation is presumably what lets
# x86_64-xlate.pl map the three arguments for Win64 - confirm in the
# translator.
#
# Each iteration performs the trial subtraction R - D and speculatively sets
# Q = 2*Q + 1. The resulting borrow is then consumed twice: cmovc puts the
# saved R back, and `sbb \$0` takes the speculative bit away again. This
# relies on mov, lea and cmovc leaving the carry flag untouched between the
# sbb that produces it and the sbb that consumes it, so the order of those
# instructions matters. The only branch is the loop counter's jnz.
#
# After the loop one more quotient bit is produced into %rcx the same way
# (R is not restored, as it is not used afterwards), while `sar \$63` turns
# the top bit of the 64-bit Q into an all-ones/all-zeros mask. The value
# returned in %rax is the OR of the two, i.e. all ones on overflow.
$code.=<<___;
.text

.globl	div_3_limbs
.hidden	div_3_limbs
.type	div_3_limbs,\@function,3
.align	32
div_3_limbs:
	mov	(%rdi),%r8		# load R.lo
	mov	8(%rdi),%r9		# load R.hi
	xor	%rax,%rax		# Q = 0
	mov	\$64,%ecx		# loop counter

.Loop:
	 mov	%r8,%r10		# put aside R
	sub	%rsi,%r8		# R -= D
	 mov	%r9,%r11
	sbb	%rdx,%r9
	lea	1(%rax,%rax),%rax	# Q <<= 1 + speculative bit
	 mov	%rdx,%rdi
	cmovc	%r10,%r8		# restore R if R - D borrowed
	cmovc	%r11,%r9
	sbb	\$0,%rax		# subtract speculative bit
	 shl	\$63,%rdi
	 shr	\$1,%rsi
	 shr	\$1,%rdx
	 or	%rdi,%rsi		# D >>= 1
	sub	\$1,%ecx
	jnz	.Loop

	lea	1(%rax,%rax),%rcx	# Q <<= 1 + speculative bit
	sar	\$63,%rax		# top bit -> mask

	sub	%rsi,%r8		# R -= D
	sbb	%rdx,%r9
	sbb	\$0,%rcx		# subtract speculative bit

	or	%rcx,%rax		# all ones if overflow

	ret
.size	div_3_limbs,.-div_3_limbs
___
########################################################################
# quot_rem_128(div_rem, divisor, quotient)
#
# Calculate remainder and adjust the quotient, which can be off-by-one.
# Then save quotient in limb next to top limb of the remainder. There is
# room, because the remainder/next-iteration-dividend gets shorter by
# one limb.
#
# In detail: the two-limb divisor is multiplied by the quotient estimate
# and the three-limb product is subtracted from the three limbs at
# |div_rem|. If that borrows, the estimate was one too large, so the
# quotient is decremented and the divisor is added back to the low two
# limbs. The adjustment is done with an all-ones/all-zeros mask rather than
# a branch. The two remainder limbs are stored at div_rem[0..1], the
# adjusted quotient at div_rem[2], and the quotient is also returned in
# %rax.
#
# quot_rem_64(div_rem, divisor, quotient), emitted from the same
# here-document, handles a one-limb divisor: it keeps only the low 64 bits
# of divisor[0] * quotient (imulq), subtracts them from div_rem[0], and
# stores the quotient at div_rem[1].
{
# The pointer arguments stay in their incoming registers. The quotient
# arrives in %rdx, which mulq overwrites, so the code copies it to %rcx and
# works with that copy.
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
# Three-limb product divisor * quotient; the top limb is mulq's %rdx.
my @acc = ("%r8", "%r9", "%rdx");
# Limbs of the dividend loaded from |div_rem|.
my @tmp = ("%r10", "%r11", "%rax");

$code.=<<___;
.globl	quot_rem_128
.hidden	quot_rem_128
.type	quot_rem_128,\@function,3
.align	32
quot_rem_128:
	mov	%rdx, %rax
	mov	%rdx, $quotient

	mulq	0($divisor)		# divisor[0:1] * quotient
	mov	%rax, @acc[0]
	mov	$quotient, %rax
	mov	%rdx, @acc[1]

	mulq	8($divisor)
	add	%rax, @acc[1]
	adc	\$0, %rdx		# %rdx is @acc[2]

	mov	0($div_rem), @tmp[0]	# load 3 limbs of the dividend
	mov	8($div_rem), @tmp[1]
	mov	16($div_rem), @tmp[2]

	sub	@acc[0], @tmp[0]	# dividend - divisor * quotient
	sbb	@acc[1], @tmp[1]
	sbb	@acc[2], @tmp[2]
	sbb	@acc[0], @acc[0]	# borrow -> mask

	add	@acc[0], $quotient	# if borrowed, adjust the quotient ...
	mov	@acc[0], @acc[1]
	and	0($divisor), @acc[0]
	and	8($divisor), @acc[1]
	add	@acc[0], @tmp[0]	# ... and add divisor
	adc	@acc[1], @tmp[1]

	mov	@tmp[0], 0($div_rem)	# save 2 limbs of the remainder ...
	mov	@tmp[1], 8($div_rem)
	mov	$quotient, 16($div_rem)	# ... and 1 limb of the quotient

	mov	$quotient, %rax		# return adjusted quotient

	ret
.size	quot_rem_128,.-quot_rem_128

########################################################################
# Unlike 128-bit case above, quotient is exact. As result just one limb
# of the dividend is sufficient to calculate the remainder...

.globl	quot_rem_64
.hidden	quot_rem_64
.type	quot_rem_64,\@function,3
.align	32
quot_rem_64:
	mov	%rdx, %rax		# return quotient
	imulq	0($divisor), %rdx	# divisor[0] * quotient

	mov	0($div_rem), @tmp[0]	# load 1 limb of the dividend

	sub	%rdx, @tmp[0]		# dividend - divisor * quotient

	mov	@tmp[0], 0($div_rem)	# save 1 limb of the remainder ...
	mov	%rax, 8($div_rem)	# ... and 1 limb of the quotient

	ret
.size	quot_rem_64,.-quot_rem_64
___
}

# Hand the accumulated perlasm text to the translator; STDOUT is the pipe
# opened near the top of this script.
print $code;

# close() on a piped handle waits for the child process and returns false if
# it exited with a non-zero status (in which case $! may be 0 and the status
# is in $?). Check it, so that a translator failure fails this script too
# instead of silently leaving bad or truncated output behind.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
initial stuff 2022-09-09 06:47:49 +00:00			`#!/usr/bin/env perl`
			`#`
			`# Copyright Supranational LLC`
			`# Licensed under the Apache License, Version 2.0, see LICENSE for details.`
			`# SPDX-License-Identifier: Apache-2.0`

			`$flavour = shift;`
			`$output = shift;`
			`if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }`

			`$win64=0; $win64=1 if ($flavour =~ /[nm]asm\|mingw64/ \|\| $output =~ /\.asm$/);`

			`$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;`
			`( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or`
			`( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or`
			`die "can't locate x86_64-xlate.pl";`

			`open STDOUT,"\| \"$^X\" \"$xlate\" $flavour \"$output\""`
			`or die "can't call $xlate: $!";`

			`$c_ref=<<'___';`
			`/*`
			`* \|div_top\| points at two most significant limbs of the dividend, \|d_hi\|`
			`* and \|d_lo\| are two most significant limbs of the divisor. If divisor`
			`* is only one limb, it is to be passed in \|d_hi\| with zero in \|d_lo\|.`
			`* The divisor is required to be "bitwise left-aligned," and dividend's`
			`* top limbs to be not larger than the divisor's. The latter limitation`
			`* can be problematic in the first iteration of multi-precision division,`
			`* where in most general case the condition would have to be "smaller."`
			`* The subroutine considers four limbs, two of which are "overlapping,"`
			`* hence the name... Another way to look at it is to think of the pair`
			`* of the dividend's limbs being suffixed with a zero:`
			`* +-------+-------+-------+`
			`* R \| \| \| 0 \|`
			`* +-------+-------+-------+`
			`* +-------+-------+`
			`* D \| \| \|`
			`* +-------+-------+`
			`*/`
			`limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)`
			`{`
			`llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) \| div_top[0];`
			`llimb_t D = ((llimb_t)d_hi << LIMB_BITS) \| d_lo;`
			`limb_t Q = 0, mask;`
			`size_t i;`

			`for (i = 0; i < LIMB_BITS; i++) {`
			`Q <<= 1;`
			`mask = (R >= D);`
			`Q \|= mask;`
			`R -= (D & ((llimb_t)0 - mask));`
			`D >>= 1;`
			`}`

			`mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */`

			`Q <<= 1;`
			`Q \|= (R >= D);`

			`return (Q \| mask);`
			`}`
			`___`

			`$code.=<<___;`
			`.text`

			`.globl div_3_limbs`
			`.hidden div_3_limbs`
			`.type div_3_limbs,\@function,3`
			`.align 32`
			`div_3_limbs:`
			`mov (%rdi),%r8 # load R.lo`
			`mov 8(%rdi),%r9 # load R.hi`
			`xor %rax,%rax # Q = 0`
			`mov \$64,%ecx # loop counter`

			`.Loop:`
			`mov %r8,%r10 # put aside R`
			`sub %rsi,%r8 # R -= D`
			`mov %r9,%r11`
			`sbb %rdx,%r9`
			`lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit`
			`mov %rdx,%rdi`
			`cmovc %r10,%r8 # restore R if R - D borrowed`
			`cmovc %r11,%r9`
			`sbb \$0,%rax # subtract speculative bit`
			`shl \$63,%rdi`
			`shr \$1,%rsi`
			`shr \$1,%rdx`
			`or %rdi,%rsi # D >>= 1`
			`sub \$1,%ecx`
			`jnz .Loop`

			`lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit`
			`sar \$63,%rax # top bit -> mask`

			`sub %rsi,%r8 # R -= D`
			`sbb %rdx,%r9`
			`sbb \$0,%rcx # subtract speculative bit`

			`or %rcx,%rax # all ones if overflow`

			`ret`
			`.size div_3_limbs,.-div_3_limbs`
			`___`
			`########################################################################`
			`# Calculate remainder and adjust the quotient, which can be off-by-one.`
			`# Then save quotient in limb next to top limb of the remainder. There is`
			`# place, because the remainder/next-iteration-dividend gets shorter by`
			`# one limb.`
			`{`
			`my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");`
			`my @acc = ("%r8", "%r9", "%rdx");`
			`my @tmp = ("%r10", "%r11", "%rax");`

			`$code.=<<___;`
			`.globl quot_rem_128`
			`.hidden quot_rem_128`
			`.type quot_rem_128,\@function,3`
			`.align 32`
			`quot_rem_128:`
			`mov %rdx, %rax`
			`mov %rdx, $quotient`

			`mulq 0($divisor) # divisor[0:1] * quotient`
			`mov %rax, @acc[0]`
			`mov $quotient, %rax`
			`mov %rdx, @acc[1]`

			`mulq 8($divisor)`
			`add %rax, @acc[1]`
			`adc \$0, %rdx # %rdx is @acc[2]`

			`mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend`
			`mov 8($div_rem), @tmp[1]`
			`mov 16($div_rem), @tmp[2]`

			`sub @acc[0], @tmp[0] # dividend - divisor * quotient`
			`sbb @acc[1], @tmp[1]`
			`sbb @acc[2], @tmp[2]`
			`sbb @acc[0], @acc[0] # borrow -> mask`

			`add @acc[0], $quotient # if borrowed, adjust the quotient ...`
			`mov @acc[0], @acc[1]`
			`and 0($divisor), @acc[0]`
			`and 8($divisor), @acc[1]`
			`add @acc[0], @tmp[0] # ... and add divisor`
			`adc @acc[1], @tmp[1]`

			`mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ...`
			`mov @tmp[1], 8($div_rem)`
			`mov $quotient, 16($div_rem) # ... and 1 limb of the quotient`

			`mov $quotient, %rax # return adjusted quotient`

			`ret`
			`.size quot_rem_128,.-quot_rem_128`

			`########################################################################`
			`# Unlike 128-bit case above, quotient is exact. As result just one limb`
			`# of the dividend is sufficient to calculate the remainder...`

			`.globl quot_rem_64`
			`.hidden quot_rem_64`
			`.type quot_rem_64,\@function,3`
			`.align 32`
			`quot_rem_64:`
			`mov %rdx, %rax # return quotient`
			`imulq 0($divisor), %rdx # divisor[0] * quotient`

			`mov 0($div_rem), @tmp[0] # load 1 limb of the dividend`

			`sub %rdx, @tmp[0] # dividend - divisor * quotient`

			`mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ...`
			`mov %rax, 8($div_rem) # ... and 1 limb of the quotient`

			`ret`
			`.size quot_rem_64,.-quot_rem_64`
			`___`
			`}`

			`print $code;`
			`close STDOUT;`