#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# A quadratic residue test that is both constant-time and fast, as
# suggested in https://eprint.iacr.org/2020/972. Performance [on Cortex
# cores] is >12x better than a modulus-specific Legendre symbol addition
# chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
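# A minimal caller sketch (C; hedged, assuming blst's vec384 of six
# little-endian 64-bit limbs and an odd modulus, which the binary-GCD
# style algorithm requires):
#
#	vec384 x, m;	/* 384-bit values, least significant limb first */
#	...
#	if (ct_is_square_mod_384(x, m))
#		/* x is a quadratic residue modulo m */
#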
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change

    k = 30
    w = 32
    mask = (1 << w) - 1

    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)
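
        # Per https://eprint.iacr.org/2020/972, an approximation that
        # keeps the exact low 32 bits plus the top 32 bits of the longer
        # operand supports k=30 division steps: the truncated middle
        # bits cannot affect the low-bit branch decisions taken below.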

        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
                                    # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2  # if |b|%8 is 3 or 5 [out of 1,3,5,7]
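
        # The two |L| updates above are the standard Jacobi-symbol
        # transition rules: swapping two odd values flips the symbol iff
        # both are 3 mod 4 (second bits both set), and halving |a_|
        # flips it iff |b_| is 3 or 5 mod 8.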

        # __smul_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1   # |b| is always odd, the second bit
                                # tells the whole story

    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                L += (a & b) >> 1   # |a| and |b| are both odd, second bits
                                    # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2   # if |b|%8 is 3 or 5 [out of 1,3,5,7]

    return (L & 1) ^ 1
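
    # Illustrative values (not part of the generated code):
    #   ct_is_square_mod_384(4, 7) == 1   # 2^2 == 4 (mod 7)
    #   ct_is_square_mod_384(5, 7) == 0   # 5 is a non-residue mod 7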
___
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2));
my @acc=map("x$_",(3..14));
my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20));
my @t = map("x$_",(21..28));
my ($a_, $b_) = @acc[5,11];
$frame = 2*256;
$code.=<<___;
.text

.globl	ct_is_square_mod_384
.type	ct_is_square_mod_384, %function
.align	5
ct_is_square_mod_384:
	paciasp
	stp	x29, x30, [sp,#-128]!
	add	x29, sp, #0
	stp	x19, x20, [sp,#16]
	stp	x21, x22, [sp,#32]
	stp	x23, x24, [sp,#48]
	stp	x25, x26, [sp,#64]
	stp	x27, x28, [sp,#80]
	sub	sp, sp, #$frame

	ldp	@acc[0], @acc[1], [x0,#8*0]	// load input
	ldp	@acc[2], @acc[3], [x0,#8*2]
	ldp	@acc[4], @acc[5], [x0,#8*4]

	add	$in_ptr, sp, #255	// find closest 256-byte-aligned spot
	and	$in_ptr, $in_ptr, #-256	// in the frame...

	ldp	@acc[6], @acc[7], [x1,#8*0]	// load modulus
	ldp	@acc[8], @acc[9], [x1,#8*2]
	ldp	@acc[10], @acc[11], [x1,#8*4]

	stp	@acc[0], @acc[1], [$in_ptr,#8*6]	// copy input to |a|
	stp	@acc[2], @acc[3], [$in_ptr,#8*8]
	stp	@acc[4], @acc[5], [$in_ptr,#8*10]

	stp	@acc[6], @acc[7], [$in_ptr,#8*0]	// copy modulus to |b|
	stp	@acc[8], @acc[9], [$in_ptr,#8*2]
	stp	@acc[10], @acc[11], [$in_ptr,#8*4]
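
	// the aligned area splits into two 128-byte halves, each holding
	// |b| at #8*0 and |a| at #8*6; the eor with #128 below flip-flops
	// the source and destination pointers between the halves on every
	// pass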
	eor	$L, $L, $L	// init the Legendre symbol
	mov	$cnt, #24	// 24 is 768/30-1
	b	.Loop_is_square

.align	4
.Loop_is_square:
	bl	__ab_approximation_30
	sub	$cnt, $cnt, #1

	eor	$out_ptr, $in_ptr, #128	// pointer to dst |b|
	bl	__smul_384_n_shift_by_30

	mov	$f1, $f0	// |f0|
	mov	$g1, $g0	// |g0|
	add	$out_ptr, $out_ptr, #8*6	// pointer to dst |a|
	bl	__smul_384_n_shift_by_30

	ldp	@acc[6], @acc[7], [$out_ptr,#-8*6]
	eor	$in_ptr, $in_ptr, #128	// flip-flop src |a|b|
	and	@t[6], @t[6], @acc[6]	// if |a| was negative,
	add	$L, $L, @t[6], lsr#1	// adjust |L|

	cbnz	$cnt, .Loop_is_square

	////////////////////////////////////////// last iteration
	//bl	__ab_approximation_30	// |a| and |b| are exact,
	ldr	$a_, [$in_ptr,#8*6]	// just load |a|'s only limb
	mov	$b_, @acc[6]	// ldr	$b_, [$in_ptr,#8*0]
	mov	$cnt, #48	// 48 is 768%30 + 30
	bl	__inner_loop_48
	ldr	x30, [x29,#8]

	and	x0, $L, #1
	eor	x0, x0, #1

	add	sp, sp, #$frame
	ldp	x19, x20, [x29,#16]
	ldp	x21, x22, [x29,#32]
	ldp	x23, x24, [x29,#48]
	ldp	x25, x26, [x29,#64]
	ldp	x27, x28, [x29,#80]
	ldr	x29, [sp],#128
	autiasp
	ret
.size	ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smul_384_n_shift_by_30, %function
.align 5
__smul_384_n_shift_by_30:
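	// computes (|a|*|f1| + |b|*|g1|) >> 30 over the 384-bit |a| and |b|
	// at the source pointer, stores the absolute value of the result at
	// the destination pointer and leaves its sign mask in the scratch
	// register for the caller to adjust |L|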
___
for($j=0; $j<2; $j++) {
my $fx = $g1; $fx = $f1 if ($j);
my @acc = @acc; @acc = @acc[6..11] if ($j);
my $k = 8*6*$j;
$code.=<<___;
	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |b| (or |a|)
	asr	@t[6], $fx, #63	// |g1|'s sign as mask (or |f1|'s)
	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
	eor	$fx, $fx, @t[6]	// conditionally negate |g1| (or |f1|)
	ldp	@acc[4], @acc[5], [$in_ptr,#8*4+$k]

	eor	@acc[0], @acc[0], @t[6]	// conditionally negate |b| (or |a|)
	sub	$fx, $fx, @t[6]
	eor	@acc[1], @acc[1], @t[6]
	adds	@acc[0], @acc[0], @t[6], lsr#63
	eor	@acc[2], @acc[2], @t[6]
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], @t[6]
	adcs	@acc[2], @acc[2], xzr
	eor	@acc[4], @acc[4], @t[6]
	umulh	@t[0], @acc[0], $fx
	adcs	@acc[3], @acc[3], xzr
	umulh	@t[1], @acc[1], $fx
	eor	@acc[5], @acc[5], @t[6]
	umulh	@t[2], @acc[2], $fx
	adcs	@acc[4], @acc[4], xzr
	umulh	@t[3], @acc[3], $fx
	adc	@acc[5], @acc[5], xzr

	umulh	@t[4], @acc[4], $fx
	and	@t[7], $fx, @t[6]
	umulh	@t[5+$j], @acc[5], $fx
	neg	@t[7], @t[7]
	mul	@acc[0], @acc[0], $fx
	mul	@acc[1], @acc[1], $fx
	mul	@acc[2], @acc[2], $fx
	adds	@acc[1], @acc[1], @t[0]
	mul	@acc[3], @acc[3], $fx
	adcs	@acc[2], @acc[2], @t[1]
	mul	@acc[4], @acc[4], $fx
	adcs	@acc[3], @acc[3], @t[2]
	mul	@acc[5], @acc[5], $fx
	adcs	@acc[4], @acc[4], @t[3]
	adcs	@acc[5], @acc[5], @t[4]
	adc	@t[5+$j], @t[5+$j], @t[7]
___
}
$code.=<<___;
	adds	@acc[0], @acc[0], @acc[6]
	adcs	@acc[1], @acc[1], @acc[7]
	adcs	@acc[2], @acc[2], @acc[8]
	adcs	@acc[3], @acc[3], @acc[9]
	adcs	@acc[4], @acc[4], @acc[10]
	adcs	@acc[5], @acc[5], @acc[11]
	adc	@acc[6], @t[5], @t[6]
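
	// shift the 7-limb sum right by 30: each extr pulls the low 30
	// bits of the next-higher limb in at the top of the current one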
	extr	@acc[0], @acc[1], @acc[0], #30
	extr	@acc[1], @acc[2], @acc[1], #30
	extr	@acc[2], @acc[3], @acc[2], #30
	asr	@t[6], @acc[6], #63
	extr	@acc[3], @acc[4], @acc[3], #30
	extr	@acc[4], @acc[5], @acc[4], #30
	extr	@acc[5], @acc[6], @acc[5], #30

	eor	@acc[0], @acc[0], @t[6]
	eor	@acc[1], @acc[1], @t[6]
	adds	@acc[0], @acc[0], @t[6], lsr#63
	eor	@acc[2], @acc[2], @t[6]
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], @t[6]
	adcs	@acc[2], @acc[2], xzr
	eor	@acc[4], @acc[4], @t[6]
	adcs	@acc[3], @acc[3], xzr
	eor	@acc[5], @acc[5], @t[6]
	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
	adcs	@acc[4], @acc[4], xzr
	stp	@acc[2], @acc[3], [$out_ptr,#8*2]
	adc	@acc[5], @acc[5], xzr
	stp	@acc[4], @acc[5], [$out_ptr,#8*4]
	ret
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
___
{
my @a = @acc[0..5];
my @b = @acc[6..11];
my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]);
$code.=<<___;
.type __ab_approximation_30, %function
.align 4
__ab_approximation_30:
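	// distills a 64-bit approximation of |a| and |b| into |a_| and |b_|:
	// the exact low 32 bits combined with the top 32 bits taken at the
	// highest non-zero limb pair, then falls through to __inner_loop_30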
	ldp	@b[4], @b[5], [$in_ptr,#8*4]	// |a| is still in registers
	ldp	@b[2], @b[3], [$in_ptr,#8*2]

	orr	@t[0], @a[5], @b[5]	// check top-most limbs, ...
	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[3], ne
	orr	@t[0], @a[5], @b[5]	// ... ones before top-most, ...
	csel	@b[4], @b[4], @b[3], ne

	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[2], ne
	orr	@t[0], @a[5], @b[5]	// ... and ones before that ...
	csel	@b[4], @b[4], @b[2], ne

	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[1], ne
	orr	@t[0], @a[5], @b[5]	// and one more, ...
	csel	@b[4], @b[4], @b[1], ne

	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[0], ne
	orr	@t[0], @a[5], @b[5]
	csel	@b[4], @b[4], @b[0], ne

	clz	@t[0], @t[0]
	cmp	@t[0], #64
	csel	@t[0], @t[0], xzr, ne
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	neg	@t[1], @t[0]

	lslv	@a[5], @a[5], @t[0]	// align high limbs to the left
	lslv	@b[5], @b[5], @t[0]
	lsrv	@a[4], @a[4], @t[1]
	lsrv	@b[4], @b[4], @t[1]
	and	@a[4], @a[4], @t[1], asr#6
	and	@b[4], @b[4], @t[1], asr#6
	orr	$a_, @a[5], @a[4]
	orr	$b_, @b[5], @b[4]

	bfxil	$a_, @a[0], #0, #32
	bfxil	$b_, @b[0], #0, #32

	b	__inner_loop_30
	ret
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30, %function
.align 4
__inner_loop_30:
	mov	$cnt, #30
	mov	$fg0, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
	mov	$fg1, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
	mov	$bias, #0x7FFFFFFF7FFFFFFF
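
	// each of the two packed registers holds two signed 32-bit cofactors
	// biased by 0x7FFFFFFF; the bias keeps both halves non-negative, so
	// a single 64-bit add/sub updates |f| and |g| at once with no borrow
	// crossing the 32-bit boundary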

.Loop_30:
	sbfx	@t[3], $a_, #0, #1	// if |a_| is odd, then we'll be subtracting
	and	@t[4], $a_, $b_
	sub	$cnt, $cnt, #1
	and	@t[0], $b_, @t[3]

	sub	@t[1], $b_, $a_	// |b_|-|a_|
	subs	@t[2], $a_, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	add	@t[4], $L, @t[4], lsr#1	// L + (a_ & b_) >> 1
	mov	@t[0], $fg1
	csel	$b_, $b_, $a_, hs	// |b_| = |a_|
	csel	$a_, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	$fg1, $fg1, $fg0, hs	// exchange |fg0| and |fg1|
	csel	$fg0, $fg0, @t[0], hs
	csel	$L, $L, @t[4], hs

	lsr	$a_, $a_, #1
	and	@t[0], $fg1, @t[3]
	and	@t[1], $bias, @t[3]
	add	@t[2], $b_, #2
	sub	$fg0, $fg0, @t[0]	// |f0|-=|f1|, |g0|-=|g1| (or -=0 if |a_| was even)
	add	$fg1, $fg1, $fg1	// |f1|<<=1, |g1|<<=1
	add	$L, $L, @t[2], lsr#2	// "negate" |L| if |b|%8 is 3 or 5
	add	$fg0, $fg0, @t[1]
	sub	$fg1, $fg1, $bias	// restore the bias after doubling

	cbnz	$cnt, .Loop_30

	mov	$bias, #0x7FFFFFFF
	ubfx	$f0, $fg0, #0, #32
	ubfx	$g0, $fg0, #32, #32
	ubfx	$f1, $fg1, #0, #32
	ubfx	$g1, $fg1, #32, #32
	sub	$f0, $f0, $bias	// remove the bias
	sub	$g0, $g0, $bias
	sub	$f1, $f1, $bias
	sub	$g1, $g1, $bias
	ret
.size __inner_loop_30,.-__inner_loop_30
___
}
$code.=<<___;
.type __inner_loop_48, %function
.align 4
__inner_loop_48:
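	// same division steps as __inner_loop_30, but on exact values: after
	// 24 outer iterations only 768-24*30 = 48 bits remain, so |a| and
	// |b| fit in single registers and no |f|/|g| bookkeeping or final
	// multiplication is needed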
.Loop_48:
	sbfx	@t[3], $a_, #0, #1	// if |a_| is odd, then we'll be subtracting
	and	@t[4], $a_, $b_
	sub	$cnt, $cnt, #1
	and	@t[0], $b_, @t[3]

	sub	@t[1], $b_, $a_	// |b_|-|a_|
	subs	@t[2], $a_, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	add	@t[4], $L, @t[4], lsr#1
	csel	$b_, $b_, $a_, hs	// |b_| = |a_|
	csel	$a_, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	$L, $L, @t[4], hs

	add	@t[2], $b_, #2
	lsr	$a_, $a_, #1
	add	$L, $L, @t[2], lsr#2	// "negate" |L| if |b|%8 is 3 or 5

	cbnz	$cnt, .Loop_48
	ret
.size __inner_loop_48,.-__inner_loop_48
___
print $code;
close STDOUT;