#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Quadratic residue test that is both constant-time and fast, as suggested
# in https://eprint.iacr.org/2020/972. Performance is >5x better than a
# modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change

    k = 30
    w = 32
    mask = (1 << w) - 1

    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                    L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
                                        # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2          # if |b|%8 is 3 or 5 [out of 1,3,5,7]

        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1   # |b| is always odd, the second bit
                                # tells the whole story

    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                    L += (a & b) >> 1   # |a| and |b| are both odd, second bits
                                        # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2           # if |b|%8 is 3 or 5 [out of 1,3,5,7]

    return (L & 1) ^ 1
___

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

my ($out_ptr, $in_ptr) = ("%rdi", "%rsi");
my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx", "%rcx");
my @acc = map("%r$_",(8..15));
my $L = "%rbp";

$frame = 8*3+2*256;

$code.=<<___;
.text

.globl	ct_is_square_mod_384
.type	ct_is_square_mod_384,\@function,2,"unwind"
.align	32
ct_is_square_mod_384:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$$frame, %rsp
.cfi_adjust_cfa_offset	$frame
.cfi_end_prologue

	lea	8*3+255(%rsp), %rax	# find closest 256-byte-aligned spot
	and	\$-256, %rax		# in the frame...
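
	# The 2*256 bytes of scratch space hold two 128-byte halves, each
	# with a 6-limb |a| at offset 0 and a 6-limb |b| at offset 8*6.
	# Thanks to the 256-byte alignment the main loop flip-flops between
	# the halves by xor-ing 128 into the source pointer, so that each
	# iteration reads one half and writes the other.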
	mov	8*0(%rdi), @acc[0]	# load input
	mov	8*1(%rdi), @acc[1]
	mov	8*2(%rdi), @acc[2]
	mov	8*3(%rdi), @acc[3]
	mov	8*4(%rdi), @acc[4]
	mov	8*5(%rdi), @acc[5]

	mov	8*0(%rsi), @acc[6]	# load modulus
	mov	8*1(%rsi), @acc[7]
	mov	8*2(%rsi), %rbx
	mov	8*3(%rsi), %rcx
	mov	8*4(%rsi), %rdx
	mov	8*5(%rsi), %rdi
	mov	%rax, $in_ptr		# pointer to source |a|b|

	mov	@acc[0], 8*0(%rax)	# copy input to |a|
	mov	@acc[1], 8*1(%rax)
	mov	@acc[2], 8*2(%rax)
	mov	@acc[3], 8*3(%rax)
	mov	@acc[4], 8*4(%rax)
	mov	@acc[5], 8*5(%rax)

	mov	@acc[6], 8*6(%rax)	# copy modulus to |b|
	mov	@acc[7], 8*7(%rax)
	mov	%rbx, 8*8(%rax)
	mov	%rcx, 8*9(%rax)
	mov	%rdx, 8*10(%rax)
	mov	%rdi, 8*11(%rax)

	xor	$L, $L			# initialize the Legendre symbol
	mov	\$24, %ecx		# 24 is 768/30-1
	jmp	.Loop_is_square

.align	32
.Loop_is_square:
	mov	%ecx, 8*2(%rsp)		# offload loop counter

	call	__ab_approximation_30
	mov	$f0, 8*0(%rsp)		# offload |f0| and |g0|
	mov	$g0, 8*1(%rsp)

	mov	\$128+8*6, $out_ptr
	xor	$in_ptr, $out_ptr	# pointer to destination |b|
	call	__smulq_384_n_shift_by_30

	mov	8*0(%rsp), $f1		# pop |f0| and |g0|
	mov	8*1(%rsp), $g1
	lea	-8*6($out_ptr), $out_ptr # pointer to destination |a|
	call	__smulq_384_n_shift_by_30

	mov	8*2(%rsp), %ecx		# re-load loop counter
	xor	\$128, $in_ptr		# flip-flop pointer to source |a|b|

	and	8*6($out_ptr), @acc[6]	# if |a| was negative, adjust |L|
	shr	\$1, @acc[6]
	add	@acc[6], $L

	sub	\$1, %ecx
	jnz	.Loop_is_square

	################################# last iteration
	#call	__ab_approximation_30	# |a| and |b| are exact, just load
	#mov	8*0($in_ptr), @acc[0]	# |a_|
	mov	8*6($in_ptr), @acc[1]	# |b_|
	call	__inner_loop_48		# 48 is 768%30+30

	mov	\$1, %rax
	and	$L, %rax
	xor	\$1, %rax		# return value

	lea	$frame(%rsp), %r8	# size optimization
	mov	8*0(%r8), %r15
.cfi_restore	%r15
	mov	8*1(%r8), %r14
.cfi_restore	%r14
	mov	8*2(%r8), %r13
.cfi_restore	%r13
	mov	8*3(%r8), %r12
.cfi_restore	%r12
	mov	8*4(%r8), %rbx
.cfi_restore	%rbx
	mov	8*5(%r8), %rbp
.cfi_restore	%rbp
	lea	8*6(%r8), %rsp
.cfi_adjust_cfa_offset	-$frame-8*6
.cfi_epilogue
	ret
.cfi_endproc
.size	ct_is_square_mod_384,.-ct_is_square_mod_384

.type	__smulq_384_n_shift_by_30,\@abi-omnipotent
.align	32
__smulq_384_n_shift_by_30:
___
for($j=0; $j<2; $j++) {
$code.=<<___;
	mov	8*0($in_ptr), @acc[0]	# load |a| (or |b|)
	mov	8*1($in_ptr), @acc[1]
	mov	8*2($in_ptr), @acc[2]
	mov	8*3($in_ptr), @acc[3]
	mov	8*4($in_ptr), @acc[4]
	mov	8*5($in_ptr), @acc[5]

	mov	%rdx, %rbx		# |f1| (or |g1|)
	sar	\$63, %rdx		# |f1|'s sign as mask (or |g1|'s)
	xor	%rax, %rax
	sub	%rdx, %rax		# |f1|'s sign as bit (or |g1|'s)

	xor	%rdx, %rbx		# conditionally negate |f1| (or |g1|)
	add	%rax, %rbx

	xor	%rdx, @acc[0]		# conditionally negate |a| (or |b|)
	xor	%rdx, @acc[1]
	xor	%rdx, @acc[2]
	xor	%rdx, @acc[3]
	xor	%rdx, @acc[4]
	xor	%rdx, @acc[5]
	add	@acc[0], %rax
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, @acc[5]

	mov	%rdx, @acc[6+$j]
	and	%rbx, @acc[6+$j]
	mulq	%rbx			# |a|*|f1| (or |b|*|g1|)
	mov	%rax, @acc[0]
	mov	@acc[1], %rax
	mov	%rdx, @acc[1]
___
for($i=1; $i<5; $i++) {
$code.=<<___;
	mulq	%rbx
	add	%rax, @acc[$i]
	mov	@acc[$i+1], %rax
	adc	\$0, %rdx
	mov	%rdx, @acc[$i+1]
___
}
$code.=<<___;
	neg	@acc[6+$j]
	mulq	%rbx
	add	%rax, @acc[5]
	adc	%rdx, @acc[6+$j]
___
$code.=<<___	if ($j==0);
	lea	8*6($in_ptr), $in_ptr	# pointer to |b|
	mov	$g1, %rdx

	mov	@acc[0], 8*0($out_ptr)
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	@acc[5], 8*5($out_ptr)
___
}
$code.=<<___;
	lea	-8*6($in_ptr), $in_ptr	# restore original in_ptr
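
	# The first pass above stored the low limbs of one 7-limb signed
	# product at out_ptr, and the second pass left the other in the
	# accumulator registers. What follows adds the two products, shifts
	# the sum right by 30 bits with a shrd cascade, and stores back the
	# absolute value. The sign mask is left in the top accumulator
	# register; after the |a| call the caller combines it with the new
	# |b| to implement "if a < 0: L += (b % 4) >> 1" from the reference
	# above.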
	add	8*0($out_ptr), @acc[0]
	adc	8*1($out_ptr), @acc[1]
	adc	8*2($out_ptr), @acc[2]
	adc	8*3($out_ptr), @acc[3]
	adc	8*4($out_ptr), @acc[4]
	adc	8*5($out_ptr), @acc[5]
	adc	@acc[7], @acc[6]

	shrd	\$30, @acc[1], @acc[0]
	shrd	\$30, @acc[2], @acc[1]
	shrd	\$30, @acc[3], @acc[2]
	shrd	\$30, @acc[4], @acc[3]
	shrd	\$30, @acc[5], @acc[4]
	shrd	\$30, @acc[6], @acc[5]

	sar	\$63, @acc[6]		# sign as mask
	xor	%rbx, %rbx
	sub	@acc[6], %rbx		# sign as bit

	xor	@acc[6], @acc[0]	# conditionally negate the result
	xor	@acc[6], @acc[1]
	xor	@acc[6], @acc[2]
	xor	@acc[6], @acc[3]
	xor	@acc[6], @acc[4]
	xor	@acc[6], @acc[5]
	add	%rbx, @acc[0]
	adc	\$0, @acc[1]
	adc	\$0, @acc[2]
	adc	\$0, @acc[3]
	adc	\$0, @acc[4]
	adc	\$0, @acc[5]

	mov	@acc[0], 8*0($out_ptr)
	mov	@acc[1], 8*1($out_ptr)
	mov	@acc[2], 8*2($out_ptr)
	mov	@acc[3], 8*3($out_ptr)
	mov	@acc[4], 8*4($out_ptr)
	mov	@acc[5], 8*5($out_ptr)

	ret
.size	__smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
___
{
my ($a_, $b_) = @acc[0..1];
my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15));
my ($fg0, $fg1, $bias) = ($g0, $g1, $t5);
my $cnt = "%edi";
{
my @a = @acc[0..5];
my @b = (@a[1..3], $t4, $t5, $g0);

$code.=<<___;
.type	__ab_approximation_30,\@abi-omnipotent
.align	32
__ab_approximation_30:
	mov	8*11($in_ptr), @b[5]	# load |b| in reverse order
	mov	8*10($in_ptr), @b[4]
	mov	8*9($in_ptr), @b[3]

	mov	@a[5], %rax
	or	@b[5], %rax		# check top-most limbs, ...
	cmovz	@a[4], @a[5]
	cmovz	@b[4], @b[5]
	cmovz	@a[3], @a[4]
	mov	8*8($in_ptr), @b[2]
	cmovz	@b[3], @b[4]

	mov	@a[5], %rax
	or	@b[5], %rax		# ... ones before top-most, ...
	cmovz	@a[4], @a[5]
	cmovz	@b[4], @b[5]
	cmovz	@a[2], @a[4]
	mov	8*7($in_ptr), @b[1]
	cmovz	@b[2], @b[4]

	mov	@a[5], %rax
	or	@b[5], %rax		# ... and ones before that ...
	cmovz	@a[4], @a[5]
	cmovz	@b[4], @b[5]
	cmovz	@a[1], @a[4]
	mov	8*6($in_ptr), @b[0]
	cmovz	@b[1], @b[4]

	mov	@a[5], %rax
	or	@b[5], %rax		# ... and ones before that ...
	cmovz	@a[4], @a[5]
	cmovz	@b[4], @b[5]
	cmovz	@a[0], @a[4]
	cmovz	@b[0], @b[4]
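
	# At this point the top-most non-zero limb pair has been funneled
	# into the two highest registers. What remains is to left-align it:
	# bsr finds the joint bit length, shldq shifts the top bits flush
	# left, and the 0xFFFFFFFF00000000 mask keeps their 32 most
	# significant bits, which are then combined with the 32 least
	# significant bits of the full values. This matches the reference's
	# (x & mask) | ((x >> (n-w)) << w) approximation.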
	mov	@a[5], %rax
	or	@b[5], %rax
	bsr	%rax, %rcx
	lea	1(%rcx), %rcx
	cmovz	@a[0], @a[5]
	cmovz	@b[0], @b[5]
	cmovz	%rax, %rcx
	neg	%rcx
	#and	\$63, %rcx		# debugging artefact

	shldq	%cl, @a[4], @a[5]	# align second limb to the left
	shldq	%cl, @b[4], @b[5]

	mov	\$0xFFFFFFFF00000000, %rax
	mov	@a[0]d, ${a_}d
	mov	@b[0]d, ${b_}d
	and	%rax, @a[5]
	and	%rax, @b[5]
	or	@a[5], ${a_}
	or	@b[5], ${b_}

	jmp	__inner_loop_30

	ret
.size	__ab_approximation_30,.-__ab_approximation_30
___
}
$code.=<<___;
.type	__inner_loop_30,\@abi-omnipotent
.align	32
__inner_loop_30:		################# by Thomas Pornin
	mov	\$0x7FFFFFFF80000000, $fg0	# |f0|=1, |g0|=0
	mov	\$0x800000007FFFFFFF, $fg1	# |f1|=0, |g1|=1
	lea	-1($fg0), $bias			# 0x7FFFFFFF7FFFFFFF
	mov	\$30, $cnt

.Loop_30:
	mov	$a_, %rax
	and	$b_, %rax
	shr	\$1, %rax		# (a_ & b_) >> 1

	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
	mov	$a_, $t0
	mov	$b_, $t1
	lea	(%rax,$L), %rax		# pre-"negate" |L|
	mov	$fg0, $t2
	mov	$fg1, $t3
	mov	$L, $t4
	cmovb	$b_, $a_
	cmovb	$t0, $b_
	cmovb	$fg1, $fg0
	cmovb	$t2, $fg1
	cmovb	%rax, $L

	sub	$b_, $a_		# |a_|-|b_|
	sub	$fg1, $fg0		# |f0|-|f1|, |g0|-|g1|
	add	$bias, $fg0

	test	\$1, $t0		# if |a_| was even, roll back
	cmovz	$t0, $a_
	cmovz	$t1, $b_
	cmovz	$t2, $fg0
	cmovz	$t3, $fg1
	cmovz	$t4, $L

	lea	2($b_), %rax
	shr	\$1, $a_		# |a_|>>=1
	shr	\$2, %rax
	add	$fg1, $fg1		# |f1|<<=1, |g1|<<=1
	lea	(%rax,$L), $L		# "negate" |L| if |b|%8 is 3 or 5
	sub	$bias, $fg1

	sub	\$1, $cnt
	jnz	.Loop_30

	shr	\$32, $bias
	mov	%ebx, %eax		# $fg0 -> $f0
	shr	\$32, $g0
	mov	%ecx, %edx		# $fg1 -> $f1
	shr	\$32, $g1

	sub	$bias, $f0		# remove the bias
	sub	$bias, $g0
	sub	$bias, $f1
	sub	$bias, $g1

	ret
.size	__inner_loop_30,.-__inner_loop_30

.type	__inner_loop_48,\@abi-omnipotent
.align	32
__inner_loop_48:
	mov	\$48, $cnt		# 48 is 768%30+30

.Loop_48:
	mov	$a_, %rax
	and	$b_, %rax
	shr	\$1, %rax		# (a_ & b_) >> 1

	cmp	$b_, $a_		# if |a_|<|b_|, swap the variables
	mov	$a_, $t0
	mov	$b_, $t1
	lea	(%rax,$L), %rax
	mov	$L, $t2
	cmovb	$b_, $a_
	cmovb	$t0, $b_
	cmovb	%rax, $L

	sub	$b_, $a_		# |a_|-|b_|

	test	\$1, $t0		# if |a_| was even, roll back
	cmovz	$t0, $a_
	cmovz	$t1, $b_
	cmovz	$t2, $L

	lea	2($b_), %rax
	shr	\$1, $a_		# |a_|>>=1
	shr	\$2, %rax
	add	%rax, $L		# "negate" |L| if |b|%8 is 3 or 5

	sub	\$1, $cnt
	jnz	.Loop_48

	ret
.size	__inner_loop_48,.-__inner_loop_48
___
}

print $code;
close STDOUT;
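
# Sanity-check sketch for the Python reference above (illustrative only;
# euler_is_square is a hypothetical helper, not part of this file). For an
# odd modulus and gcd(inp, mod) == 1 the result should agree with Euler's
# criterion, inp^((mod-1)/2) == 1 (mod mod):
#
#   def euler_is_square(inp, mod):
#       return int(pow(inp, (mod - 1) // 2, mod) == 1)
#
#   assert ct_is_square_mod_384(4, 13) == euler_is_square(4, 13)  # 4 = 2^2
#   assert ct_is_square_mod_384(5, 13) == euler_is_square(5, 13)  # non-residue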