#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# sha256_block procedure for x86_64.
#
# Scalar-only version with minor twist minimizing 'lea' instructions.

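# Usual perlasm invocation: an optional "flavour" (elf, macho, masm,
# mingw64, ...) plus the output file name; everything printed to STDOUT
# is piped through x86_64-xlate.pl, which translates the perlasm syntax
# to the requested assembler dialect.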
$flavour = shift;
$output  = pop;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

$pre="blst_";
$func="${pre}sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
                                "%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
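# Rotation/shift amounts from FIPS 180-4; the last element of @sigma0
# and @sigma1 is a plain right shift. The round code applies them as
# differences between consecutive amounts, so each Sigma/sigma becomes
# a chain of relative ror's combined with xor's.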
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

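# Stack frame: 16 $SZ-byte slots for the message schedule ring buffer
# X[0..15], followed by three 8-byte spill slots for ctx, inp and the
# end-of-input pointer.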
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$framesz="16*$SZ+3*8";

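# One round of the compression function. On entry $T1 holds the current
# message word (input word or expanded schedule word), $a0/$a1 hold
# copies of $e/$a for the Sigma computations, and $a3 holds b^c from the
# previous round, so Maj(a,b,c) folds into ((a^b)&(b^c))^b, a single
# and+xor per round.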
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
  # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	`$SZ*$i`($Tbl),$T1	# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1
___
$code.=<<___ if ($i==31);
	lea	`16*$SZ`($Tbl),$Tbl	# round+=16
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

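# Rounds 16..63 expand the schedule in place:
# X[i&0xf] gets W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
# read from the ring buffer at offsets (i+14), (i+9), (i+1) and i
# (mod 16), then the code falls through into an ordinary round.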
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

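# $func(ctx, inp, num) compresses num 64-byte blocks from inp into the
# eight 32-bit state words at ctx; a..h live entirely in registers and
# the message schedule in the 16-word stack ring buffer.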
$code=<<___;
.text

.globl	$func
.type	$func,\@function,3,"unwind"
.align	16
$func:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
.cfi_adjust_cfa_offset	$framesz
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
.cfi_end_prologue

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
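# The "magic": $a3 enters round 0 holding B^C, seeding the
# modulo-scheduled Maj; every round consumes the b^c its predecessor
# computed as a^b. unshift(@ROT,pop(@ROT)) rotates the register-name
# array so that the register just written as h is named a in the next
# round, replacing the customary a..h variable shuffle.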
for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));
}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
for(;$i<32;$i++) {
	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));
}

$code.=<<___;
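	# The K-table pointer advances 64 bytes per 16-round pass, so the
	# byte at offset 3 reads 0xe4, 0x27 and finally 0x19 (top byte of
	# K[48]=0x19a4c116) once rounds 48-63 have been processed.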
	cmpb	\$0x19,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp
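
	# plain mov doesn't modify EFLAGS, so the jb below still tests the
	# cmp above, scheduled ahead of the stores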
	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop
|
	lea	$framesz+6*8(%rsp),%r11
.cfi_def_cfa	%r11,8
	mov	$framesz(%rsp),%r15
.cfi_restore	%r15
	mov	-40(%r11),%r14
.cfi_restore	%r14
	mov	-32(%r11),%r13
.cfi_restore	%r13
	mov	-24(%r11),%r12
.cfi_restore	%r12
	mov	-16(%r11),%rbp
.cfi_restore	%rbp
	mov	-8(%r11),%rbx
.cfi_restore	%rbx
.cfi_epilogue
	lea	(%r11),%rsp
	ret
.cfi_endproc
.size	$func,.-$func

.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm"
___
{
my ($out,$inp,$len) = $win64 ? ("%rcx","%rdx","%r8") :	# Win64 order
                               ("%rdi","%rsi","%rdx");	# Unix order
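
# Helpers for the C glue, emitted as @abi-omnipotent presumably because
# they touch only argument and volatile scratch registers and no stack,
# so the translator needs to add no unwind/SEH boilerplate.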
$code.=<<___;
.globl	${pre}sha256_emit
.hidden	${pre}sha256_emit
.type	${pre}sha256_emit,\@abi-omnipotent
.align	16
${pre}sha256_emit:
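	# each 64-bit load grabs two 32-bit state words; bswap reverses all
	# eight bytes, swapping the two words as a side effect, so the
	# halves are stored crosswise (low half to the higher offset)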
	mov	0($inp), %r8
	mov	8($inp), %r9
	mov	16($inp), %r10
	bswap	%r8
	mov	24($inp), %r11
	bswap	%r9
	mov	%r8d, 4($out)
	bswap	%r10
	mov	%r9d, 12($out)
	bswap	%r11
	mov	%r10d, 20($out)
	shr	\$32, %r8
	mov	%r11d, 28($out)
	shr	\$32, %r9
	mov	%r8d, 0($out)
	shr	\$32, %r10
	mov	%r9d, 8($out)
	shr	\$32, %r11
	mov	%r10d, 16($out)
	mov	%r11d, 24($out)
	ret
.size	${pre}sha256_emit,.-${pre}sha256_emit
|
.globl	${pre}sha256_bcopy
.hidden	${pre}sha256_bcopy
.type	${pre}sha256_bcopy,\@abi-omnipotent
.align	16
${pre}sha256_bcopy:
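	# byte copy with a single induction variable: the destination is
	# pre-biased by minus the source, so base+index addressing hits the
	# destination while only the source pointer is incremented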
	sub	$inp, $out
.Loop_bcopy:
	movzb	($inp), %eax
	lea	1($inp), $inp
	mov	%al, -1($out,$inp)
	dec	$len
	jnz	.Loop_bcopy
	ret
.size	${pre}sha256_bcopy,.-${pre}sha256_bcopy
|
.globl	${pre}sha256_hcopy
.hidden	${pre}sha256_hcopy
.type	${pre}sha256_hcopy,\@abi-omnipotent
.align	16
${pre}sha256_hcopy:
	mov	0($inp), %r8
	mov	8($inp), %r9
	mov	16($inp), %r10
	mov	24($inp), %r11
	mov	%r8, 0($out)
	mov	%r9, 8($out)
	mov	%r10, 16($out)
	mov	%r11, 24($out)
	ret
.size	${pre}sha256_hcopy,.-${pre}sha256_hcopy
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;
	print $_,"\n";
}
close STDOUT;