#!/usr/bin/env perl # # Copyright Supranational LLC # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # # ==================================================================== # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL # project. # ==================================================================== # # sha256_block procedure for ARMv8. # # This module is stripped of scalar code paths, with raionale that all # known processors are NEON-capable. # # See original module at CRYPTOGAMS for further details. $flavour = shift; $output = shift; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } $BITS=256; $SZ=4; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; $reg_t="w"; $pre="blst_"; ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); $code.=<<___; .text .align 6 .type .LK$BITS,%object .LK$BITS: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK$BITS,.-.LK$BITS .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" .align 2 ___ if ($SZ==4) { my $Ktbl="x3"; my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); my @MSG=map("v$_.16b",(4..7)); my ($W0,$W1)=("v16.4s","v17.4s"); my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); $code.=<<___; .globl ${pre}sha256_block_armv8 .type ${pre}sha256_block_armv8,%function .align 6 ${pre}sha256_block_armv8: .Lv8_entry: stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 sub $num,$num,#1 ld1.32 {$W0},[$Ktbl],#16 rev32 @MSG[0],@MSG[0] rev32 @MSG[1],@MSG[1] rev32 @MSG[2],@MSG[2] rev32 @MSG[3],@MSG[3] orr $ABCD_SAVE,$ABCD,$ABCD // offload orr $EFGH_SAVE,$EFGH,$EFGH ___ for($i=0;$i<12;$i++) { $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] sha256su0 @MSG[0],@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 sha256su1 @MSG[0],@MSG[2],@MSG[3] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); } $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 ld1.32 {$W0},[$Ktbl],#16 add.i32 $W1,$W1,@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 ld1.32 {$W1},[$Ktbl] add.i32 $W0,$W0,@MSG[2] sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 add.i32 $W1,$W1,@MSG[3] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 add.i32 $ABCD,$ABCD,$ABCD_SAVE add.i32 $EFGH,$EFGH,$EFGH_SAVE cbnz $num,.Loop_hw st1.32 {$ABCD,$EFGH},[$ctx] ldr x29,[sp],#16 ret .size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 ___ } if ($SZ==4) { ######################################### NEON stuff # # You'll surely note a lot of similarities with sha256-armv4 module, # and of course it's not a coincidence. sha256-armv4 was used as # initial template, but was adapted for ARMv8 instruction set and # extensively re-tuned for all-round performance. my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); my $Ktbl="x16"; my $Xfer="x17"; my @X = map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); my $j=0; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); &ushr_32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] eval(shift(@insns)); &sli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T4,$T7,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T4,$T7,32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T5,$T7,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T7,$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_u32 ($T3,$T7,32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T6,@X[0],$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T7,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T6,@X[0],32-$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T5,@X[0],$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T7,$T7,$T6); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T5,@X[0],32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl], #16"); eval(shift(@insns)); &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T5); eval(shift(@insns)); eval(shift(@insns)); &mov (&Dhi($T5), &Dlo($T7)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); while($#insns>=1) { eval(shift(@insns)); } &st1_32 ("{$T0}","[$Xfer], #16"); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); &ld1_8 ("{@X[0]}","[$inp],#16"); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl],#16"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &rev32 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &st1_32 ("{$T0}","[$Xfer], #16"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. '&add ($h,$h,$t1)', # h+=X[i]+K[i] '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past '&and ($t1,$f,$e)', '&bic ($t4,$g,$e)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&orr ($t1,$t1,$t4)', # Ch(e,f,g) '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ror ($t0,$t0,"#$Sigma1[0]")', '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t0)', # h+=Sigma1(e) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&ror ($t4,$t4,"#$Sigma0[0]")', '&add ($d,$d,$h)', # d+=h '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; .globl ${pre}sha256_block_data_order .type ${pre}sha256_block_data_order,%function .align 4 ${pre}sha256_block_data_order: stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 adr $Ktbl,.LK256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 ld1.8 {@X[1]},[$inp], #16 ld1.8 {@X[2]},[$inp], #16 ld1.8 {@X[3]},[$inp], #16 ld1.32 {$T0},[$Ktbl], #16 ld1.32 {$T1},[$Ktbl], #16 ld1.32 {$T2},[$Ktbl], #16 ld1.32 {$T3},[$Ktbl], #16 rev32 @X[0],@X[0] // yes, even on rev32 @X[1],@X[1] // big-endian rev32 @X[2],@X[2] rev32 @X[3],@X[3] mov $Xfer,sp add.32 $T0,$T0,@X[0] add.32 $T1,$T1,@X[1] add.32 $T2,$T2,@X[2] st1.32 {$T0-$T1},[$Xfer], #32 add.32 $T3,$T3,@X[3] st1.32 {$T2-$T3},[$Xfer] sub $Xfer,$Xfer,#32 ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] ldp $E,$F,[$ctx,#16] ldp $G,$H,[$ctx,#24] ldr $t1,[sp,#0] mov $t2,wzr eor $t3,$B,$C mov $t4,wzr b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; cmp $t1,#0 // check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 sub $Ktbl,$Ktbl,#256 // rewind $Ktbl cmp $inp,$num mov $Xfer, #64 csel $Xfer, $Xfer, xzr, eq sub $inp,$inp,$Xfer // avoid SEGV mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; add $A,$A,$t4 // h+=Sigma0(a) from the past ldp $t0,$t1,[$ctx,#0] add $A,$A,$t2 // h+=Maj(a,b,c) from the past ldp $t2,$t3,[$ctx,#8] add $A,$A,$t0 // accumulate add $B,$B,$t1 ldp $t0,$t1,[$ctx,#16] add $C,$C,$t2 add $D,$D,$t3 ldp $t2,$t3,[$ctx,#24] add $E,$E,$t0 add $F,$F,$t1 ldr $t1,[sp,#0] stp $A,$B,[$ctx,#0] add $G,$G,$t2 mov $t2,wzr stp $C,$D,[$ctx,#8] add $H,$H,$t3 stp $E,$F,[$ctx,#16] eor $t3,$B,$C stp $G,$H,[$ctx,#24] mov $t4,wzr mov $Xfer,sp b.ne .L_00_48 ldr x29,[x29] add sp,sp,#16*4+16 ret .size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order ___ } { my ($out,$inp,$len) = map("x$_",(0..2)); $code.=<<___; .globl ${pre}sha256_emit .hidden ${pre}sha256_emit .type ${pre}sha256_emit,%function .align 4 ${pre}sha256_emit: ldp x4,x5,[$inp] ldp x6,x7,[$inp,#16] #ifndef __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 #endif str w4,[$out,#4] lsr x4,x4,#32 str w5,[$out,#12] lsr x5,x5,#32 str w6,[$out,#20] lsr x6,x6,#32 str w7,[$out,#28] lsr x7,x7,#32 str w4,[$out,#0] str w5,[$out,#8] str w6,[$out,#16] str w7,[$out,#24] ret .size ${pre}sha256_emit,.-${pre}sha256_emit .globl ${pre}sha256_bcopy .hidden ${pre}sha256_bcopy .type ${pre}sha256_bcopy,%function .align 4 ${pre}sha256_bcopy: .Loop_bcopy: ldrb w3,[$inp],#1 sub $len,$len,#1 strb w3,[$out],#1 cbnz $len,.Loop_bcopy ret .size ${pre}sha256_bcopy,.-${pre}sha256_bcopy .globl ${pre}sha256_hcopy .hidden ${pre}sha256_hcopy .type ${pre}sha256_hcopy,%function .align 4 ${pre}sha256_hcopy: ldp x4,x5,[$inp] ldp x6,x7,[$inp,#16] stp x4,x5,[$out] stp x6,x7,[$out,#16] ret .size ${pre}sha256_hcopy,.-${pre}sha256_hcopy ___ } { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); sub unsha256 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } open SELF,$0; while() { next if (/^#!/); last if (!s/^#/\/\// and !/^$/); print; } close SELF; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.[ui]?8(\s)/$1/; s/\.\w?64\b// and s/\.16b/\.2d/g or s/\.\w?32\b// and s/\.16b/\.4s/g; m/\bext\b/ and s/\.2d/\.16b/g or m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } close STDOUT;