ftu/blst/elf/sha256-armv8.S

1078 lines
22 KiB
ArmAsm
Raw Normal View History

2022-09-09 06:47:49 +00:00
//
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// ====================================================================
// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
// project.
// ====================================================================
//
// sha256_block procedure for ARMv8.
//
// This module is stripped of scalar code paths, with rationale that all
// known processors are NEON-capable.
//
// See original module at CRYPTOGAMS for further details.
.text
// .LK256: table of 64 32-bit constants (16 rows of 4), followed by a single
// zero word. Both block functions in this file take its address with adr,
// which is why it lives in .text next to the code.
// .align 6 requests 2^6 = 64-byte alignment for the table.
.align 6
.type .LK256,%object
.LK256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
// The zero word below is a sentinel: blst_sha256_block_data_order loads the
// word at its K pointer and compares it with 0 to detect the end of the table.
.long 0 //terminator
.size .LK256,.-.LK256
// NUL-terminated ASCII identification string:
// "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm"
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
// Restore 4-byte (2^2) alignment after the odd-length string so that the
// code that follows is instruction-aligned. The directive is repeated; the
// second occurrence has no further effect.
.align 2
.align 2
// --------------------------------------------------------------------
// blst_sha256_block_armv8
//
// Block transform built on the SHA256H/SHA256H2/SHA256SU0/SHA256SU1
// instructions (emitted as raw .inst words; the intended mnemonic is given
// in the comment on each line).
// NOTE(review): raw encodings are presumably used so the file assembles
// with toolchains lacking crypto-extension support -- confirm.
//
// In:    x0 = pointer to the 8-word (32-byte) state; read into v0/v1 on
//             entry and written back from v0/v1 on exit
//        x1 = input pointer; 64 bytes are consumed per iteration
//        x2 = number of 64-byte blocks. Must be non-zero: the counter is
//             decremented before it is tested, so 0 would wrap around.
// Uses:  x3 (pointer into .LK256), v0-v2, v4-v7, v16-v19
// Stack: one 16-byte frame holding x29/x30
// --------------------------------------------------------------------
.globl blst_sha256_block_armv8
.type blst_sha256_block_armv8,%function
.align 6
blst_sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
// v0 = state words 0-3, v1 = state words 4-7
ld1 {v0.4s,v1.4s},[x0]
adr x3,.LK256
// One iteration per 64-byte input block.
.Loop_hw:
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
sub x2,x2,#1
ld1 {v16.4s},[x3],#16
// Byte-swap every 32-bit lane of the freshly loaded message words.
rev32 v4.16b,v4.16b
rev32 v5.16b,v5.16b
rev32 v6.16b,v6.16b
rev32 v7.16b,v7.16b
// Keep a copy of the incoming state; it is added back after the rounds.
orr v18.16b,v0.16b,v0.16b // offload
orr v19.16b,v1.16b,v1.16b
// Twelve 4-round groups follow. Each group: fetch the next 4 constants,
// add the current schedule vector to the constants fetched earlier, start
// the schedule update (su0), copy v0 to v2 because sha256h overwrites v0
// while sha256h2 still needs the old value, run the two hash instructions,
// then finish the schedule update (su1). v16 and v17 alternate as the
// constant+message operand.
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v4.4s
.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v5.4s
.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v6.4s
.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v7.4s
.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v4.4s
.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v5.4s
.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v6.4s
.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v7.4s
.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v4.4s
.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v5.4s
.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v6.4s
.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v7.4s
.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
// Last four groups: the schedule is complete, so no su0/su1 is issued.
ld1 {v17.4s},[x3],#16
add v16.4s,v16.4s,v4.4s
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
ld1 {v16.4s},[x3],#16
add v17.4s,v17.4s,v5.4s
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
// Final constant row is loaded without post-increment; together with the
// 15 post-incremented loads above, x3 is 64*4-16 bytes past .LK256 here.
ld1 {v17.4s},[x3]
add v16.4s,v16.4s,v6.4s
sub x3,x3,#64*4-16 // rewind
orr v2.16b,v0.16b,v0.16b
.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
add v17.4s,v17.4s,v7.4s
orr v2.16b,v0.16b,v0.16b
.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
// Add the state saved at the top of the iteration back in.
add v0.4s,v0.4s,v18.4s
add v1.4s,v1.4s,v19.4s
cbnz x2,.Loop_hw
st1 {v0.4s,v1.4s},[x0]
// Pop the frame. Only x29 is reloaded; x30 is never modified in this
// function, so the return address is still intact.
ldr x29,[sp],#16
ret
.size blst_sha256_block_armv8,.-blst_sha256_block_armv8
// --------------------------------------------------------------------
// blst_sha256_block_data_order
//
// Block transform that interleaves NEON message-schedule computation with
// scalar round computation. No SHA-specific instructions are used here.
//
// In:    x0 = pointer to the 8-word (32-byte) state; read with ldp at
//             entry and updated in place after every block
//        x1 = input pointer
//        x2 = number of 64-byte blocks; converted on entry into a pointer
//             to the end of the input (x1 + x2*64)
//             NOTE(review): the first block is loaded before x2 is ever
//             examined, so a count of 0 looks unsupported -- confirm
//             against callers.
// Uses:  x16 = pointer into .LK256
//        x17 = pointer into the 64-byte stack scratch area
//        w3-w10 = the eight working variables (their roles rotate from
//                 round to round instead of the values being moved)
//        w11-w15 = round temporaries; w12 also receives the next
//                  schedule+constant word loaded from the scratch area
//        v0-v3 = current 16 schedule words, v4-v7 and v16-v19 temporaries
// Stack: 16-byte frame (x29/x30) plus 16*4 bytes of scratch that holds
//        schedule word + round constant for the 16 upcoming rounds
// --------------------------------------------------------------------
.globl blst_sha256_block_data_order
.type blst_sha256_block_data_order,%function
.align 4
blst_sha256_block_data_order:
stp x29, x30, [sp, #-16]!
mov x29, sp
sub sp,sp,#16*4
adr x16,.LK256
add x2,x1,x2,lsl#6 // len to point at the end of inp
// Load the first input block (v0-v3) and the first 16 constants (v4-v7).
ld1 {v0.16b},[x1], #16
ld1 {v1.16b},[x1], #16
ld1 {v2.16b},[x1], #16
ld1 {v3.16b},[x1], #16
ld1 {v4.4s},[x16], #16
ld1 {v5.4s},[x16], #16
ld1 {v6.4s},[x16], #16
ld1 {v7.4s},[x16], #16
rev32 v0.16b,v0.16b // yes, even on
rev32 v1.16b,v1.16b // big-endian
rev32 v2.16b,v2.16b
rev32 v3.16b,v3.16b
// Store message+constant for the first 16 rounds into the stack scratch.
mov x17,sp
add v4.4s,v4.4s,v0.4s
add v5.4s,v5.4s,v1.4s
add v6.4s,v6.4s,v2.4s
st1 {v4.4s,v5.4s},[x17], #32
add v7.4s,v7.4s,v3.4s
st1 {v6.4s,v7.4s},[x17]
sub x17,x17,#32
// Load the eight state words into w3..w10.
ldp w3,w4,[x0]
ldp w5,w6,[x0,#8]
ldp w7,w8,[x0,#16]
ldp w9,w10,[x0,#24]
// Prime the round pipeline: w12 = first scratch word, w14 = w4^w5, and
// w13/w15 = 0 because the first round has no deferred terms from a
// previous round to add.
ldr w12,[sp,#0]
mov w13,wzr
eor w14,w4,w5
mov w15,wzr
b .L_00_48
// Main loop. Each pass performs 16 scalar rounds using the words in the
// stack scratch while NEON computes the next 16 schedule words (four
// groups, updating v0, v1, v2, v3 in turn), adds the next constants and
// writes the sums back to the scratch. The pass ends by peeking at the
// next constant: the zero terminator of .LK256 ends the loop.
.align 4
.L_00_48:
ext v4.16b,v0.16b,v1.16b,#4
add w10,w10,w12
add w3,w3,w15
and w12,w8,w7
bic w15,w9,w7
ext v7.16b,v2.16b,v3.16b,#4
eor w11,w7,w7,ror#5
add w3,w3,w13
mov d19,v3.d[1]
orr w12,w12,w15
eor w11,w11,w7,ror#19
ushr v6.4s,v4.4s,#7
eor w15,w3,w3,ror#11
ushr v5.4s,v4.4s,#3
add w10,w10,w12
add v0.4s,v0.4s,v7.4s
ror w11,w11,#6
sli v6.4s,v4.4s,#25
eor w13,w3,w4
eor w15,w15,w3,ror#20
ushr v7.4s,v4.4s,#18
add w10,w10,w11
ldr w12,[sp,#4]
and w14,w14,w13
eor v5.16b,v5.16b,v6.16b
ror w15,w15,#2
add w6,w6,w10
sli v7.4s,v4.4s,#14
eor w14,w14,w4
ushr v16.4s,v19.4s,#17
add w9,w9,w12
add w10,w10,w15
and w12,w7,w6
eor v5.16b,v5.16b,v7.16b
bic w15,w8,w6
eor w11,w6,w6,ror#5
sli v16.4s,v19.4s,#15
add w10,w10,w14
orr w12,w12,w15
ushr v17.4s,v19.4s,#10
eor w11,w11,w6,ror#19
eor w15,w10,w10,ror#11
ushr v7.4s,v19.4s,#19
add w9,w9,w12
ror w11,w11,#6
add v0.4s,v0.4s,v5.4s
eor w14,w10,w3
eor w15,w15,w10,ror#20
sli v7.4s,v19.4s,#13
add w9,w9,w11
ldr w12,[sp,#8]
and w13,w13,w14
eor v17.16b,v17.16b,v16.16b
ror w15,w15,#2
add w5,w5,w9
eor w13,w13,w3
eor v17.16b,v17.16b,v7.16b
add w8,w8,w12
add w9,w9,w15
and w12,w6,w5
add v0.4s,v0.4s,v17.4s
bic w15,w7,w5
eor w11,w5,w5,ror#5
add w9,w9,w13
ushr v18.4s,v0.4s,#17
orr w12,w12,w15
ushr v19.4s,v0.4s,#10
eor w11,w11,w5,ror#19
eor w15,w9,w9,ror#11
sli v18.4s,v0.4s,#15
add w8,w8,w12
ushr v17.4s,v0.4s,#19
ror w11,w11,#6
eor w13,w9,w10
eor v19.16b,v19.16b,v18.16b
eor w15,w15,w9,ror#20
add w8,w8,w11
sli v17.4s,v0.4s,#13
ldr w12,[sp,#12]
and w14,w14,w13
ror w15,w15,#2
ld1 {v4.4s},[x16], #16
add w4,w4,w8
eor v19.16b,v19.16b,v17.16b
eor w14,w14,w10
eor v17.16b,v17.16b,v17.16b
add w7,w7,w12
add w8,w8,w15
and w12,w5,w4
mov v17.d[1],v19.d[0]
bic w15,w6,w4
eor w11,w4,w4,ror#5
add w8,w8,w14
add v0.4s,v0.4s,v17.4s
orr w12,w12,w15
eor w11,w11,w4,ror#19
eor w15,w8,w8,ror#11
add v4.4s,v4.4s,v0.4s
add w7,w7,w12
ror w11,w11,#6
eor w14,w8,w9
eor w15,w15,w8,ror#20
add w7,w7,w11
ldr w12,[sp,#16]
and w13,w13,w14
ror w15,w15,#2
add w3,w3,w7
eor w13,w13,w9
st1 {v4.4s},[x17], #16
ext v4.16b,v1.16b,v2.16b,#4
add w6,w6,w12
add w7,w7,w15
and w12,w4,w3
bic w15,w5,w3
ext v7.16b,v3.16b,v0.16b,#4
eor w11,w3,w3,ror#5
add w7,w7,w13
mov d19,v0.d[1]
orr w12,w12,w15
eor w11,w11,w3,ror#19
ushr v6.4s,v4.4s,#7
eor w15,w7,w7,ror#11
ushr v5.4s,v4.4s,#3
add w6,w6,w12
add v1.4s,v1.4s,v7.4s
ror w11,w11,#6
sli v6.4s,v4.4s,#25
eor w13,w7,w8
eor w15,w15,w7,ror#20
ushr v7.4s,v4.4s,#18
add w6,w6,w11
ldr w12,[sp,#20]
and w14,w14,w13
eor v5.16b,v5.16b,v6.16b
ror w15,w15,#2
add w10,w10,w6
sli v7.4s,v4.4s,#14
eor w14,w14,w8
ushr v16.4s,v19.4s,#17
add w5,w5,w12
add w6,w6,w15
and w12,w3,w10
eor v5.16b,v5.16b,v7.16b
bic w15,w4,w10
eor w11,w10,w10,ror#5
sli v16.4s,v19.4s,#15
add w6,w6,w14
orr w12,w12,w15
ushr v17.4s,v19.4s,#10
eor w11,w11,w10,ror#19
eor w15,w6,w6,ror#11
ushr v7.4s,v19.4s,#19
add w5,w5,w12
ror w11,w11,#6
add v1.4s,v1.4s,v5.4s
eor w14,w6,w7
eor w15,w15,w6,ror#20
sli v7.4s,v19.4s,#13
add w5,w5,w11
ldr w12,[sp,#24]
and w13,w13,w14
eor v17.16b,v17.16b,v16.16b
ror w15,w15,#2
add w9,w9,w5
eor w13,w13,w7
eor v17.16b,v17.16b,v7.16b
add w4,w4,w12
add w5,w5,w15
and w12,w10,w9
add v1.4s,v1.4s,v17.4s
bic w15,w3,w9
eor w11,w9,w9,ror#5
add w5,w5,w13
ushr v18.4s,v1.4s,#17
orr w12,w12,w15
ushr v19.4s,v1.4s,#10
eor w11,w11,w9,ror#19
eor w15,w5,w5,ror#11
sli v18.4s,v1.4s,#15
add w4,w4,w12
ushr v17.4s,v1.4s,#19
ror w11,w11,#6
eor w13,w5,w6
eor v19.16b,v19.16b,v18.16b
eor w15,w15,w5,ror#20
add w4,w4,w11
sli v17.4s,v1.4s,#13
ldr w12,[sp,#28]
and w14,w14,w13
ror w15,w15,#2
ld1 {v4.4s},[x16], #16
add w8,w8,w4
eor v19.16b,v19.16b,v17.16b
eor w14,w14,w6
eor v17.16b,v17.16b,v17.16b
add w3,w3,w12
add w4,w4,w15
and w12,w9,w8
mov v17.d[1],v19.d[0]
bic w15,w10,w8
eor w11,w8,w8,ror#5
add w4,w4,w14
add v1.4s,v1.4s,v17.4s
orr w12,w12,w15
eor w11,w11,w8,ror#19
eor w15,w4,w4,ror#11
add v4.4s,v4.4s,v1.4s
add w3,w3,w12
ror w11,w11,#6
eor w14,w4,w5
eor w15,w15,w4,ror#20
add w3,w3,w11
ldr w12,[sp,#32]
and w13,w13,w14
ror w15,w15,#2
add w7,w7,w3
eor w13,w13,w5
st1 {v4.4s},[x17], #16
ext v4.16b,v2.16b,v3.16b,#4
add w10,w10,w12
add w3,w3,w15
and w12,w8,w7
bic w15,w9,w7
ext v7.16b,v0.16b,v1.16b,#4
eor w11,w7,w7,ror#5
add w3,w3,w13
mov d19,v1.d[1]
orr w12,w12,w15
eor w11,w11,w7,ror#19
ushr v6.4s,v4.4s,#7
eor w15,w3,w3,ror#11
ushr v5.4s,v4.4s,#3
add w10,w10,w12
add v2.4s,v2.4s,v7.4s
ror w11,w11,#6
sli v6.4s,v4.4s,#25
eor w13,w3,w4
eor w15,w15,w3,ror#20
ushr v7.4s,v4.4s,#18
add w10,w10,w11
ldr w12,[sp,#36]
and w14,w14,w13
eor v5.16b,v5.16b,v6.16b
ror w15,w15,#2
add w6,w6,w10
sli v7.4s,v4.4s,#14
eor w14,w14,w4
ushr v16.4s,v19.4s,#17
add w9,w9,w12
add w10,w10,w15
and w12,w7,w6
eor v5.16b,v5.16b,v7.16b
bic w15,w8,w6
eor w11,w6,w6,ror#5
sli v16.4s,v19.4s,#15
add w10,w10,w14
orr w12,w12,w15
ushr v17.4s,v19.4s,#10
eor w11,w11,w6,ror#19
eor w15,w10,w10,ror#11
ushr v7.4s,v19.4s,#19
add w9,w9,w12
ror w11,w11,#6
add v2.4s,v2.4s,v5.4s
eor w14,w10,w3
eor w15,w15,w10,ror#20
sli v7.4s,v19.4s,#13
add w9,w9,w11
ldr w12,[sp,#40]
and w13,w13,w14
eor v17.16b,v17.16b,v16.16b
ror w15,w15,#2
add w5,w5,w9
eor w13,w13,w3
eor v17.16b,v17.16b,v7.16b
add w8,w8,w12
add w9,w9,w15
and w12,w6,w5
add v2.4s,v2.4s,v17.4s
bic w15,w7,w5
eor w11,w5,w5,ror#5
add w9,w9,w13
ushr v18.4s,v2.4s,#17
orr w12,w12,w15
ushr v19.4s,v2.4s,#10
eor w11,w11,w5,ror#19
eor w15,w9,w9,ror#11
sli v18.4s,v2.4s,#15
add w8,w8,w12
ushr v17.4s,v2.4s,#19
ror w11,w11,#6
eor w13,w9,w10
eor v19.16b,v19.16b,v18.16b
eor w15,w15,w9,ror#20
add w8,w8,w11
sli v17.4s,v2.4s,#13
ldr w12,[sp,#44]
and w14,w14,w13
ror w15,w15,#2
ld1 {v4.4s},[x16], #16
add w4,w4,w8
eor v19.16b,v19.16b,v17.16b
eor w14,w14,w10
eor v17.16b,v17.16b,v17.16b
add w7,w7,w12
add w8,w8,w15
and w12,w5,w4
mov v17.d[1],v19.d[0]
bic w15,w6,w4
eor w11,w4,w4,ror#5
add w8,w8,w14
add v2.4s,v2.4s,v17.4s
orr w12,w12,w15
eor w11,w11,w4,ror#19
eor w15,w8,w8,ror#11
add v4.4s,v4.4s,v2.4s
add w7,w7,w12
ror w11,w11,#6
eor w14,w8,w9
eor w15,w15,w8,ror#20
add w7,w7,w11
ldr w12,[sp,#48]
and w13,w13,w14
ror w15,w15,#2
add w3,w3,w7
eor w13,w13,w9
st1 {v4.4s},[x17], #16
ext v4.16b,v3.16b,v0.16b,#4
add w6,w6,w12
add w7,w7,w15
and w12,w4,w3
bic w15,w5,w3
ext v7.16b,v1.16b,v2.16b,#4
eor w11,w3,w3,ror#5
add w7,w7,w13
mov d19,v2.d[1]
orr w12,w12,w15
eor w11,w11,w3,ror#19
ushr v6.4s,v4.4s,#7
eor w15,w7,w7,ror#11
ushr v5.4s,v4.4s,#3
add w6,w6,w12
add v3.4s,v3.4s,v7.4s
ror w11,w11,#6
sli v6.4s,v4.4s,#25
eor w13,w7,w8
eor w15,w15,w7,ror#20
ushr v7.4s,v4.4s,#18
add w6,w6,w11
ldr w12,[sp,#52]
and w14,w14,w13
eor v5.16b,v5.16b,v6.16b
ror w15,w15,#2
add w10,w10,w6
sli v7.4s,v4.4s,#14
eor w14,w14,w8
ushr v16.4s,v19.4s,#17
add w5,w5,w12
add w6,w6,w15
and w12,w3,w10
eor v5.16b,v5.16b,v7.16b
bic w15,w4,w10
eor w11,w10,w10,ror#5
sli v16.4s,v19.4s,#15
add w6,w6,w14
orr w12,w12,w15
ushr v17.4s,v19.4s,#10
eor w11,w11,w10,ror#19
eor w15,w6,w6,ror#11
ushr v7.4s,v19.4s,#19
add w5,w5,w12
ror w11,w11,#6
add v3.4s,v3.4s,v5.4s
eor w14,w6,w7
eor w15,w15,w6,ror#20
sli v7.4s,v19.4s,#13
add w5,w5,w11
ldr w12,[sp,#56]
and w13,w13,w14
eor v17.16b,v17.16b,v16.16b
ror w15,w15,#2
add w9,w9,w5
eor w13,w13,w7
eor v17.16b,v17.16b,v7.16b
add w4,w4,w12
add w5,w5,w15
and w12,w10,w9
add v3.4s,v3.4s,v17.4s
bic w15,w3,w9
eor w11,w9,w9,ror#5
add w5,w5,w13
ushr v18.4s,v3.4s,#17
orr w12,w12,w15
ushr v19.4s,v3.4s,#10
eor w11,w11,w9,ror#19
eor w15,w5,w5,ror#11
sli v18.4s,v3.4s,#15
add w4,w4,w12
ushr v17.4s,v3.4s,#19
ror w11,w11,#6
eor w13,w5,w6
eor v19.16b,v19.16b,v18.16b
eor w15,w15,w5,ror#20
add w4,w4,w11
sli v17.4s,v3.4s,#13
ldr w12,[sp,#60]
and w14,w14,w13
ror w15,w15,#2
ld1 {v4.4s},[x16], #16
add w8,w8,w4
eor v19.16b,v19.16b,v17.16b
eor w14,w14,w6
eor v17.16b,v17.16b,v17.16b
add w3,w3,w12
add w4,w4,w15
and w12,w9,w8
mov v17.d[1],v19.d[0]
bic w15,w10,w8
eor w11,w8,w8,ror#5
add w4,w4,w14
add v3.4s,v3.4s,v17.4s
orr w12,w12,w15
eor w11,w11,w8,ror#19
eor w15,w4,w4,ror#11
add v4.4s,v4.4s,v3.4s
add w3,w3,w12
ror w11,w11,#6
eor w14,w4,w5
eor w15,w15,w4,ror#20
add w3,w3,w11
// Peek at the next table word (no pointer update) for the loop test below.
ldr w12,[x16]
and w13,w13,w14
ror w15,w15,#2
add w7,w7,w3
eor w13,w13,w5
st1 {v4.4s},[x17], #16
cmp w12,#0 // check for K256 terminator
ldr w12,[sp,#0]
sub x17,x17,#64
bne .L_00_48
// Schedule is complete. x16 has advanced 16 vectors (256 bytes) past
// .LK256, so step it back to the start of the table.
sub x16,x16,#256 // rewind x16
// Compare the input pointer with the end pointer. The resulting flags
// are consumed twice: by csel just below and by the b.ne at the bottom
// of the function; no instruction in between modifies the flags.
// If this was the last block, back x1 up by 64 so that the "next block"
// loads interleaved with the rounds below re-read the current block
// instead of reading past the end of the input.
cmp x1,x2
mov x17, #64
csel x17, x17, xzr, eq
sub x1,x1,x17 // avoid SEGV
mov x17,sp
// Final 16 rounds, using the words already in the scratch. Interleaved
// with them, the next input block is loaded into v0-v3, byte-swapped,
// added to the first 16 constants and written to the scratch. Each st1
// is placed after the scalar loads of the scratch words it overwrites.
add w10,w10,w12
add w3,w3,w15
and w12,w8,w7
ld1 {v0.16b},[x1],#16
bic w15,w9,w7
eor w11,w7,w7,ror#5
ld1 {v4.4s},[x16],#16
add w3,w3,w13
orr w12,w12,w15
eor w11,w11,w7,ror#19
eor w15,w3,w3,ror#11
rev32 v0.16b,v0.16b
add w10,w10,w12
ror w11,w11,#6
eor w13,w3,w4
eor w15,w15,w3,ror#20
add v4.4s,v4.4s,v0.4s
add w10,w10,w11
ldr w12,[sp,#4]
and w14,w14,w13
ror w15,w15,#2
add w6,w6,w10
eor w14,w14,w4
add w9,w9,w12
add w10,w10,w15
and w12,w7,w6
bic w15,w8,w6
eor w11,w6,w6,ror#5
add w10,w10,w14
orr w12,w12,w15
eor w11,w11,w6,ror#19
eor w15,w10,w10,ror#11
add w9,w9,w12
ror w11,w11,#6
eor w14,w10,w3
eor w15,w15,w10,ror#20
add w9,w9,w11
ldr w12,[sp,#8]
and w13,w13,w14
ror w15,w15,#2
add w5,w5,w9
eor w13,w13,w3
add w8,w8,w12
add w9,w9,w15
and w12,w6,w5
bic w15,w7,w5
eor w11,w5,w5,ror#5
add w9,w9,w13
orr w12,w12,w15
eor w11,w11,w5,ror#19
eor w15,w9,w9,ror#11
add w8,w8,w12
ror w11,w11,#6
eor w13,w9,w10
eor w15,w15,w9,ror#20
add w8,w8,w11
ldr w12,[sp,#12]
and w14,w14,w13
ror w15,w15,#2
add w4,w4,w8
eor w14,w14,w10
add w7,w7,w12
add w8,w8,w15
and w12,w5,w4
bic w15,w6,w4
eor w11,w4,w4,ror#5
add w8,w8,w14
orr w12,w12,w15
eor w11,w11,w4,ror#19
eor w15,w8,w8,ror#11
add w7,w7,w12
ror w11,w11,#6
eor w14,w8,w9
eor w15,w15,w8,ror#20
add w7,w7,w11
ldr w12,[sp,#16]
and w13,w13,w14
ror w15,w15,#2
add w3,w3,w7
eor w13,w13,w9
st1 {v4.4s},[x17], #16
add w6,w6,w12
add w7,w7,w15
and w12,w4,w3
ld1 {v1.16b},[x1],#16
bic w15,w5,w3
eor w11,w3,w3,ror#5
ld1 {v4.4s},[x16],#16
add w7,w7,w13
orr w12,w12,w15
eor w11,w11,w3,ror#19
eor w15,w7,w7,ror#11
rev32 v1.16b,v1.16b
add w6,w6,w12
ror w11,w11,#6
eor w13,w7,w8
eor w15,w15,w7,ror#20
add v4.4s,v4.4s,v1.4s
add w6,w6,w11
ldr w12,[sp,#20]
and w14,w14,w13
ror w15,w15,#2
add w10,w10,w6
eor w14,w14,w8
add w5,w5,w12
add w6,w6,w15
and w12,w3,w10
bic w15,w4,w10
eor w11,w10,w10,ror#5
add w6,w6,w14
orr w12,w12,w15
eor w11,w11,w10,ror#19
eor w15,w6,w6,ror#11
add w5,w5,w12
ror w11,w11,#6
eor w14,w6,w7
eor w15,w15,w6,ror#20
add w5,w5,w11
ldr w12,[sp,#24]
and w13,w13,w14
ror w15,w15,#2
add w9,w9,w5
eor w13,w13,w7
add w4,w4,w12
add w5,w5,w15
and w12,w10,w9
bic w15,w3,w9
eor w11,w9,w9,ror#5
add w5,w5,w13
orr w12,w12,w15
eor w11,w11,w9,ror#19
eor w15,w5,w5,ror#11
add w4,w4,w12
ror w11,w11,#6
eor w13,w5,w6
eor w15,w15,w5,ror#20
add w4,w4,w11
ldr w12,[sp,#28]
and w14,w14,w13
ror w15,w15,#2
add w8,w8,w4
eor w14,w14,w6
add w3,w3,w12
add w4,w4,w15
and w12,w9,w8
bic w15,w10,w8
eor w11,w8,w8,ror#5
add w4,w4,w14
orr w12,w12,w15
eor w11,w11,w8,ror#19
eor w15,w4,w4,ror#11
add w3,w3,w12
ror w11,w11,#6
eor w14,w4,w5
eor w15,w15,w4,ror#20
add w3,w3,w11
ldr w12,[sp,#32]
and w13,w13,w14
ror w15,w15,#2
add w7,w7,w3
eor w13,w13,w5
st1 {v4.4s},[x17], #16
add w10,w10,w12
add w3,w3,w15
and w12,w8,w7
ld1 {v2.16b},[x1],#16
bic w15,w9,w7
eor w11,w7,w7,ror#5
ld1 {v4.4s},[x16],#16
add w3,w3,w13
orr w12,w12,w15
eor w11,w11,w7,ror#19
eor w15,w3,w3,ror#11
rev32 v2.16b,v2.16b
add w10,w10,w12
ror w11,w11,#6
eor w13,w3,w4
eor w15,w15,w3,ror#20
add v4.4s,v4.4s,v2.4s
add w10,w10,w11
ldr w12,[sp,#36]
and w14,w14,w13
ror w15,w15,#2
add w6,w6,w10
eor w14,w14,w4
add w9,w9,w12
add w10,w10,w15
and w12,w7,w6
bic w15,w8,w6
eor w11,w6,w6,ror#5
add w10,w10,w14
orr w12,w12,w15
eor w11,w11,w6,ror#19
eor w15,w10,w10,ror#11
add w9,w9,w12
ror w11,w11,#6
eor w14,w10,w3
eor w15,w15,w10,ror#20
add w9,w9,w11
ldr w12,[sp,#40]
and w13,w13,w14
ror w15,w15,#2
add w5,w5,w9
eor w13,w13,w3
add w8,w8,w12
add w9,w9,w15
and w12,w6,w5
bic w15,w7,w5
eor w11,w5,w5,ror#5
add w9,w9,w13
orr w12,w12,w15
eor w11,w11,w5,ror#19
eor w15,w9,w9,ror#11
add w8,w8,w12
ror w11,w11,#6
eor w13,w9,w10
eor w15,w15,w9,ror#20
add w8,w8,w11
ldr w12,[sp,#44]
and w14,w14,w13
ror w15,w15,#2
add w4,w4,w8
eor w14,w14,w10
add w7,w7,w12
add w8,w8,w15
and w12,w5,w4
bic w15,w6,w4
eor w11,w4,w4,ror#5
add w8,w8,w14
orr w12,w12,w15
eor w11,w11,w4,ror#19
eor w15,w8,w8,ror#11
add w7,w7,w12
ror w11,w11,#6
eor w14,w8,w9
eor w15,w15,w8,ror#20
add w7,w7,w11
ldr w12,[sp,#48]
and w13,w13,w14
ror w15,w15,#2
add w3,w3,w7
eor w13,w13,w9
st1 {v4.4s},[x17], #16
add w6,w6,w12
add w7,w7,w15
and w12,w4,w3
ld1 {v3.16b},[x1],#16
bic w15,w5,w3
eor w11,w3,w3,ror#5
ld1 {v4.4s},[x16],#16
add w7,w7,w13
orr w12,w12,w15
eor w11,w11,w3,ror#19
eor w15,w7,w7,ror#11
rev32 v3.16b,v3.16b
add w6,w6,w12
ror w11,w11,#6
eor w13,w7,w8
eor w15,w15,w7,ror#20
add v4.4s,v4.4s,v3.4s
add w6,w6,w11
ldr w12,[sp,#52]
and w14,w14,w13
ror w15,w15,#2
add w10,w10,w6
eor w14,w14,w8
add w5,w5,w12
add w6,w6,w15
and w12,w3,w10
bic w15,w4,w10
eor w11,w10,w10,ror#5
add w6,w6,w14
orr w12,w12,w15
eor w11,w11,w10,ror#19
eor w15,w6,w6,ror#11
add w5,w5,w12
ror w11,w11,#6
eor w14,w6,w7
eor w15,w15,w6,ror#20
add w5,w5,w11
ldr w12,[sp,#56]
and w13,w13,w14
ror w15,w15,#2
add w9,w9,w5
eor w13,w13,w7
add w4,w4,w12
add w5,w5,w15
and w12,w10,w9
bic w15,w3,w9
eor w11,w9,w9,ror#5
add w5,w5,w13
orr w12,w12,w15
eor w11,w11,w9,ror#19
eor w15,w5,w5,ror#11
add w4,w4,w12
ror w11,w11,#6
eor w13,w5,w6
eor w15,w15,w5,ror#20
add w4,w4,w11
ldr w12,[sp,#60]
and w14,w14,w13
ror w15,w15,#2
add w8,w8,w4
eor w14,w14,w6
add w3,w3,w12
add w4,w4,w15
and w12,w9,w8
bic w15,w10,w8
eor w11,w8,w8,ror#5
add w4,w4,w14
orr w12,w12,w15
eor w11,w11,w8,ror#19
eor w15,w4,w4,ror#11
add w3,w3,w12
ror w11,w11,#6
eor w14,w4,w5
eor w15,w15,w4,ror#20
add w3,w3,w11
and w13,w13,w14
ror w15,w15,#2
add w7,w7,w3
eor w13,w13,w5
st1 {v4.4s},[x17], #16
// Flush the two terms deferred from the last round, then add the working
// variables to the state in memory and store the result back. The same
// stretch re-primes w12/w13/w14/w15/x17 exactly as the function entry did,
// ready for another pass over .L_00_48.
add w3,w3,w15 // h+=Sigma0(a) from the past
ldp w11,w12,[x0,#0]
add w3,w3,w13 // h+=Maj(a,b,c) from the past
ldp w13,w14,[x0,#8]
add w3,w3,w11 // accumulate
add w4,w4,w12
ldp w11,w12,[x0,#16]
add w5,w5,w13
add w6,w6,w14
ldp w13,w14,[x0,#24]
add w7,w7,w11
add w8,w8,w12
ldr w12,[sp,#0]
stp w3,w4,[x0,#0]
add w9,w9,w13
mov w13,wzr
stp w5,w6,[x0,#8]
add w10,w10,w14
stp w7,w8,[x0,#16]
eor w14,w4,w5
stp w9,w10,[x0,#24]
mov w15,wzr
mov x17,sp
// Flags still come from the earlier cmp x1,x2: loop while the input
// pointer had not reached the end pointer.
b.ne .L_00_48
// x29 still points at the saved frame record, so reload the caller x29
// through it, then release the 64-byte scratch plus the 16-byte frame.
// x30 is never modified in this function and is not reloaded.
ldr x29,[x29]
add sp,sp,#16*4+16
ret
.size blst_sha256_block_data_order,.-blst_sha256_block_data_order
// --------------------------------------------------------------------
// blst_sha256_emit
//
// Copies 32 bytes from [x1] to [x0] as eight 32-bit words. On
// little-endian builds the bytes of every word are reversed on the way;
// on big-endian builds (__AARCH64EB__ defined) they are copied unchanged.
//
// In:    x0 = destination (32 bytes), x1 = source (32 bytes)
// Clobb: x4-x7
// All 32 source bytes are loaded before the first store is issued.
// --------------------------------------------------------------------
.globl blst_sha256_emit
.hidden blst_sha256_emit
.type blst_sha256_emit,%function
.align 4
blst_sha256_emit:
// Each 64-bit register carries two consecutive 32-bit words.
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
#ifndef __AARCH64EB__
// A full 64-bit byte reversal swaps the two words as well as their bytes;
// the store offsets below put each half back in its own slot.
rev x4,x4
rev x5,x5
rev x6,x6
rev x7,x7
#endif
// Low halves go to the odd word slots ...
str w4,[x0,#4]
str w5,[x0,#12]
str w6,[x0,#20]
str w7,[x0,#28]
// ... then bring the high halves down ...
lsr x4,x4,#32
lsr x5,x5,#32
lsr x6,x6,#32
lsr x7,x7,#32
// ... and store them to the even word slots.
str w4,[x0,#0]
str w5,[x0,#8]
str w6,[x0,#16]
str w7,[x0,#24]
ret
.size blst_sha256_emit,.-blst_sha256_emit
// --------------------------------------------------------------------
// blst_sha256_bcopy
//
// Forward byte-by-byte copy of x2 bytes from [x1] to [x0].
//
// In:    x0 = destination, x1 = source, x2 = byte count
// Out:   nothing; x0 and x1 are advanced past the copied bytes
// Clobb: x0, x1, x2, w3
//
// A zero count is a no-op. Without the guard below the counter would be
// decremented before being tested, wrap around, and the loop would run
// far past both buffers.
// --------------------------------------------------------------------
.globl blst_sha256_bcopy
.hidden blst_sha256_bcopy
.type blst_sha256_bcopy,%function
.align 4
blst_sha256_bcopy:
cbz x2,.Lbcopy_done // nothing to copy
.Loop_bcopy:
ldrb w3,[x1],#1
sub x2,x2,#1
strb w3,[x0],#1
cbnz x2,.Loop_bcopy
.Lbcopy_done:
ret
.size blst_sha256_bcopy,.-blst_sha256_bcopy
// --------------------------------------------------------------------
// blst_sha256_hcopy
//
// Copies 32 bytes from [x1] to [x0] with two paired 64-bit loads followed
// by two paired 64-bit stores. No byte reordering takes place.
//
// In:    x0 = destination (32 bytes), x1 = source (32 bytes)
// Clobb: x2-x5
// Both loads complete before either store, so the whole source is read
// before any part of the destination is written.
// --------------------------------------------------------------------
.globl blst_sha256_hcopy
.hidden blst_sha256_hcopy
.type blst_sha256_hcopy,%function
.align 4
blst_sha256_hcopy:
ldp x2,x3,[x1] // bytes 0-15
ldp x4,x5,[x1,#16] // bytes 16-31
stp x2,x3,[x0]
stp x4,x5,[x0,#16]
ret
.size blst_sha256_hcopy,.-blst_sha256_hcopy