initial stuff

commit 943c07066e
John Doe, 2022-09-09 02:47:49 -04:00
99 changed files with 58786 additions and 0 deletions


@@ -0,0 +1,379 @@
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,%function
.align 5
add_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
adds x8,x8,x12
ldp x14,x15,[x2,#16]
adcs x9,x9,x13
ldp x4,x5,[x3]
adcs x10,x10,x14
ldp x6,x7,[x3,#16]
adcs x11,x11,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
stp x8,x9,[x0]
csel x11,x11,x2,lo
stp x10,x11,[x0,#16]
ret
.size add_mod_256,.-add_mod_256
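For orientation, a hypothetical C sketch of what add_mod_256 computes: (|a| + |b|) mod |p| on 4x64-bit little-endian limbs, with both inputs assumed already reduced. It mirrors the add / trial-subtract / masked-select structure above; the function name and the GCC/Clang unsigned __int128 usage are this sketch's own, and plain C carries no constant-time guarantee, which is what the assembly is for.

#include <stdint.h>

static void add_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                            const uint64_t b[4], const uint64_t p[4])
{
    uint64_t sum[4], red[4], carry = 0, borrow = 0;

    for (int i = 0; i < 4; i++) {                 /* sum = a + b, carry out    */
        unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
        sum[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
    for (int i = 0; i < 4; i++) {                 /* red = sum - p, borrow out */
        unsigned __int128 t = (unsigned __int128)sum[i] - p[i] - borrow;
        red[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    /* keep the raw sum only when carry:sum < p (no carry out, but a borrow);
       the masked select mirrors the csel idiom above */
    uint64_t mask = (uint64_t)0 - ((carry ^ 1) & borrow);
    for (int i = 0; i < 4; i++)
        ret[i] = (sum[i] & mask) | (red[i] & ~mask);
}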
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,%function
.align 5
mul_by_3_mod_256:
ldp x12,x13,[x1]
ldp x14,x15,[x1,#16]
adds x8,x12,x12
ldp x4,x5,[x2]
adcs x9,x13,x13
ldp x6,x7,[x2,#16]
adcs x10,x14,x14
adcs x11,x15,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
csel x11,x11,x2,lo
adds x8,x8,x12
adcs x9,x9,x13
adcs x10,x10,x14
adcs x11,x11,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
stp x8,x9,[x0]
csel x11,x11,x2,lo
stp x10,x11,[x0,#16]
ret
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,%function
.align 5
lshift_mod_256:
ldp x8,x9,[x1]
ldp x10,x11,[x1,#16]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
.Loop_lshift_mod_256:
adds x8,x8,x8
sub x2,x2,#1
adcs x9,x9,x9
adcs x10,x10,x10
adcs x11,x11,x11
adc x3,xzr,xzr
subs x12,x8,x4
sbcs x13,x9,x5
sbcs x14,x10,x6
sbcs x15,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x12,lo
csel x9,x9,x13,lo
csel x10,x10,x14,lo
csel x11,x11,x15,lo
cbnz x2,.Loop_lshift_mod_256
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
ret
.size lshift_mod_256,.-lshift_mod_256
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,%function
.align 5
rshift_mod_256:
ldp x8,x9,[x1]
ldp x10,x11,[x1,#16]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
.Loop_rshift:
adds x12,x8,x4
sub x2,x2,#1
adcs x13,x9,x5
adcs x14,x10,x6
adcs x15,x11,x7
adc x3,xzr,xzr
tst x8,#1
csel x12,x12,x8,ne
csel x13,x13,x9,ne
csel x14,x14,x10,ne
csel x15,x15,x11,ne
csel x3,x3,xzr,ne
extr x8,x13,x12,#1
extr x9,x14,x13,#1
extr x10,x15,x14,#1
extr x11,x3,x15,#1
cbnz x2,.Loop_rshift
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
ret
.size rshift_mod_256,.-rshift_mod_256
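A hypothetical sketch of the rshift_mod_256 loop above: divide by 2 modulo an odd |p|, |count| times. If the current value is odd, adding the odd modulus makes it even without changing it mod p, so the halving is exact; the carry out of that addition supplies the top bit of the shift. Same limb convention and caveats as the earlier sketch; names are the sketch's own.

#include <stdint.h>
#include <stddef.h>

static void rshift_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                               size_t count, const uint64_t p[4])
{
    uint64_t v[4] = { a[0], a[1], a[2], a[3] };

    while (count--) {
        uint64_t carry = 0;
        if (v[0] & 1) {                           /* the asm selects with csel */
            for (int i = 0; i < 4; i++) {
                unsigned __int128 t = (unsigned __int128)v[i] + p[i] + carry;
                v[i]  = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
        }
        for (int i = 0; i < 3; i++)               /* 256-bit right shift by 1  */
            v[i] = (v[i] >> 1) | (v[i + 1] << 63);
        v[3] = (v[3] >> 1) | (carry << 63);
    }
    for (int i = 0; i < 4; i++)
        ret[i] = v[i];
}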
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,%function
.align 5
cneg_mod_256:
ldp x8,x9,[x1]
ldp x4,x5,[x3]
ldp x10,x11,[x1,#16]
subs x12,x4,x8
ldp x6,x7,[x3,#16]
orr x4,x8,x9
sbcs x13,x5,x9
orr x5,x10,x11
sbcs x14,x6,x10
orr x3,x4,x5
sbc x15,x7,x11
cmp x3,#0
csetm x3,ne
ands x2,x2,x3
csel x8,x8,x12,eq
csel x9,x9,x13,eq
csel x10,x10,x14,eq
stp x8,x9,[x0]
csel x11,x11,x15,eq
stp x10,x11,[x0,#16]
ret
.size cneg_mod_256,.-cneg_mod_256
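cneg_mod_256 is a conditional negation: when |flag| is non-zero and |a| is non-zero it stores p - a, otherwise it copies |a| through, so zero never turns into p. A hypothetical sketch, same limb convention and constant-time caveat as above:

#include <stdint.h>

static void cneg_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                             uint64_t flag, const uint64_t p[4])
{
    uint64_t neg[4], borrow = 0, nonzero = 0;

    for (int i = 0; i < 4; i++) {                 /* neg = p - a               */
        unsigned __int128 t = (unsigned __int128)p[i] - a[i] - borrow;
        neg[i]   = (uint64_t)t;
        borrow   = (uint64_t)(t >> 64) & 1;
        nonzero |= a[i];
    }
    uint64_t mask = (uint64_t)0 - (uint64_t)(flag != 0 && nonzero != 0);
    for (int i = 0; i < 4; i++)                   /* select neg or the original */
        ret[i] = (neg[i] & mask) | (a[i] & ~mask);
}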
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,%function
.align 5
sub_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
subs x8,x8,x12
ldp x14,x15,[x2,#16]
sbcs x9,x9,x13
ldp x4,x5,[x3]
sbcs x10,x10,x14
ldp x6,x7,[x3,#16]
sbcs x11,x11,x15
sbc x3,xzr,xzr
and x4,x4,x3
and x5,x5,x3
adds x8,x8,x4
and x6,x6,x3
adcs x9,x9,x5
and x7,x7,x3
adcs x10,x10,x6
stp x8,x9,[x0]
adc x11,x11,x7
stp x10,x11,[x0,#16]
ret
.size sub_mod_256,.-sub_mod_256
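sub_mod_256 is the mirror image of add_mod_256: subtract with borrow, then add |p| back masked by the borrow, which the code above does with the and/adcs chain instead of a branch. A hypothetical sketch:

#include <stdint.h>

static void sub_mod_256_ref(uint64_t ret[4], const uint64_t a[4],
                            const uint64_t b[4], const uint64_t p[4])
{
    uint64_t borrow = 0, carry = 0;

    for (int i = 0; i < 4; i++) {                 /* ret = a - b, borrow out   */
        unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
        ret[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    uint64_t mask = (uint64_t)0 - borrow;         /* all-ones iff a < b        */
    for (int i = 0; i < 4; i++) {                 /* add p back conditionally  */
        unsigned __int128 t = (unsigned __int128)ret[i] + (p[i] & mask) + carry;
        ret[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
}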
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,%function
.align 5
check_mod_256:
ldp x8,x9,[x0]
ldp x10,x11,[x0,#16]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
#ifdef __AARCH64EB__
rev x8,x8
rev x9,x9
rev x10,x10
rev x11,x11
#endif
subs xzr,x8,x4
sbcs xzr,x9,x5
orr x8,x8,x9
sbcs xzr,x10,x6
orr x8,x8,x10
sbcs xzr,x11,x7
orr x8,x8,x11
sbc x1,xzr,xzr
cmp x8,#0
mov x0,#1
csel x0,x0,xzr,ne
and x0,x0,x1
ret
.size check_mod_256,.-check_mod_256
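check_mod_256(a, p) answers whether |a| is a valid non-zero residue, i.e. returns 1 iff 0 < a < p (the big-endian rev path is ignored here). A hypothetical sketch:

#include <stdint.h>

static int check_mod_256_ref(const uint64_t a[4], const uint64_t p[4])
{
    uint64_t borrow = 0, nonzero = 0;

    for (int i = 0; i < 4; i++) {                 /* does a - p borrow?        */
        unsigned __int128 t = (unsigned __int128)a[i] - p[i] - borrow;
        borrow   = (uint64_t)(t >> 64) & 1;
        nonzero |= a[i];
    }
    return (int)(borrow & (uint64_t)(nonzero != 0));
}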
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,%function
.align 5
add_n_check_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
ldp x14,x15,[x2,#16]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
rev x10,x10
rev x14,x14
rev x11,x11
rev x15,x15
#endif
adds x8,x8,x12
ldp x4,x5,[x3]
adcs x9,x9,x13
ldp x6,x7,[x3,#16]
adcs x10,x10,x14
adcs x11,x11,x15
adc x3,xzr,xzr
subs x16,x8,x4
sbcs x17,x9,x5
sbcs x1,x10,x6
sbcs x2,x11,x7
sbcs xzr,x3,xzr
csel x8,x8,x16,lo
csel x9,x9,x17,lo
csel x10,x10,x1,lo
csel x11,x11,x2,lo
orr x16, x8, x9
orr x17, x10, x11
orr x16, x16, x17
#ifdef __AARCH64EB__
rev x8,x8
rev x9,x9
rev x10,x10
rev x11,x11
#endif
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
mov x17, #1
cmp x16, #0
csel x0, x17, xzr, ne
ret
.size add_n_check_mod_256,.-add_n_check_mod_256
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,%function
.align 5
sub_n_check_mod_256:
ldp x8,x9,[x1]
ldp x12,x13,[x2]
ldp x10,x11,[x1,#16]
ldp x14,x15,[x2,#16]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
rev x10,x10
rev x14,x14
rev x11,x11
rev x15,x15
#endif
subs x8,x8,x12
sbcs x9,x9,x13
ldp x4,x5,[x3]
sbcs x10,x10,x14
ldp x6,x7,[x3,#16]
sbcs x11,x11,x15
sbc x3,xzr,xzr
and x4,x4,x3
and x5,x5,x3
adds x8,x8,x4
and x6,x6,x3
adcs x9,x9,x5
and x7,x7,x3
adcs x10,x10,x6
adc x11,x11,x7
orr x16, x8, x9
orr x17, x10, x11
orr x16, x16, x17
#ifdef __AARCH64EB__
rev x8,x8
rev x9,x9
rev x10,x10
rev x11,x11
#endif
stp x8,x9,[x0]
stp x10,x11,[x0,#16]
mov x17, #1
cmp x16, #0
csel x0, x17, xzr, ne
ret
.size sub_n_check_mod_256,.-sub_n_check_mod_256


@@ -0,0 +1,572 @@
.text
.globl add_mod_256
.hidden add_mod_256
.type add_mod_256,@function
.align 32
add_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
.Loaded_a_add_mod_256:
addq 0(%rdx),%r8
adcq 8(%rdx),%r9
movq %r8,%rax
adcq 16(%rdx),%r10
movq %r9,%rsi
adcq 24(%rdx),%r11
sbbq %rdx,%rdx
movq %r10,%rbx
subq 0(%rcx),%r8
sbbq 8(%rcx),%r9
sbbq 16(%rcx),%r10
movq %r11,%rbp
sbbq 24(%rcx),%r11
sbbq $0,%rdx
cmovcq %rax,%r8
cmovcq %rsi,%r9
movq %r8,0(%rdi)
cmovcq %rbx,%r10
movq %r9,8(%rdi)
cmovcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size add_mod_256,.-add_mod_256
.globl mul_by_3_mod_256
.hidden mul_by_3_mod_256
.type mul_by_3_mod_256,@function
.align 32
mul_by_3_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
movq %rdx,%rcx
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq %rsi,%rdx
movq 24(%rsi),%r11
call __lshift_mod_256
movq 0(%rsp),%r12
.cfi_restore %r12
jmp .Loaded_a_add_mod_256
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size mul_by_3_mod_256,.-mul_by_3_mod_256
.type __lshift_mod_256,@function
.align 32
__lshift_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
addq %r8,%r8
adcq %r9,%r9
movq %r8,%rax
adcq %r10,%r10
movq %r9,%rsi
adcq %r11,%r11
sbbq %r12,%r12
movq %r10,%rbx
subq 0(%rcx),%r8
sbbq 8(%rcx),%r9
sbbq 16(%rcx),%r10
movq %r11,%rbp
sbbq 24(%rcx),%r11
sbbq $0,%r12
cmovcq %rax,%r8
cmovcq %rsi,%r9
cmovcq %rbx,%r10
cmovcq %rbp,%r11
.byte 0xf3,0xc3
.cfi_endproc
.size __lshift_mod_256,.-__lshift_mod_256
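The x86_64 mul_by_3_mod_256 above doubles via this __lshift_mod_256 helper and then jumps into the add_mod_256 tail, since 3*a = 2*a + a (mod p). A hypothetical sketch of a single modular doubling, with the same limb convention and constant-time caveat as the earlier sketches:

#include <stdint.h>

static void lshift_mod_256_once_ref(uint64_t a[4], const uint64_t p[4])
{
    uint64_t dbl[4], red[4], carry = 0, borrow = 0;

    for (int i = 0; i < 4; i++) {                 /* dbl = 2*a, carry out      */
        unsigned __int128 t = ((unsigned __int128)a[i] << 1) + carry;
        dbl[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
    for (int i = 0; i < 4; i++) {                 /* red = dbl - p, borrow out */
        unsigned __int128 t = (unsigned __int128)dbl[i] - p[i] - borrow;
        red[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    uint64_t mask = (uint64_t)0 - ((carry ^ 1) & borrow);   /* keep dbl iff < p */
    for (int i = 0; i < 4; i++)
        a[i] = (dbl[i] & mask) | (red[i] & ~mask);
}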
.globl lshift_mod_256
.hidden lshift_mod_256
.type lshift_mod_256,@function
.align 32
lshift_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
.Loop_lshift_mod_256:
call __lshift_mod_256
decl %edx
jnz .Loop_lshift_mod_256
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 0(%rsp),%r12
.cfi_restore %r12
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size lshift_mod_256,.-lshift_mod_256
.globl rshift_mod_256
.hidden rshift_mod_256
.type rshift_mod_256,@function
.align 32
rshift_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%rbp
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
.Loop_rshift_mod_256:
movq %rbp,%r8
andq $1,%rbp
movq 0(%rcx),%rax
negq %rbp
movq 8(%rcx),%rsi
movq 16(%rcx),%rbx
andq %rbp,%rax
andq %rbp,%rsi
andq %rbp,%rbx
andq 24(%rcx),%rbp
addq %rax,%r8
adcq %rsi,%r9
adcq %rbx,%r10
adcq %rbp,%r11
sbbq %rax,%rax
shrq $1,%r8
movq %r9,%rbp
shrq $1,%r9
movq %r10,%rbx
shrq $1,%r10
movq %r11,%rsi
shrq $1,%r11
shlq $63,%rbp
shlq $63,%rbx
orq %r8,%rbp
shlq $63,%rsi
orq %rbx,%r9
shlq $63,%rax
orq %rsi,%r10
orq %rax,%r11
decl %edx
jnz .Loop_rshift_mod_256
movq %rbp,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size rshift_mod_256,.-rshift_mod_256
.globl cneg_mod_256
.hidden cneg_mod_256
.type cneg_mod_256,@function
.align 32
cneg_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
movq 0(%rsi),%r12
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq %r12,%r8
movq 24(%rsi),%r11
orq %r9,%r12
orq %r10,%r12
orq %r11,%r12
movq $-1,%rbp
movq 0(%rcx),%rax
cmovnzq %rbp,%r12
movq 8(%rcx),%rsi
movq 16(%rcx),%rbx
andq %r12,%rax
movq 24(%rcx),%rbp
andq %r12,%rsi
andq %r12,%rbx
andq %r12,%rbp
subq %r8,%rax
sbbq %r9,%rsi
sbbq %r10,%rbx
sbbq %r11,%rbp
orq %rdx,%rdx
cmovzq %r8,%rax
cmovzq %r9,%rsi
movq %rax,0(%rdi)
cmovzq %r10,%rbx
movq %rsi,8(%rdi)
cmovzq %r11,%rbp
movq %rbx,16(%rdi)
movq %rbp,24(%rdi)
movq 0(%rsp),%r12
.cfi_restore %r12
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size cneg_mod_256,.-cneg_mod_256
.globl sub_mod_256
.hidden sub_mod_256
.type sub_mod_256,@function
.align 32
sub_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
subq 0(%rdx),%r8
movq 0(%rcx),%rax
sbbq 8(%rdx),%r9
movq 8(%rcx),%rsi
sbbq 16(%rdx),%r10
movq 16(%rcx),%rbx
sbbq 24(%rdx),%r11
movq 24(%rcx),%rbp
sbbq %rdx,%rdx
andq %rdx,%rax
andq %rdx,%rsi
andq %rdx,%rbx
andq %rdx,%rbp
addq %rax,%r8
adcq %rsi,%r9
movq %r8,0(%rdi)
adcq %rbx,%r10
movq %r9,8(%rdi)
adcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size sub_mod_256,.-sub_mod_256
.globl check_mod_256
.hidden check_mod_256
.type check_mod_256,@function
.align 32
check_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rdi),%rax
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq %rax,%r8
orq %r9,%rax
orq %r10,%rax
orq %r11,%rax
subq 0(%rsi),%r8
sbbq 8(%rsi),%r9
sbbq 16(%rsi),%r10
sbbq 24(%rsi),%r11
sbbq %rsi,%rsi
movq $1,%rdx
cmpq $0,%rax
cmovneq %rdx,%rax
andq %rsi,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size check_mod_256,.-check_mod_256
.globl add_n_check_mod_256
.hidden add_n_check_mod_256
.type add_n_check_mod_256,@function
.align 32
add_n_check_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
addq 0(%rdx),%r8
adcq 8(%rdx),%r9
movq %r8,%rax
adcq 16(%rdx),%r10
movq %r9,%rsi
adcq 24(%rdx),%r11
sbbq %rdx,%rdx
movq %r10,%rbx
subq 0(%rcx),%r8
sbbq 8(%rcx),%r9
sbbq 16(%rcx),%r10
movq %r11,%rbp
sbbq 24(%rcx),%r11
sbbq $0,%rdx
cmovcq %rax,%r8
cmovcq %rsi,%r9
movq %r8,0(%rdi)
cmovcq %rbx,%r10
movq %r9,8(%rdi)
cmovcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
orq %r9,%r8
orq %r11,%r10
orq %r10,%r8
movq $1,%rax
cmovzq %r8,%rax
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size add_n_check_mod_256,.-add_n_check_mod_256
.globl sub_n_check_mod_256
.hidden sub_n_check_mod_256
.type sub_n_check_mod_256,@function
.align 32
sub_n_check_mod_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
subq 0(%rdx),%r8
movq 0(%rcx),%rax
sbbq 8(%rdx),%r9
movq 8(%rcx),%rsi
sbbq 16(%rdx),%r10
movq 16(%rcx),%rbx
sbbq 24(%rdx),%r11
movq 24(%rcx),%rbp
sbbq %rdx,%rdx
andq %rdx,%rax
andq %rdx,%rsi
andq %rdx,%rbx
andq %rdx,%rbp
addq %rax,%r8
adcq %rsi,%r9
movq %r8,0(%rdi)
adcq %rbx,%r10
movq %r9,8(%rdi)
adcq %rbp,%r11
movq %r10,16(%rdi)
movq %r11,24(%rdi)
orq %r9,%r8
orq %r11,%r10
orq %r10,%r8
movq $1,%rax
cmovzq %r8,%rax
movq 8(%rsp),%rbx
.cfi_restore %rbx
movq 16(%rsp),%rbp
.cfi_restore %rbp
leaq 24(%rsp),%rsp
.cfi_adjust_cfa_offset -24
.byte 0xf3,0xc3
.cfi_endproc
.size sub_n_check_mod_256,.-sub_n_check_mod_256
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:


@@ -0,0 +1,931 @@
.text
.globl add_mod_384
.hidden add_mod_384
.type add_mod_384,%function
.align 5
add_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size add_mod_384,.-add_mod_384
.type __add_mod_384,%function
.align 5
__add_mod_384:
ldp x10,x11,[x1]
ldp x16,x17,[x2]
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
__add_mod_384_ab_are_loaded:
adds x10,x10,x16
adcs x11,x11,x17
adcs x12,x12,x19
adcs x13,x13,x20
adcs x14,x14,x21
adcs x15,x15,x22
adc x3,xzr,xzr
subs x16,x10,x4
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr
csel x10,x10,x16,lo
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.size __add_mod_384,.-__add_mod_384
.globl add_mod_384x
.hidden add_mod_384x
.type add_mod_384x,%function
.align 5
add_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __add_mod_384
stp x10,x11,[x0]
add x1,x1,#48
stp x12,x13,[x0,#16]
add x2,x2,#48
stp x14,x15,[x0,#32]
bl __add_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size add_mod_384x,.-add_mod_384x
.globl rshift_mod_384
.hidden rshift_mod_384
.type rshift_mod_384,%function
.align 5
rshift_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
.Loop_rshift_mod_384:
sub x2,x2,#1
bl __rshift_mod_384
cbnz x2,.Loop_rshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size rshift_mod_384,.-rshift_mod_384
.type __rshift_mod_384,%function
.align 5
__rshift_mod_384:
sbfx x22,x10,#0,#1
and x16,x22,x4
and x17,x22,x5
adds x10,x10,x16
and x19,x22,x6
adcs x11,x11,x17
and x20,x22,x7
adcs x12,x12,x19
and x21,x22,x8
adcs x13,x13,x20
and x22,x22,x9
adcs x14,x14,x21
extr x10,x11,x10,#1 // a[0:5] >>= 1
adcs x15,x15,x22
extr x11,x12,x11,#1
adc x22,xzr,xzr
extr x12,x13,x12,#1
extr x13,x14,x13,#1
extr x14,x15,x14,#1
extr x15,x22,x15,#1
ret
.size __rshift_mod_384,.-__rshift_mod_384
.globl div_by_2_mod_384
.hidden div_by_2_mod_384
.type div_by_2_mod_384,%function
.align 5
div_by_2_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __rshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size div_by_2_mod_384,.-div_by_2_mod_384
.globl lshift_mod_384
.hidden lshift_mod_384
.type lshift_mod_384,%function
.align 5
lshift_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
.Loop_lshift_mod_384:
sub x2,x2,#1
bl __lshift_mod_384
cbnz x2,.Loop_lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size lshift_mod_384,.-lshift_mod_384
.type __lshift_mod_384,%function
.align 5
__lshift_mod_384:
adds x10,x10,x10
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr
subs x16,x10,x4
sbcs x17,x11,x5
sbcs x19,x12,x6
sbcs x20,x13,x7
sbcs x21,x14,x8
sbcs x22,x15,x9
sbcs xzr,x3,xzr
csel x10,x10,x16,lo
csel x11,x11,x17,lo
csel x12,x12,x19,lo
csel x13,x13,x20,lo
csel x14,x14,x21,lo
csel x15,x15,x22,lo
ret
.size __lshift_mod_384,.-__lshift_mod_384
.globl mul_by_3_mod_384
.hidden mul_by_3_mod_384
.type mul_by_3_mod_384,%function
.align 5
mul_by_3_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_3_mod_384,.-mul_by_3_mod_384
.globl mul_by_8_mod_384
.hidden mul_by_8_mod_384
.type mul_by_8_mod_384,%function
.align 5
mul_by_8_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_8_mod_384,.-mul_by_8_mod_384
.globl mul_by_3_mod_384x
.hidden mul_by_3_mod_384x
.type mul_by_3_mod_384x,%function
.align 5
mul_by_3_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
bl __add_mod_384_ab_are_loaded
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384
ldp x16,x17,[x1,#48]
ldp x19,x20,[x1,#64]
ldp x21,x22,[x1,#80]
bl __add_mod_384_ab_are_loaded
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_3_mod_384x,.-mul_by_3_mod_384x
.globl mul_by_8_mod_384x
.hidden mul_by_8_mod_384x
.type mul_by_8_mod_384x,%function
.align 5
mul_by_8_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
ldp x14,x15,[x1,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __lshift_mod_384
bl __lshift_mod_384
bl __lshift_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_8_mod_384x,.-mul_by_8_mod_384x
.globl cneg_mod_384
.hidden cneg_mod_384
.type cneg_mod_384,%function
.align 5
cneg_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x10,x11,[x1]
ldp x4,x5,[x3]
ldp x12,x13,[x1,#16]
ldp x6,x7,[x3,#16]
subs x16,x4,x10
ldp x14,x15,[x1,#32]
ldp x8,x9,[x3,#32]
orr x3,x10,x11
sbcs x17,x5,x11
orr x3,x3,x12
sbcs x19,x6,x12
orr x3,x3,x13
sbcs x20,x7,x13
orr x3,x3,x14
sbcs x21,x8,x14
orr x3,x3,x15
sbc x22,x9,x15
cmp x3,#0
csetm x3,ne
ands x2,x2,x3
csel x10,x10,x16,eq
csel x11,x11,x17,eq
csel x12,x12,x19,eq
csel x13,x13,x20,eq
stp x10,x11,[x0]
csel x14,x14,x21,eq
stp x12,x13,[x0,#16]
csel x15,x15,x22,eq
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size cneg_mod_384,.-cneg_mod_384
.globl sub_mod_384
.hidden sub_mod_384
.type sub_mod_384,%function
.align 5
sub_mod_384:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
stp x14,x15,[x0,#32]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size sub_mod_384,.-sub_mod_384
.type __sub_mod_384,%function
.align 5
__sub_mod_384:
ldp x10,x11,[x1]
ldp x16,x17,[x2]
ldp x12,x13,[x1,#16]
ldp x19,x20,[x2,#16]
ldp x14,x15,[x1,#32]
ldp x21,x22,[x2,#32]
subs x10,x10,x16
sbcs x11,x11,x17
sbcs x12,x12,x19
sbcs x13,x13,x20
sbcs x14,x14,x21
sbcs x15,x15,x22
sbc x3,xzr,xzr
and x16,x4,x3
and x17,x5,x3
adds x10,x10,x16
and x19,x6,x3
adcs x11,x11,x17
and x20,x7,x3
adcs x12,x12,x19
and x21,x8,x3
adcs x13,x13,x20
and x22,x9,x3
adcs x14,x14,x21
adc x15,x15,x22
ret
.size __sub_mod_384,.-__sub_mod_384
.globl sub_mod_384x
.hidden sub_mod_384x
.type sub_mod_384x,%function
.align 5
sub_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x3]
ldp x6,x7,[x3,#16]
ldp x8,x9,[x3,#32]
bl __sub_mod_384
stp x10,x11,[x0]
add x1,x1,#48
stp x12,x13,[x0,#16]
add x2,x2,#48
stp x14,x15,[x0,#32]
bl __sub_mod_384
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size sub_mod_384x,.-sub_mod_384x
.globl mul_by_1_plus_i_mod_384x
.hidden mul_by_1_plus_i_mod_384x
.type mul_by_1_plus_i_mod_384x,%function
.align 5
mul_by_1_plus_i_mod_384x:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x4,x5,[x2]
ldp x6,x7,[x2,#16]
ldp x8,x9,[x2,#32]
add x2,x1,#48
bl __sub_mod_384 // a->re - a->im
ldp x16,x17,[x1]
ldp x19,x20,[x1,#16]
ldp x21,x22,[x1,#32]
stp x10,x11,[x0]
ldp x10,x11,[x1,#48]
stp x12,x13,[x0,#16]
ldp x12,x13,[x1,#64]
stp x14,x15,[x0,#32]
ldp x14,x15,[x1,#80]
bl __add_mod_384_ab_are_loaded // a->re + a->im
ldr x30,[sp,#8]
stp x10,x11,[x0,#48]
stp x12,x13,[x0,#64]
stp x14,x15,[x0,#80]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x
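For reference: with i^2 = -1, (re + im*i) * (1 + i) = (re - im) + (re + im)*i, which is why the routine above is just one __sub_mod_384 on (a->re, a->im) followed by one __add_mod_384 on the same pair, writing the two halves of the result.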
.globl sgn0_pty_mod_384
.hidden sgn0_pty_mod_384
.type sgn0_pty_mod_384,%function
.align 5
sgn0_pty_mod_384:
ldp x10,x11,[x0]
ldp x12,x13,[x0,#16]
ldp x14,x15,[x0,#32]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
ldp x8,x9,[x1,#32]
and x0,x10,#1
adds x10,x10,x10
adcs x11,x11,x11
adcs x12,x12,x12
adcs x13,x13,x13
adcs x14,x14,x14
adcs x15,x15,x15
adc x3,xzr,xzr
subs x10,x10,x4
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x3,x3,xzr
mvn x3,x3
and x3,x3,#2
orr x0,x0,x3
ret
.size sgn0_pty_mod_384,.-sgn0_pty_mod_384
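sgn0_pty_mod_384 packs two bits: bit 0 is the parity of |a|, bit 1 is set when 2*a >= p, the "sign" convention used by hash-to-curve style encodings. A hypothetical 6-limb sketch, with the usual caveat that the name and the C rendering are this sketch's own:

#include <stdint.h>

static uint64_t sgn0_pty_mod_384_ref(const uint64_t a[6], const uint64_t p[6])
{
    uint64_t dbl[6], carry = 0, borrow = 0;

    for (int i = 0; i < 6; i++) {                 /* dbl = 2*a, carry out      */
        unsigned __int128 t = ((unsigned __int128)a[i] << 1) + carry;
        dbl[i] = (uint64_t)t;
        carry  = (uint64_t)(t >> 64);
    }
    for (int i = 0; i < 6; i++) {                 /* trial subtraction of p    */
        unsigned __int128 t = (unsigned __int128)dbl[i] - p[i] - borrow;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    uint64_t sign = (carry | (borrow ^ 1)) & 1;   /* 1 iff 2*a >= p            */
    return (a[0] & 1) | (sign << 1);
}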
.globl sgn0_pty_mod_384x
.hidden sgn0_pty_mod_384x
.type sgn0_pty_mod_384x,%function
.align 5
sgn0_pty_mod_384x:
ldp x10,x11,[x0]
ldp x12,x13,[x0,#16]
ldp x14,x15,[x0,#32]
ldp x4,x5,[x1]
ldp x6,x7,[x1,#16]
ldp x8,x9,[x1,#32]
and x2,x10,#1
orr x3,x10,x11
adds x10,x10,x10
orr x3,x3,x12
adcs x11,x11,x11
orr x3,x3,x13
adcs x12,x12,x12
orr x3,x3,x14
adcs x13,x13,x13
orr x3,x3,x15
adcs x14,x14,x14
adcs x15,x15,x15
adc x16,xzr,xzr
subs x10,x10,x4
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x16,x16,xzr
ldp x10,x11,[x0,#48]
ldp x12,x13,[x0,#64]
ldp x14,x15,[x0,#80]
mvn x16,x16
and x16,x16,#2
orr x2,x2,x16
and x0,x10,#1
orr x1,x10,x11
adds x10,x10,x10
orr x1,x1,x12
adcs x11,x11,x11
orr x1,x1,x13
adcs x12,x12,x12
orr x1,x1,x14
adcs x13,x13,x13
orr x1,x1,x15
adcs x14,x14,x14
adcs x15,x15,x15
adc x16,xzr,xzr
subs x10,x10,x4
sbcs x11,x11,x5
sbcs x12,x12,x6
sbcs x13,x13,x7
sbcs x14,x14,x8
sbcs x15,x15,x9
sbc x16,x16,xzr
mvn x16,x16
and x16,x16,#2
orr x0,x0,x16
cmp x3,#0
csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re)
cmp x1,#0
csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re)
and x3,x3,#1
and x1,x1,#2
orr x0,x1,x3 // pack sign and parity
ret
.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
.globl vec_select_48
.hidden vec_select_48
.type vec_select_48,%function
.align 5
vec_select_48:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
bit v1.16b, v4.16b, v6.16b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0]
ret
.size vec_select_48,.-vec_select_48
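vec_select_48 and the wider variants below are constant-time selects: the whole 48/96/144/192/288-byte operand |a| is copied when |sel_a| is non-zero, |b| otherwise, with no data-dependent branch (cmeq builds the mask, BIT merges). A hypothetical byte-wise sketch with the length made an explicit parameter:

#include <stddef.h>
#include <stdint.h>

static void vec_select_ref(void *ret, const void *a, const void *b,
                           size_t n, uint64_t sel_a)
{
    unsigned char mask = (unsigned char)(0U - (sel_a != 0));  /* 0xFF or 0x00 */
    unsigned char *r = ret;
    const unsigned char *pa = a, *pb = b;

    for (size_t i = 0; i < n; i++)
        r[i] = (unsigned char)((pa[i] & mask) | (pb[i] & ~mask));
}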
.globl vec_select_96
.hidden vec_select_96
.type vec_select_96,%function
.align 5
vec_select_96:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_96,.-vec_select_96
.globl vec_select_192
.hidden vec_select_192
.type vec_select_192,%function
.align 5
vec_select_192:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_192,.-vec_select_192
.globl vec_select_144
.hidden vec_select_144
.type vec_select_144,%function
.align 5
vec_select_144:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
bit v1.16b, v4.16b, v6.16b
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0]
ret
.size vec_select_144,.-vec_select_144
.globl vec_select_288
.hidden vec_select_288
.type vec_select_288,%function
.align 5
vec_select_288:
dup v6.2d, x3
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
cmeq v6.2d, v6.2d, #0
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48
bit v17.16b, v20.16b, v6.16b
ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0],#48
bit v0.16b, v3.16b, v6.16b
ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48
bit v1.16b, v4.16b, v6.16b
ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48
bit v2.16b, v5.16b, v6.16b
st1 {v0.2d, v1.2d, v2.2d}, [x0],#48
bit v16.16b, v19.16b, v6.16b
bit v17.16b, v20.16b, v6.16b
bit v18.16b, v21.16b, v6.16b
st1 {v16.2d, v17.2d, v18.2d}, [x0]
ret
.size vec_select_288,.-vec_select_288
.globl vec_prefetch
.hidden vec_prefetch
.type vec_prefetch,%function
.align 5
vec_prefetch:
add x1, x1, x0
sub x1, x1, #1
mov x2, #64
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
csel x2, xzr, x2, hi
prfm pldl1keep, [x0]
add x0, x0, x2
cmp x0, x1
csel x0, x1, x0, hi
prfm pldl1keep, [x0]
ret
.size vec_prefetch,.-vec_prefetch
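vec_prefetch(ptr, len) just warms the cache lines that cover [ptr, ptr+len), clamping at the last byte; the assembly unrolls exactly seven prfm pldl1keep's and stops advancing once it reaches the end. A rough sketch using the GCC/Clang prefetch builtin (this sketch's own choice, not part of the commit):

#include <stddef.h>

static void vec_prefetch_ref(const void *ptr, size_t len)
{
    const char *p   = (const char *)ptr;
    const char *end = p + len - 1;                /* last byte to cover        */

    for (int i = 0; i < 7; i++) {                 /* seven prefetches, as above */
        __builtin_prefetch(p);
        p = (p + 64 > end) ? end : p + 64;        /* clamp, as the csel's do   */
    }
}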

File diff suppressed because it is too large.


@@ -0,0 +1,252 @@
.text
.type __add_mod_384x384,@function
.align 32
__add_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq 48(%rsi),%r14
addq 0(%rdx),%r8
movq 56(%rsi),%r15
adcq 8(%rdx),%r9
movq 64(%rsi),%rax
adcq 16(%rdx),%r10
movq 72(%rsi),%rbx
adcq 24(%rdx),%r11
movq 80(%rsi),%rbp
adcq 32(%rdx),%r12
movq 88(%rsi),%rsi
adcq 40(%rdx),%r13
movq %r8,0(%rdi)
adcq 48(%rdx),%r14
movq %r9,8(%rdi)
adcq 56(%rdx),%r15
movq %r10,16(%rdi)
adcq 64(%rdx),%rax
movq %r12,32(%rdi)
movq %r14,%r8
adcq 72(%rdx),%rbx
movq %r11,24(%rdi)
movq %r15,%r9
adcq 80(%rdx),%rbp
movq %r13,40(%rdi)
movq %rax,%r10
adcq 88(%rdx),%rsi
movq %rbx,%r11
sbbq %rdx,%rdx
subq 0(%rcx),%r14
sbbq 8(%rcx),%r15
movq %rbp,%r12
sbbq 16(%rcx),%rax
sbbq 24(%rcx),%rbx
sbbq 32(%rcx),%rbp
movq %rsi,%r13
sbbq 40(%rcx),%rsi
sbbq $0,%rdx
cmovcq %r8,%r14
cmovcq %r9,%r15
cmovcq %r10,%rax
movq %r14,48(%rdi)
cmovcq %r11,%rbx
movq %r15,56(%rdi)
cmovcq %r12,%rbp
movq %rax,64(%rdi)
cmovcq %r13,%rsi
movq %rbx,72(%rdi)
movq %rbp,80(%rdi)
movq %rsi,88(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __add_mod_384x384,.-__add_mod_384x384
.type __sub_mod_384x384,@function
.align 32
__sub_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq 48(%rsi),%r14
subq 0(%rdx),%r8
movq 56(%rsi),%r15
sbbq 8(%rdx),%r9
movq 64(%rsi),%rax
sbbq 16(%rdx),%r10
movq 72(%rsi),%rbx
sbbq 24(%rdx),%r11
movq 80(%rsi),%rbp
sbbq 32(%rdx),%r12
movq 88(%rsi),%rsi
sbbq 40(%rdx),%r13
movq %r8,0(%rdi)
sbbq 48(%rdx),%r14
movq 0(%rcx),%r8
movq %r9,8(%rdi)
sbbq 56(%rdx),%r15
movq 8(%rcx),%r9
movq %r10,16(%rdi)
sbbq 64(%rdx),%rax
movq 16(%rcx),%r10
movq %r11,24(%rdi)
sbbq 72(%rdx),%rbx
movq 24(%rcx),%r11
movq %r12,32(%rdi)
sbbq 80(%rdx),%rbp
movq 32(%rcx),%r12
movq %r13,40(%rdi)
sbbq 88(%rdx),%rsi
movq 40(%rcx),%r13
sbbq %rdx,%rdx
andq %rdx,%r8
andq %rdx,%r9
andq %rdx,%r10
andq %rdx,%r11
andq %rdx,%r12
andq %rdx,%r13
addq %r8,%r14
adcq %r9,%r15
movq %r14,48(%rdi)
adcq %r10,%rax
movq %r15,56(%rdi)
adcq %r11,%rbx
movq %rax,64(%rdi)
adcq %r12,%rbp
movq %rbx,72(%rdi)
adcq %r13,%rsi
movq %rbp,80(%rdi)
movq %rsi,88(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __sub_mod_384x384,.-__sub_mod_384x384
.globl add_mod_384x384
.hidden add_mod_384x384
.type add_mod_384x384,@function
.align 32
add_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
call __add_mod_384x384
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size add_mod_384x384,.-add_mod_384x384
.globl sub_mod_384x384
.hidden sub_mod_384x384
.type sub_mod_384x384,@function
.align 32
sub_mod_384x384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
call __sub_mod_384x384
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size sub_mod_384x384,.-sub_mod_384x384
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:


@@ -0,0 +1,784 @@
.text
.globl ct_inverse_mod_256
.type ct_inverse_mod_256, %function
.align 5
ct_inverse_mod_256:
.inst 0xd503233f
stp x29, x30, [sp,#-80]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
sub sp, sp, #1040
ldp x4, x5, [x1,#8*0]
ldp x6, x7, [x1,#8*2]
add x1, sp, #16+511 // find closest 512-byte-aligned spot
and x1, x1, #-512 // in the frame...
str x0, [sp]
ldp x8, x9, [x2,#8*0]
ldp x10, x11, [x2,#8*2]
stp x4, x5, [x1,#8*0] // copy input to |a|
stp x6, x7, [x1,#8*2]
stp x8, x9, [x1,#8*4] // copy modulus to |b|
stp x10, x11, [x1,#8*6]
////////////////////////////////////////// first iteration
bl .Lab_approximation_31_256_loaded
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
str x12,[x0,#8*8] // initialize |u| with |f0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to dst |b|
bl __smul_256_n_shift_by_31
str x12, [x0,#8*9] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
ldr x8, [x1,#8*8] // |u|
ldr x9, [x1,#8*13] // |v|
madd x4, x16, x8, xzr // |u|*|f0|
madd x4, x17, x9, x4 // |v|*|g0|
str x4, [x0,#8*4]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*5]
stp x5, x5, [x0,#8*7]
madd x4, x12, x8, xzr // |u|*|f1|
madd x4, x13, x9, x4 // |v|*|g1|
str x4, [x0,#8*9]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*10]
stp x5, x5, [x0,#8*12]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
adc x22, x22, x23
stp x22, x22, [x0,#8*4]
stp x22, x22, [x0,#8*6]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
bl __ab_approximation_31_256
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_256_n_shift_by_31
mov x16, x12 // corrected |f0|
mov x17, x13 // corrected |g0|
mov x12, x14 // |f1|
mov x13, x15 // |g1|
add x0, x0, #8*4 // pointer to destination |b|
bl __smul_256_n_shift_by_31
add x0, x0, #8*4 // pointer to destination |u|
bl __smul_256x63
adc x22, x22, x23
str x22, [x0,#8*4]
mov x16, x12 // corrected |f1|
mov x17, x13 // corrected |g1|
add x0, x0, #8*5 // pointer to destination |v|
bl __smul_256x63
bl __smul_512x63_tail
////////////////////////////////////////// two[!] last iterations
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #47 // 31 + 512 % 31
//bl __ab_approximation_62_256 // |a| and |b| are exact,
ldr x7, [x1,#8*0] // just load
ldr x11, [x1,#8*4]
bl __inner_loop_62_256
mov x16, x14
mov x17, x15
ldr x0, [sp] // original out_ptr
bl __smul_256x63
bl __smul_512x63_tail
ldr x30, [x29,#8]
smulh x20, x7, x17 // figure out top-most limb
ldp x8, x9, [x3,#8*0]
adc x23, x23, x25
ldp x10, x11, [x3,#8*2]
add x20, x20, x23 // x20 is 1, 0 or -1
asr x19, x20, #63 // sign as mask
and x23, x8, x19 // add mod<<256 conditionally
and x24, x9, x19
adds x4, x4, x23
and x25, x10, x19
adcs x5, x5, x24
and x26, x11, x19
adcs x6, x6, x25
adcs x7, x22, x26
adc x20, x20, xzr // x20 is 1, 0 or -1
neg x19, x20
orr x20, x20, x19 // excess bit or sign as mask
asr x19, x19, #63 // excess bit as mask
and x8, x8, x20 // mask |mod|
and x9, x9, x20
and x10, x10, x20
and x11, x11, x20
eor x8, x8, x19 // conditionally negate |mod|
eor x9, x9, x19
adds x8, x8, x19, lsr#63
eor x10, x10, x19
adcs x9, x9, xzr
eor x11, x11, x19
adcs x10, x10, xzr
adc x11, x11, xzr
adds x4, x4, x8 // final adjustment for |mod|<<256
adcs x5, x5, x9
adcs x6, x6, x10
stp x4, x5, [x0,#8*4]
adc x7, x7, x11
stp x6, x7, [x0,#8*6]
add sp, sp, #1040
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldr x29, [sp],#80
.inst 0xd50323bf
ret
.size ct_inverse_mod_256,.-ct_inverse_mod_256
////////////////////////////////////////////////////////////////////////
.type __smul_256x63, %function
.align 5
__smul_256x63:
ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|)
asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x6, x7, [x1,#8*2+64]
eor x16, x16, x14 // conditionally negate |f_| (or |g_|)
ldr x22, [x1,#8*4+64]
eor x4, x4, x14 // conditionally negate |u| (or |v|)
sub x16, x16, x14
eor x5, x5, x14
adds x4, x4, x14, lsr#63
eor x6, x6, x14
adcs x5, x5, xzr
eor x7, x7, x14
adcs x6, x6, xzr
eor x22, x22, x14
umulh x19, x4, x16
adcs x7, x7, xzr
umulh x20, x5, x16
adcs x22, x22, xzr
umulh x21, x6, x16
mul x4, x4, x16
cmp x16, #0
mul x5, x5, x16
csel x22, x22, xzr, ne
mul x6, x6, x16
adds x5, x5, x19
mul x24, x7, x16
adcs x6, x6, x20
adcs x24, x24, x21
adc x26, xzr, xzr
ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|)
asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x10, x11, [x1,#8*2+104]
eor x17, x17, x14 // conditionally negate |f_| (or |g_|)
ldr x23, [x1,#8*4+104]
eor x8, x8, x14 // conditionally negate |u| (or |v|)
sub x17, x17, x14
eor x9, x9, x14
adds x8, x8, x14, lsr#63
eor x10, x10, x14
adcs x9, x9, xzr
eor x11, x11, x14
adcs x10, x10, xzr
eor x23, x23, x14
umulh x19, x8, x17
adcs x11, x11, xzr
umulh x20, x9, x17
adcs x23, x23, xzr
umulh x21, x10, x17
adc x15, xzr, xzr // used in __smul_512x63_tail
mul x8, x8, x17
cmp x17, #0
mul x9, x9, x17
csel x23, x23, xzr, ne
mul x10, x10, x17
adds x9, x9, x19
mul x25, x11, x17
adcs x10, x10, x20
adcs x25, x25, x21
adc x26, x26, xzr
adds x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
stp x4, x5, [x0,#8*0]
adcs x24, x24, x25
stp x6, x24, [x0,#8*2]
ret
.size __smul_256x63,.-__smul_256x63
.type __smul_512x63_tail, %function
.align 5
__smul_512x63_tail:
umulh x24, x7, x16
ldp x5, x6, [x1,#8*18] // load rest of |v|
adc x26, x26, xzr
ldr x7, [x1,#8*20]
and x22, x22, x16
umulh x11, x11, x17 // resume |v|*|g1| chain
sub x24, x24, x22 // tie up |u|*|f1| chain
asr x25, x24, #63
eor x5, x5, x14 // conditionally negate rest of |v|
eor x6, x6, x14
adds x5, x5, x15
eor x7, x7, x14
adcs x6, x6, xzr
umulh x19, x23, x17
adc x7, x7, xzr
umulh x20, x5, x17
add x11, x11, x26
umulh x21, x6, x17
mul x4, x23, x17
mul x5, x5, x17
adds x4, x4, x11
mul x6, x6, x17
adcs x5, x5, x19
mul x22, x7, x17
adcs x6, x6, x20
adcs x22, x22, x21
adc x23, xzr, xzr // used in the final step
adds x4, x4, x24
adcs x5, x5, x25
adcs x6, x6, x25
stp x4, x5, [x0,#8*4]
adcs x22, x22, x25 // carry is used in the final step
stp x6, x22, [x0,#8*6]
ret
.size __smul_512x63_tail,.-__smul_512x63_tail
.type __smul_256_n_shift_by_31, %function
.align 5
__smul_256_n_shift_by_31:
ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|)
asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x6, x7, [x1,#8*2+0]
eor x25, x12, x24 // conditionally negate |f0| (or |g0|)
eor x4, x4, x24 // conditionally negate |a| (or |b|)
sub x25, x25, x24
eor x5, x5, x24
adds x4, x4, x24, lsr#63
eor x6, x6, x24
adcs x5, x5, xzr
eor x7, x7, x24
umulh x19, x4, x25
adcs x6, x6, xzr
umulh x20, x5, x25
adc x7, x7, xzr
umulh x21, x6, x25
and x24, x24, x25
umulh x22, x7, x25
neg x24, x24
mul x4, x4, x25
mul x5, x5, x25
mul x6, x6, x25
adds x5, x5, x19
mul x7, x7, x25
adcs x6, x6, x20
adcs x7, x7, x21
adc x22, x22, x24
ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|)
asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x10, x11, [x1,#8*2+32]
eor x25, x13, x24 // conditionally negate |f0| (or |g0|)
eor x8, x8, x24 // conditionally negate |a| (or |b|)
sub x25, x25, x24
eor x9, x9, x24
adds x8, x8, x24, lsr#63
eor x10, x10, x24
adcs x9, x9, xzr
eor x11, x11, x24
umulh x19, x8, x25
adcs x10, x10, xzr
umulh x20, x9, x25
adc x11, x11, xzr
umulh x21, x10, x25
and x24, x24, x25
umulh x23, x11, x25
neg x24, x24
mul x8, x8, x25
mul x9, x9, x25
mul x10, x10, x25
adds x9, x9, x19
mul x11, x11, x25
adcs x10, x10, x20
adcs x11, x11, x21
adc x23, x23, x24
adds x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
adcs x7, x7, x11
adc x8, x22, x23
extr x4, x5, x4, #31
extr x5, x6, x5, #31
extr x6, x7, x6, #31
asr x23, x8, #63 // result's sign as mask
extr x7, x8, x7, #31
eor x4, x4, x23 // ensure the result is positive
eor x5, x5, x23
adds x4, x4, x23, lsr#63
eor x6, x6, x23
adcs x5, x5, xzr
eor x7, x7, x23
adcs x6, x6, xzr
stp x4, x5, [x0,#8*0]
adc x7, x7, xzr
stp x6, x7, [x0,#8*2]
eor x12, x12, x23 // adjust |f/g| accordingly
eor x13, x13, x23
sub x12, x12, x23
sub x13, x13, x23
ret
.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
.type __ab_approximation_31_256, %function
.align 4
__ab_approximation_31_256:
ldp x6, x7, [x1,#8*2]
ldp x10, x11, [x1,#8*6]
ldp x4, x5, [x1,#8*0]
ldp x8, x9, [x1,#8*4]
.Lab_approximation_31_256_loaded:
orr x19, x7, x11 // check top-most limbs, ...
cmp x19, #0
csel x7, x7, x6, ne
csel x11, x11, x10, ne
csel x6, x6, x5, ne
orr x19, x7, x11 // and ones before top-most, ...
csel x10, x10, x9, ne
cmp x19, #0
csel x7, x7, x6, ne
csel x11, x11, x10, ne
csel x6, x6, x4, ne
orr x19, x7, x11 // and one more, ...
csel x10, x10, x8, ne
clz x19, x19
cmp x19, #64
csel x19, x19, xzr, ne
csel x7, x7, x6, ne
csel x11, x11, x10, ne
neg x20, x19
lslv x7, x7, x19 // align high limbs to the left
lslv x11, x11, x19
lsrv x6, x6, x20
lsrv x10, x10, x20
and x6, x6, x20, asr#6
and x10, x10, x20, asr#6
orr x7, x7, x6
orr x11, x11, x10
bfxil x7, x4, #0, #31
bfxil x11, x8, #0, #31
b __inner_loop_31_256
ret
.size __ab_approximation_31_256,.-__ab_approximation_31_256
.type __inner_loop_31_256, %function
.align 4
__inner_loop_31_256:
mov x2, #31
mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov x23,#0x7FFFFFFF7FFFFFFF
.Loop_31_256:
sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
and x19, x11, x22
sub x20, x11, x7 // |b_|-|a_|
subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x19, x15
csel x11, x11, x7, hs // |b_| = |a_|
csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x15, x15, x13, hs // exchange |fg0| and |fg1|
csel x13, x13, x19, hs
lsr x7, x7, #1
and x19, x15, x22
and x20, x23, x22
sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add x15, x15, x15 // |f1|<<=1
add x13, x13, x20
sub x15, x15, x23
cbnz x2, .Loop_31_256
mov x23, #0x7FFFFFFF
ubfx x12, x13, #0, #32
ubfx x13, x13, #32, #32
ubfx x14, x15, #0, #32
ubfx x15, x15, #32, #32
sub x12, x12, x23 // remove bias
sub x13, x13, x23
sub x14, x14, x23
sub x15, x15, x23
ret
.size __inner_loop_31_256,.-__inner_loop_31_256
.type __inner_loop_62_256, %function
.align 4
__inner_loop_62_256:
mov x12, #1 // |f0|=1
mov x13, #0 // |g0|=0
mov x14, #0 // |f1|=0
mov x15, #1 // |g1|=1
.Loop_62_256:
sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
and x19, x11, x22
sub x20, x11, x7 // |b_|-|a_|
subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x19, x12
csel x11, x11, x7, hs // |b_| = |a_|
csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
mov x20, x13
csel x12, x12, x14, hs // exchange |f0| and |f1|
csel x14, x14, x19, hs
csel x13, x13, x15, hs // exchange |g0| and |g1|
csel x15, x15, x20, hs
lsr x7, x7, #1
and x19, x14, x22
and x20, x15, x22
add x14, x14, x14 // |f1|<<=1
add x15, x15, x15 // |g1|<<=1
sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62_256
ret
.size __inner_loop_62_256,.-__inner_loop_62_256
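A hypothetical C rendering of one .Loop_62_256 round above, the branchless binary-GCD-style step at the heart of ct_inverse_mod_256. |a_| and |b_| hold the low limbs (|b_| odd), while f0, g0, f1, g1 accumulate the transformation the caller later applies to the full-width |u| and |v|; the coefficients live in plain 64-bit registers and are interpreted as two's-complement integers. The assembly does every select with csel; the if() below is for readability only, so this is not a constant-time substitute.

#include <stdint.h>

static void inner_loop_62_ref(uint64_t *a_, uint64_t *b_, unsigned n,
                              uint64_t *f0, uint64_t *g0,
                              uint64_t *f1, uint64_t *g1)
{
    uint64_t a = *a_, b = *b_;

    *f0 = 1; *g0 = 0;                             /* start from the identity   */
    *f1 = 0; *g1 = 1;

    while (n--) {
        uint64_t odd = (uint64_t)0 - (a & 1);     /* sbfx: all-ones iff a odd  */
        uint64_t t   = b & odd;                   /* subtrahend: b or 0        */
        uint64_t bma = b - a;
        uint64_t amt = a - t;
        int borrow   = a < t;                     /* only possible when a odd  */

        if (borrow) {                             /* a odd and a < b: swap roles */
            uint64_t tf = *f0, tg = *g0;
            b = a;  a = bma;                      /* (a, b) = (b - a, a)       */
            *f0 = *f1;  *g0 = *g1;
            *f1 = tf;   *g1 = tg;
        } else {
            a = amt;                              /* a -= b if odd, else a -= 0 */
        }
        a >>= 1;                                  /* a is even at this point   */
        *f0 -= *f1 & odd;                         /* track the subtraction ... */
        *g0 -= *g1 & odd;
        *f1 += *f1;                               /* ... and the halving of a  */
        *g1 += *g1;
    }
    *a_ = a; *b_ = b;
}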

File diff suppressed because it is too large.


@@ -0,0 +1,717 @@
.text
.globl ct_inverse_mod_383
.type ct_inverse_mod_383, %function
.align 5
ct_inverse_mod_383:
.inst 0xd503233f
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #1040
ldp x22, x4, [x1,#8*0]
ldp x5, x6, [x1,#8*2]
ldp x7, x8, [x1,#8*4]
add x1, sp, #16+511 // find closest 512-byte-aligned spot
and x1, x1, #-512 // in the frame...
stp x0, x3, [sp]
ldp x9, x10, [x2,#8*0]
ldp x11, x12, [x2,#8*2]
ldp x13, x14, [x2,#8*4]
stp x22, x4, [x1,#8*0] // copy input to |a|
stp x5, x6, [x1,#8*2]
stp x7, x8, [x1,#8*4]
stp x9, x10, [x1,#8*6] // copy modulus to |b|
stp x11, x12, [x1,#8*8]
stp x13, x14, [x1,#8*10]
////////////////////////////////////////// first iteration
mov x2, #62
bl .Lab_approximation_62_loaded
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
str x15,[x0,#8*12] // initialize |u| with |f0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to dst |b|
bl __smul_383_n_shift_by_62
str x15, [x0,#8*12] // initialize |v| with |f1|
////////////////////////////////////////// second iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
ldr x7, [x1,#8*12] // |u|
ldr x8, [x1,#8*18] // |v|
mul x3, x20, x7 // |u|*|f0|
smulh x4, x20, x7
mul x5, x21, x8 // |v|*|g0|
smulh x6, x21, x8
adds x3, x3, x5
adc x4, x4, x6
stp x3, x4, [x0,#8*6]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*8]
stp x5, x5, [x0,#8*10]
mul x3, x15, x7 // |u|*|f1|
smulh x4, x15, x7
mul x5, x16, x8 // |v|*|g1|
smulh x6, x16, x8
adds x3, x3, x5
adc x4, x4, x6
stp x3, x4, [x0,#8*12]
asr x5, x4, #63 // sign extension
stp x5, x5, [x0,#8*14]
stp x5, x5, [x0,#8*16]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
asr x27, x27, #63 // sign extension
stp x27, x27, [x0,#8*6]
stp x27, x27, [x0,#8*8]
stp x27, x27, [x0,#8*10]
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
bl __ab_approximation_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
bl __smul_383_n_shift_by_62
mov x20, x15 // corrected |f0|
mov x21, x16 // corrected |g0|
mov x15, x17 // |f1|
mov x16, x19 // |g1|
add x0, x0, #8*6 // pointer to destination |b|
bl __smul_383_n_shift_by_62
add x0, x0, #8*6 // pointer to destination |u|
bl __smul_383x63
mov x20, x15 // corrected |f1|
mov x21, x16 // corrected |g1|
add x0, x0, #8*6 // pointer to destination |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// iteration before last
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldp x3, x8, [x1,#8*0] // just load
ldp x9, x14, [x1,#8*6]
bl __inner_loop_62
eor x0, x1, #256 // pointer to dst |a|b|u|v|
str x3, [x0,#8*0]
str x9, [x0,#8*6]
mov x20, x15 // exact |f0|
mov x21, x16 // exact |g0|
mov x15, x17
mov x16, x19
add x0, x0, #8*12 // pointer to dst |u|
bl __smul_383x63
mov x20, x15 // exact |f1|
mov x21, x16 // exact |g1|
add x0, x0, #8*6 // pointer to dst |v|
bl __smul_383x63
bl __smul_767x63_tail
////////////////////////////////////////// last iteration
eor x1, x1, #256 // flip-flop src |a|b|u|v|
mov x2, #22 // 766 % 62
//bl __ab_approximation_62 // |a| and |b| are exact,
ldr x3, [x1,#8*0] // just load
eor x8, x8, x8
ldr x9, [x1,#8*6]
eor x14, x14, x14
bl __inner_loop_62
mov x20, x17
mov x21, x19
ldp x0, x15, [sp] // original out_ptr and n_ptr
bl __smul_383x63
bl __smul_767x63_tail
ldr x30, [x29,#8]
asr x22, x8, #63 // sign as mask
ldp x9, x10, [x15,#8*0]
ldp x11, x12, [x15,#8*2]
ldp x13, x14, [x15,#8*4]
and x9, x9, x22 // add mod<<384 conditionally
and x10, x10, x22
adds x3, x3, x9
and x11, x11, x22
adcs x4, x4, x10
and x12, x12, x22
adcs x5, x5, x11
and x13, x13, x22
adcs x6, x6, x12
and x14, x14, x22
stp x3, x4, [x0,#8*6]
adcs x7, x7, x13
stp x5, x6, [x0,#8*8]
adc x8, x8, x14
stp x7, x8, [x0,#8*10]
add sp, sp, #1040
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
.inst 0xd50323bf
ret
.size ct_inverse_mod_383,.-ct_inverse_mod_383
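For orientation, the following variable-time Python sketch shows the relation a routine of this shape computes: a modular inverse via the binary extended GCD, maintaining u*x = a (mod p) and v*x = b (mod p) until b is the gcd. It is only a reference model; the assembly above additionally works in fixed 383-bit limbs, batches 62 bits at a time through __ab_approximation_62/__smul_383_n_shift_by_62, and is constant-time, none of which this sketch reproduces. The function name is illustrative, not part of the library API.

def modinv_binary(x, p):
    # Variable-time binary extended GCD (reference only, p odd):
    # invariants u*x = a (mod p) and v*x = b (mod p).
    a, b, u, v = x % p, p, 1, 0
    while a != 0:
        if a & 1 == 0:
            a >>= 1
            u = u >> 1 if u & 1 == 0 else (u + p) >> 1   # halve u mod p
        elif a >= b:
            a = (a - b) >> 1
            u -= v
            u = u >> 1 if u & 1 == 0 else (u + p) >> 1
        else:
            a, b, u, v = b, a, v, u      # swap so the odd a is the larger one
    assert b == 1, "input not invertible"
    return v % p                          # v*x = 1 (mod p)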
////////////////////////////////////////////////////////////////////////
// see corresponding commentary in ctx_inverse_mod_384-x86_64...
.type __smul_383x63, %function
.align 5
__smul_383x63:
ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|)
asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x5, x6, [x1,#8*2+96]
eor x20, x20, x17 // conditionally negate |f_| (or |g_|)
ldp x7, x8, [x1,#8*4+96]
eor x3, x3, x17 // conditionally negate |u| (or |v|)
sub x20, x20, x17
eor x4, x4, x17
adds x3, x3, x17, lsr#63
eor x5, x5, x17
adcs x4, x4, xzr
eor x6, x6, x17
adcs x5, x5, xzr
eor x7, x7, x17
adcs x6, x6, xzr
umulh x22, x3, x20
eor x8, x8, x17
umulh x23, x4, x20
adcs x7, x7, xzr
umulh x24, x5, x20
adcs x8, x8, xzr
umulh x25, x6, x20
umulh x26, x7, x20
mul x3, x3, x20
mul x4, x4, x20
mul x5, x5, x20
adds x4, x4, x22
mul x6, x6, x20
adcs x5, x5, x23
mul x7, x7, x20
adcs x6, x6, x24
mul x27,x8, x20
adcs x7, x7, x25
adcs x27,x27,x26
adc x2, xzr, xzr
ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|)
asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s)
ldp x11, x12, [x1,#8*2+144]
eor x21, x21, x17 // conditionally negate |f_| (or |g_|)
ldp x13, x14, [x1,#8*4+144]
eor x9, x9, x17 // conditionally negate |u| (or |v|)
sub x21, x21, x17
eor x10, x10, x17
adds x9, x9, x17, lsr#63
eor x11, x11, x17
adcs x10, x10, xzr
eor x12, x12, x17
adcs x11, x11, xzr
eor x13, x13, x17
adcs x12, x12, xzr
umulh x22, x9, x21
eor x14, x14, x17
umulh x23, x10, x21
adcs x13, x13, xzr
umulh x24, x11, x21
adcs x14, x14, xzr
umulh x25, x12, x21
adc x19, xzr, xzr // used in __smul_767x63_tail
umulh x26, x13, x21
mul x9, x9, x21
mul x10, x10, x21
mul x11, x11, x21
adds x10, x10, x22
mul x12, x12, x21
adcs x11, x11, x23
mul x13, x13, x21
adcs x12, x12, x24
mul x28,x14, x21
adcs x13, x13, x25
adcs x28,x28,x26
adc x2, x2, xzr
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
stp x3, x4, [x0,#8*0]
adcs x7, x7, x13
stp x5, x6, [x0,#8*2]
adcs x27, x27, x28
stp x7, x27, [x0,#8*4]
adc x28, x2, xzr // used in __smul_767x63_tail
ret
.size __smul_383x63,.-__smul_383x63
.type __smul_767x63_tail, %function
.align 5
__smul_767x63_tail:
smulh x27, x8, x20
ldp x3, x4, [x1,#8*24] // load rest of |v|
umulh x14,x14, x21
ldp x5, x6, [x1,#8*26]
ldp x7, x8, [x1,#8*28]
eor x3, x3, x17 // conditionally negate rest of |v|
eor x4, x4, x17
eor x5, x5, x17
adds x3, x3, x19
eor x6, x6, x17
adcs x4, x4, xzr
eor x7, x7, x17
adcs x5, x5, xzr
eor x8, x8, x17
adcs x6, x6, xzr
umulh x22, x3, x21
adcs x7, x7, xzr
umulh x23, x4, x21
adc x8, x8, xzr
umulh x24, x5, x21
add x14, x14, x28
umulh x25, x6, x21
asr x28, x27, #63
umulh x26, x7, x21
mul x3, x3, x21
mul x4, x4, x21
mul x5, x5, x21
adds x3, x3, x14
mul x6, x6, x21
adcs x4, x4, x22
mul x7, x7, x21
adcs x5, x5, x23
mul x8, x8, x21
adcs x6, x6, x24
adcs x7, x7, x25
adc x8, x8, x26
adds x3, x3, x27
adcs x4, x4, x28
adcs x5, x5, x28
adcs x6, x6, x28
stp x3, x4, [x0,#8*6]
adcs x7, x7, x28
stp x5, x6, [x0,#8*8]
adc x8, x8, x28
stp x7, x8, [x0,#8*10]
ret
.size __smul_767x63_tail,.-__smul_767x63_tail
.type __smul_383_n_shift_by_62, %function
.align 5
__smul_383_n_shift_by_62:
ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|)
asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x5, x6, [x1,#8*2+0]
eor x2, x15, x28 // conditionally negate |f0| (or |g0|)
ldp x7, x8, [x1,#8*4+0]
eor x3, x3, x28 // conditionally negate |a| (or |b|)
sub x2, x2, x28
eor x4, x4, x28
adds x3, x3, x28, lsr#63
eor x5, x5, x28
adcs x4, x4, xzr
eor x6, x6, x28
adcs x5, x5, xzr
eor x7, x7, x28
umulh x22, x3, x2
adcs x6, x6, xzr
umulh x23, x4, x2
eor x8, x8, x28
umulh x24, x5, x2
adcs x7, x7, xzr
umulh x25, x6, x2
adc x8, x8, xzr
umulh x26, x7, x2
smulh x27, x8, x2
mul x3, x3, x2
mul x4, x4, x2
mul x5, x5, x2
adds x4, x4, x22
mul x6, x6, x2
adcs x5, x5, x23
mul x7, x7, x2
adcs x6, x6, x24
mul x8, x8, x2
adcs x7, x7, x25
adcs x8, x8 ,x26
adc x27, x27, xzr
ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|)
asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s)
ldp x11, x12, [x1,#8*2+48]
eor x2, x16, x28 // conditionally negate |f0| (or |g0|)
ldp x13, x14, [x1,#8*4+48]
eor x9, x9, x28 // conditionally negate |a| (or |b|)
sub x2, x2, x28
eor x10, x10, x28
adds x9, x9, x28, lsr#63
eor x11, x11, x28
adcs x10, x10, xzr
eor x12, x12, x28
adcs x11, x11, xzr
eor x13, x13, x28
umulh x22, x9, x2
adcs x12, x12, xzr
umulh x23, x10, x2
eor x14, x14, x28
umulh x24, x11, x2
adcs x13, x13, xzr
umulh x25, x12, x2
adc x14, x14, xzr
umulh x26, x13, x2
smulh x28, x14, x2
mul x9, x9, x2
mul x10, x10, x2
mul x11, x11, x2
adds x10, x10, x22
mul x12, x12, x2
adcs x11, x11, x23
mul x13, x13, x2
adcs x12, x12, x24
mul x14, x14, x2
adcs x13, x13, x25
adcs x14, x14 ,x26
adc x28, x28, xzr
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
adcs x7, x7, x13
adcs x8, x8, x14
adc x9, x27, x28
extr x3, x4, x3, #62
extr x4, x5, x4, #62
extr x5, x6, x5, #62
asr x28, x9, #63
extr x6, x7, x6, #62
extr x7, x8, x7, #62
extr x8, x9, x8, #62
eor x3, x3, x28
eor x4, x4, x28
adds x3, x3, x28, lsr#63
eor x5, x5, x28
adcs x4, x4, xzr
eor x6, x6, x28
adcs x5, x5, xzr
eor x7, x7, x28
adcs x6, x6, xzr
eor x8, x8, x28
stp x3, x4, [x0,#8*0]
adcs x7, x7, xzr
stp x5, x6, [x0,#8*2]
adc x8, x8, xzr
stp x7, x8, [x0,#8*4]
eor x15, x15, x28
eor x16, x16, x28
sub x15, x15, x28
sub x16, x16, x28
ret
.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62
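At the integer level, __smul_383_n_shift_by_62 computes (f0*a + g0*b) >> 62 and hands back factors whose signs match a non-negative result; the closing eor/sub on x15/x16 flips them when the shifted value had to be negated. A hedged sketch with arbitrary-precision integers (names illustrative):

def smul_n_shift(a, b, f0, g0, k=62):
    # The inner loop arranges for f0*a + g0*b to be divisible by 2^k,
    # so the shift below is exact.
    t = (f0 * a + g0 * b) >> k
    if t < 0:                     # normalize sign, flip the factors to match
        t, f0, g0 = -t, -f0, -g0
    return t, f0, g0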
.type __ab_approximation_62, %function
.align 4
__ab_approximation_62:
ldp x7, x8, [x1,#8*4]
ldp x13, x14, [x1,#8*10]
ldp x5, x6, [x1,#8*2]
ldp x11, x12, [x1,#8*8]
.Lab_approximation_62_loaded:
orr x22, x8, x14 // check top-most limbs, ...
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x6, ne
orr x22, x8, x14 // ... ones before top-most, ...
csel x13, x13, x12, ne
ldp x3, x4, [x1,#8*0]
ldp x9, x10, [x1,#8*6]
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x5, ne
orr x22, x8, x14 // ... and ones before that ...
csel x13, x13, x11, ne
cmp x22, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x4, ne
orr x22, x8, x14
csel x13, x13, x10, ne
clz x22, x22
cmp x22, #64
csel x22, x22, xzr, ne
csel x8, x8, x7, ne
csel x14, x14, x13, ne
neg x23, x22
lslv x8, x8, x22 // align high limbs to the left
lslv x14, x14, x22
lsrv x7, x7, x23
lsrv x13, x13, x23
and x7, x7, x23, asr#6
and x13, x13, x23, asr#6
orr x8, x8, x7
orr x14, x14, x13
b __inner_loop_62
ret
.size __ab_approximation_62,.-__ab_approximation_62
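__ab_approximation_62 builds 128-bit stand-ins for |a| and |b|: the exact low limb plus one shared top word taken at the bit position of the most significant set bit of the larger operand (that is what the clz/lslv/lsrv alignment does). A sketch on arbitrary-precision integers, under that reading:

def ab_approximation(a, b, w=64):
    # Exact low word, plus a shared top word aligned to max(a, b).
    n = max(a.bit_length(), b.bit_length())
    if n <= 2 * w:
        return a, b                    # small enough to use exactly
    s = n - w                          # bit offset of the shared top word
    lo = (1 << w) - 1
    return ((a >> s) << w) | (a & lo), ((b >> s) << w) | (b & lo)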
.type __inner_loop_62, %function
.align 4
__inner_loop_62:
mov x15, #1 // |f0|=1
mov x16, #0 // |g0|=0
mov x17, #0 // |f1|=0
mov x19, #1 // |g1|=1
.Loop_62:
sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting
sub x2, x2, #1
subs x24, x9, x3 // |b_|-|a_|
and x22, x9, x28
sbc x25, x14, x8
and x23, x14, x28
subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even)
mov x22, x15
sbcs x27, x8, x23
mov x23, x16
csel x9, x9, x3, hs // |b_| = |a_|
csel x14, x14, x8, hs
csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x8, x27, x25, hs
csel x15, x15, x17, hs // exchange |f0| and |f1|
csel x17, x17, x22, hs
csel x16, x16, x19, hs // exchange |g0| and |g1|
csel x19, x19, x23, hs
extr x3, x8, x3, #1
lsr x8, x8, #1
and x22, x17, x28
and x23, x19, x28
add x17, x17, x17 // |f1|<<=1
add x19, x19, x19 // |g1|<<=1
sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...)
cbnz x2, .Loop_62
ret
.size __inner_loop_62,.-__inner_loop_62
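The register comments in __inner_loop_62 describe the classic binary-GCD transition step; the csel chains just make it branchless. A variable-time Python model of one n-step batch: when run on the exact |a| and |b|, the factors satisfy f0*a0 + g0*b0 = a*2^n and f1*a0 + g1*b0 = b*2^n afterwards, which is what lets the caller shift the updated values right by 62.

def inner_loop(a, b, n=62):
    f0, g0, f1, g1 = 1, 0, 0, 1
    for _ in range(n):
        if a & 1:                      # if |a_| is odd, we'll be subtracting
            if a < b:                  # borrow means |a_| < |b_|
                a, b = b, a
                f0, f1 = f1, f0        # exchange |f0| and |f1|
                g0, g1 = g1, g0        # exchange |g0| and |g1|
            a -= b
            f0 -= f1                   # |f0| -= |f1|
            g0 -= g1                   # |g0| -= |g1|
        a >>= 1
        f1 += f1                       # |f1| <<= 1
        g1 += g1                       # |g1| <<= 1
    return f0, g0, f1, g1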

View file

@@ -0,0 +1,324 @@
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384, %function
.align 5
ct_is_square_mod_384:
.inst 0xd503233f
stp x29, x30, [sp,#-128]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
sub sp, sp, #512
ldp x3, x4, [x0,#8*0] // load input
ldp x5, x6, [x0,#8*2]
ldp x7, x8, [x0,#8*4]
add x0, sp, #255 // find closest 256-byte-aligned spot
and x0, x0, #-256 // in the frame...
ldp x9, x10, [x1,#8*0] // load modulus
ldp x11, x12, [x1,#8*2]
ldp x13, x14, [x1,#8*4]
stp x3, x4, [x0,#8*6] // copy input to |a|
stp x5, x6, [x0,#8*8]
stp x7, x8, [x0,#8*10]
stp x9, x10, [x0,#8*0] // copy modulus to |b|
stp x11, x12, [x0,#8*2]
stp x13, x14, [x0,#8*4]
eor x2, x2, x2 // init the Legendre symbol
mov x15, #24 // 24 is 768/30-1
b .Loop_is_square
.align 4
.Loop_is_square:
bl __ab_approximation_30
sub x15, x15, #1
eor x1, x0, #128 // pointer to dst |b|
bl __smul_384_n_shift_by_30
mov x19, x16 // |f0|
mov x20, x17 // |g0|
add x1, x1, #8*6 // pointer to dst |a|
bl __smul_384_n_shift_by_30
ldp x9, x10, [x1,#-8*6]
eor x0, x0, #128 // flip-flop src |a|b|
and x27, x27, x9 // if |a| was negative,
add x2, x2, x27, lsr#1 // adjust |L|
cbnz x15, .Loop_is_square
////////////////////////////////////////// last iteration
//bl __ab_approximation_30 // |a| and |b| are exact,
//ldr x8, [x0,#8*6] // just load
mov x14, x9 // ldr x14, [x0,#8*0]
mov x15, #48 // 48 is 768%30 + 30
bl __inner_loop_48
ldr x30, [x29,#8]
and x0, x2, #1
eor x0, x0, #1
add sp, sp, #512
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldr x29, [sp],#128
.inst 0xd50323bf
ret
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smul_384_n_shift_by_30, %function
.align 5
__smul_384_n_shift_by_30:
ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|)
asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s)
ldp x5, x6, [x0,#8*2+0]
eor x20, x20, x27 // conditionally negate |g1| (or |f1|)
ldp x7, x8, [x0,#8*4+0]
eor x3, x3, x27 // conditionally negate |b| (or |a|)
sub x20, x20, x27
eor x4, x4, x27
adds x3, x3, x27, lsr#63
eor x5, x5, x27
adcs x4, x4, xzr
eor x6, x6, x27
adcs x5, x5, xzr
eor x7, x7, x27
umulh x21, x3, x20
adcs x6, x6, xzr
umulh x22, x4, x20
eor x8, x8, x27
umulh x23, x5, x20
adcs x7, x7, xzr
umulh x24, x6, x20
adc x8, x8, xzr
umulh x25, x7, x20
and x28, x20, x27
umulh x26, x8, x20
neg x28, x28
mul x3, x3, x20
mul x4, x4, x20
mul x5, x5, x20
adds x4, x4, x21
mul x6, x6, x20
adcs x5, x5, x22
mul x7, x7, x20
adcs x6, x6, x23
mul x8, x8, x20
adcs x7, x7, x24
adcs x8, x8 ,x25
adc x26, x26, x28
ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|)
asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s)
ldp x11, x12, [x0,#8*2+48]
eor x19, x19, x27 // conditionally negate |g1| (or |f1|)
ldp x13, x14, [x0,#8*4+48]
eor x9, x9, x27 // conditionally negate |b| (or |a|)
sub x19, x19, x27
eor x10, x10, x27
adds x9, x9, x27, lsr#63
eor x11, x11, x27
adcs x10, x10, xzr
eor x12, x12, x27
adcs x11, x11, xzr
eor x13, x13, x27
umulh x21, x9, x19
adcs x12, x12, xzr
umulh x22, x10, x19
eor x14, x14, x27
umulh x23, x11, x19
adcs x13, x13, xzr
umulh x24, x12, x19
adc x14, x14, xzr
umulh x25, x13, x19
and x28, x19, x27
umulh x27, x14, x19
neg x28, x28
mul x9, x9, x19
mul x10, x10, x19
mul x11, x11, x19
adds x10, x10, x21
mul x12, x12, x19
adcs x11, x11, x22
mul x13, x13, x19
adcs x12, x12, x23
mul x14, x14, x19
adcs x13, x13, x24
adcs x14, x14 ,x25
adc x27, x27, x28
adds x3, x3, x9
adcs x4, x4, x10
adcs x5, x5, x11
adcs x6, x6, x12
adcs x7, x7, x13
adcs x8, x8, x14
adc x9, x26, x27
extr x3, x4, x3, #30
extr x4, x5, x4, #30
extr x5, x6, x5, #30
asr x27, x9, #63
extr x6, x7, x6, #30
extr x7, x8, x7, #30
extr x8, x9, x8, #30
eor x3, x3, x27
eor x4, x4, x27
adds x3, x3, x27, lsr#63
eor x5, x5, x27
adcs x4, x4, xzr
eor x6, x6, x27
adcs x5, x5, xzr
eor x7, x7, x27
adcs x6, x6, xzr
eor x8, x8, x27
stp x3, x4, [x1,#8*0]
adcs x7, x7, xzr
stp x5, x6, [x1,#8*2]
adc x8, x8, xzr
stp x7, x8, [x1,#8*4]
ret
.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
.type __ab_approximation_30, %function
.align 4
__ab_approximation_30:
ldp x13, x14, [x0,#8*4] // |a| is still in registers
ldp x11, x12, [x0,#8*2]
orr x21, x8, x14 // check top-most limbs, ...
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x6, ne
orr x21, x8, x14 // ... ones before top-most, ...
csel x13, x13, x12, ne
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x5, ne
orr x21, x8, x14 // ... and ones before that ...
csel x13, x13, x11, ne
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x4, ne
orr x21, x8, x14 // and one more, ...
csel x13, x13, x10, ne
cmp x21, #0
csel x8, x8, x7, ne
csel x14, x14, x13, ne
csel x7, x7, x3, ne
orr x21, x8, x14
csel x13, x13, x9, ne
clz x21, x21
cmp x21, #64
csel x21, x21, xzr, ne
csel x8, x8, x7, ne
csel x14, x14, x13, ne
neg x22, x21
lslv x8, x8, x21 // align high limbs to the left
lslv x14, x14, x21
lsrv x7, x7, x22
lsrv x13, x13, x22
and x7, x7, x22, asr#6
and x13, x13, x22, asr#6
orr x8, x8, x7
orr x14, x14, x13
bfxil x8, x3, #0, #32
bfxil x14, x9, #0, #32
b __inner_loop_30
ret
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30, %function
.align 4
__inner_loop_30:
mov x28, #30
mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0
mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1
mov x27,#0x7FFFFFFF7FFFFFFF
.Loop_30:
sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting
and x25, x8, x14
sub x28, x28, #1
and x21, x14, x24
sub x22, x14, x8 // |b_|-|a_|
subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even)
add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1
mov x21, x20
csel x14, x14, x8, hs // |b_| = |a_|
csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x20, x20, x17, hs // exchange |fg0| and |fg1|
csel x17, x17, x21, hs
csel x2, x2, x25, hs
lsr x8, x8, #1
and x21, x20, x24
and x22, x27, x24
add x23, x14, #2
sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even)
add x20, x20, x20 // |f1|<<=1
add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5
add x17, x17, x22
sub x20, x20, x27
cbnz x28, .Loop_30
mov x27, #0x7FFFFFFF
ubfx x16, x17, #0, #32
ubfx x17, x17, #32, #32
ubfx x19, x20, #0, #32
ubfx x20, x20, #32, #32
sub x16, x16, x27 // remove the bias
sub x17, x17, x27
sub x19, x19, x27
sub x20, x20, x27
ret
.size __inner_loop_30,.-__inner_loop_30
.type __inner_loop_48, %function
.align 4
__inner_loop_48:
.Loop_48:
sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting
and x25, x8, x14
sub x15, x15, #1
and x21, x14, x24
sub x22, x14, x8 // |b_|-|a_|
subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even)
add x25, x2, x25, lsr#1
csel x14, x14, x8, hs // |b_| = |a_|
csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_|
csel x2, x2, x25, hs
add x23, x14, #2
lsr x8, x8, #1
add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5
cbnz x15, .Loop_48
ret
.size __inner_loop_48,.-__inner_loop_48
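The |L| updates above — "L + (a_ & b_) >> 1" and '"negate" |L| if |b|%8 is 3 or 5' — are the standard quadratic-reciprocity bookkeeping, so the routine is effectively evaluating a Jacobi symbol. A variable-time Python reference for the same decision (for the odd prime moduli this is used with, a symbol of +1 means the input is a square; this is not the library's constant-time packed-counter formulation):

def is_square_mod(a, m):
    # Jacobi symbol (a/m) for odd m > 0; True iff it evaluates to +1.
    a %= m
    t = 1
    while a:
        while a & 1 == 0:
            a >>= 1
            if m % 8 in (3, 5):        # (2/m) = -1 when m = +-3 mod 8
                t = -t
        a, m = m, a                    # reciprocity: swap the arguments
        if a % 4 == 3 and m % 4 == 3:  # both = 3 mod 4 flips the sign
            t = -t
        a %= m
    return m == 1 and t == 1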

View file

@@ -0,0 +1,479 @@
.text
.globl ct_is_square_mod_384
.type ct_is_square_mod_384,@function
.align 32
ct_is_square_mod_384:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $536,%rsp
.cfi_adjust_cfa_offset 536
leaq 24+255(%rsp),%rax
andq $-256,%rax
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq 40(%rdi),%r13
movq 0(%rsi),%r14
movq 8(%rsi),%r15
movq 16(%rsi),%rbx
movq 24(%rsi),%rcx
movq 32(%rsi),%rdx
movq 40(%rsi),%rdi
movq %rax,%rsi
movq %r8,0(%rax)
movq %r9,8(%rax)
movq %r10,16(%rax)
movq %r11,24(%rax)
movq %r12,32(%rax)
movq %r13,40(%rax)
movq %r14,48(%rax)
movq %r15,56(%rax)
movq %rbx,64(%rax)
movq %rcx,72(%rax)
movq %rdx,80(%rax)
movq %rdi,88(%rax)
xorq %rbp,%rbp
movl $24,%ecx
jmp .Loop_is_square
.align 32
.Loop_is_square:
movl %ecx,16(%rsp)
call __ab_approximation_30
movq %rax,0(%rsp)
movq %rbx,8(%rsp)
movq $128+48,%rdi
xorq %rsi,%rdi
call __smulq_384_n_shift_by_30
movq 0(%rsp),%rdx
movq 8(%rsp),%rcx
leaq -48(%rdi),%rdi
call __smulq_384_n_shift_by_30
movl 16(%rsp),%ecx
xorq $128,%rsi
andq 48(%rdi),%r14
shrq $1,%r14
addq %r14,%rbp
subl $1,%ecx
jnz .Loop_is_square
movq 48(%rsi),%r9
call __inner_loop_48
movq $1,%rax
andq %rbp,%rax
xorq $1,%rax
leaq 536(%rsp),%r8
movq 0(%r8),%r15
.cfi_restore %r15
movq 8(%r8),%r14
.cfi_restore %r14
movq 16(%r8),%r13
.cfi_restore %r13
movq 24(%r8),%r12
.cfi_restore %r12
movq 32(%r8),%rbx
.cfi_restore %rbx
movq 40(%r8),%rbp
.cfi_restore %rbp
leaq 48(%r8),%rsp
.cfi_adjust_cfa_offset -536-8*6
.byte 0xf3,0xc3
.cfi_endproc
.size ct_is_square_mod_384,.-ct_is_square_mod_384
.type __smulq_384_n_shift_by_30,@function
.align 32
__smulq_384_n_shift_by_30:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq %rdx,%rbx
sarq $63,%rdx
xorq %rax,%rax
subq %rdx,%rax
xorq %rdx,%rbx
addq %rax,%rbx
xorq %rdx,%r8
xorq %rdx,%r9
xorq %rdx,%r10
xorq %rdx,%r11
xorq %rdx,%r12
xorq %rdx,%r13
addq %r8,%rax
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
movq %rdx,%r14
andq %rbx,%r14
mulq %rbx
movq %rax,%r8
movq %r9,%rax
movq %rdx,%r9
mulq %rbx
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbx
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbx
addq %rax,%r11
movq %r12,%rax
adcq $0,%rdx
movq %rdx,%r12
mulq %rbx
addq %rax,%r12
movq %r13,%rax
adcq $0,%rdx
movq %rdx,%r13
negq %r14
mulq %rbx
addq %rax,%r13
adcq %rdx,%r14
leaq 48(%rsi),%rsi
movq %rcx,%rdx
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq %rdx,%rbx
sarq $63,%rdx
xorq %rax,%rax
subq %rdx,%rax
xorq %rdx,%rbx
addq %rax,%rbx
xorq %rdx,%r8
xorq %rdx,%r9
xorq %rdx,%r10
xorq %rdx,%r11
xorq %rdx,%r12
xorq %rdx,%r13
addq %r8,%rax
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
movq %rdx,%r15
andq %rbx,%r15
mulq %rbx
movq %rax,%r8
movq %r9,%rax
movq %rdx,%r9
mulq %rbx
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbx
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbx
addq %rax,%r11
movq %r12,%rax
adcq $0,%rdx
movq %rdx,%r12
mulq %rbx
addq %rax,%r12
movq %r13,%rax
adcq $0,%rdx
movq %rdx,%r13
negq %r15
mulq %rbx
addq %rax,%r13
adcq %rdx,%r15
leaq -48(%rsi),%rsi
addq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq %r15,%r14
shrdq $30,%r9,%r8
shrdq $30,%r10,%r9
shrdq $30,%r11,%r10
shrdq $30,%r12,%r11
shrdq $30,%r13,%r12
shrdq $30,%r14,%r13
sarq $63,%r14
xorq %rbx,%rbx
subq %r14,%rbx
xorq %r14,%r8
xorq %r14,%r9
xorq %r14,%r10
xorq %r14,%r11
xorq %r14,%r12
xorq %r14,%r13
addq %rbx,%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30
.type __ab_approximation_30,@function
.align 32
__ab_approximation_30:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 88(%rsi),%rbx
movq 80(%rsi),%r15
movq 72(%rsi),%r14
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r11,%r12
movq 64(%rsi),%r11
cmovzq %r14,%r15
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r10,%r12
movq 56(%rsi),%r10
cmovzq %r11,%r15
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r9,%r12
movq 48(%rsi),%r9
cmovzq %r10,%r15
movq %r13,%rax
orq %rbx,%rax
cmovzq %r12,%r13
cmovzq %r15,%rbx
cmovzq %r8,%r12
cmovzq %r9,%r15
movq %r13,%rax
orq %rbx,%rax
bsrq %rax,%rcx
leaq 1(%rcx),%rcx
cmovzq %r8,%r13
cmovzq %r9,%rbx
cmovzq %rax,%rcx
negq %rcx
shldq %cl,%r12,%r13
shldq %cl,%r15,%rbx
movq $0xFFFFFFFF00000000,%rax
movl %r8d,%r8d
movl %r9d,%r9d
andq %rax,%r13
andq %rax,%rbx
orq %r13,%r8
orq %rbx,%r9
jmp __inner_loop_30
.byte 0xf3,0xc3
.cfi_endproc
.size __ab_approximation_30,.-__ab_approximation_30
.type __inner_loop_30,@function
.align 32
__inner_loop_30:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq $0x7FFFFFFF80000000,%rbx
movq $0x800000007FFFFFFF,%rcx
leaq -1(%rbx),%r15
movl $30,%edi
.Loop_30:
movq %r8,%rax
andq %r9,%rax
shrq $1,%rax
cmpq %r9,%r8
movq %r8,%r10
movq %r9,%r11
leaq (%rax,%rbp,1),%rax
movq %rbx,%r12
movq %rcx,%r13
movq %rbp,%r14
cmovbq %r9,%r8
cmovbq %r10,%r9
cmovbq %rcx,%rbx
cmovbq %r12,%rcx
cmovbq %rax,%rbp
subq %r9,%r8
subq %rcx,%rbx
addq %r15,%rbx
testq $1,%r10
cmovzq %r10,%r8
cmovzq %r11,%r9
cmovzq %r12,%rbx
cmovzq %r13,%rcx
cmovzq %r14,%rbp
leaq 2(%r9),%rax
shrq $1,%r8
shrq $2,%rax
addq %rcx,%rcx
leaq (%rax,%rbp,1),%rbp
subq %r15,%rcx
subl $1,%edi
jnz .Loop_30
shrq $32,%r15
movl %ebx,%eax
shrq $32,%rbx
movl %ecx,%edx
shrq $32,%rcx
subq %r15,%rax
subq %r15,%rbx
subq %r15,%rdx
subq %r15,%rcx
.byte 0xf3,0xc3
.cfi_endproc
.size __inner_loop_30,.-__inner_loop_30
.type __inner_loop_48,@function
.align 32
__inner_loop_48:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movl $48,%edi
.Loop_48:
movq %r8,%rax
andq %r9,%rax
shrq $1,%rax
cmpq %r9,%r8
movq %r8,%r10
movq %r9,%r11
leaq (%rax,%rbp,1),%rax
movq %rbp,%r12
cmovbq %r9,%r8
cmovbq %r10,%r9
cmovbq %rax,%rbp
subq %r9,%r8
testq $1,%r10
cmovzq %r10,%r8
cmovzq %r11,%r9
cmovzq %r12,%rbp
leaq 2(%r9),%rax
shrq $1,%r8
shrq $2,%rax
addq %rax,%rbp
subl $1,%edi
jnz .Loop_48
.byte 0xf3,0xc3
.cfi_endproc
.size __inner_loop_48,.-__inner_loop_48
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

File diff suppressed because it is too large

File diff suppressed because it is too large

88
blst/elf/div3w-armv8.S Normal file
View file

@@ -0,0 +1,88 @@
.text
.globl div_3_limbs
.type div_3_limbs,%function
.align 5
div_3_limbs:
ldp x4,x5,[x0] // load R
eor x0,x0,x0 // Q = 0
mov x3,#64 // loop counter
nop
.Loop:
subs x6,x4,x1 // R - D
add x0,x0,x0 // Q <<= 1
sbcs x7,x5,x2
add x0,x0,#1 // Q + speculative bit
csel x4,x4,x6,lo // select between R and R - D
extr x1,x2,x1,#1 // D >>= 1
csel x5,x5,x7,lo
lsr x2,x2,#1
sbc x0,x0,xzr // subtract speculative bit
sub x3,x3,#1
cbnz x3,.Loop
asr x3,x0,#63 // top bit -> mask
add x0,x0,x0 // Q <<= 1
subs x6,x4,x1 // R - D
add x0,x0,#1 // Q + speculative bit
sbcs x7,x5,x2
sbc x0,x0,xzr // subtract speculative bit
orr x0,x0,x3 // all ones if overflow
ret
.size div_3_limbs,.-div_3_limbs
.globl quot_rem_128
.type quot_rem_128,%function
.align 5
quot_rem_128:
ldp x3,x4,[x1]
mul x5,x3,x2 // divisor[0:1] * quotient
umulh x6,x3,x2
mul x11, x4,x2
umulh x7,x4,x2
ldp x8,x9,[x0] // load 3 limbs of the dividend
ldr x10,[x0,#16]
adds x6,x6,x11
adc x7,x7,xzr
subs x8,x8,x5 // dividend - divisor * quotient
sbcs x9,x9,x6
sbcs x10,x10,x7
sbc x5,xzr,xzr // borrow -> mask
add x2,x2,x5 // if borrowed, adjust the quotient ...
and x3,x3,x5
and x4,x4,x5
adds x8,x8,x3 // ... and add divisor
adc x9,x9,x4
stp x8,x9,[x0] // save 2 limbs of the remainder
str x2,[x0,#16] // and one limb of the quotient
mov x0,x2 // return adjusted quotient
ret
.size quot_rem_128,.-quot_rem_128
.globl quot_rem_64
.type quot_rem_64,%function
.align 5
quot_rem_64:
ldr x3,[x1]
ldr x8,[x0] // load 1 limb of the dividend
mul x5,x3,x2 // divisor * quotient
sub x8,x8,x5 // dividend - divisor * quotient
stp x8,x2,[x0] // save remainder and quotient
mov x0,x2 // return quotient
ret
.size quot_rem_64,.-quot_rem_64
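div_3_limbs produces a one-limb quotient estimate by restoring division against a divisor that is shifted right each step, and quot_rem_128 corrects it: multiply back, subtract from the dividend, and on borrow add the divisor back once while decrementing the quotient. A Python sketch of that correction, assuming (as the single add-back implies) the estimate is at most one too large:

def quot_rem_128_model(dividend, divisor, q_est):
    # dividend: 3 limbs, divisor: 2 limbs, q_est: 1-limb estimate.
    r = dividend - divisor * q_est
    if r < 0:                # borrow: estimate was one too large
        q_est -= 1
        r += divisor         # ... and add divisor (cf. the masked adds above)
    return r, q_est          # remainder and adjusted quotient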

123
blst/elf/div3w-x86_64.s Normal file
View file

@@ -0,0 +1,123 @@
.text
.globl div_3_limbs
.hidden div_3_limbs
.type div_3_limbs,@function
.align 32
div_3_limbs:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq (%rdi),%r8
movq 8(%rdi),%r9
xorq %rax,%rax
movl $64,%ecx
.Loop:
movq %r8,%r10
subq %rsi,%r8
movq %r9,%r11
sbbq %rdx,%r9
leaq 1(%rax,%rax,1),%rax
movq %rdx,%rdi
cmovcq %r10,%r8
cmovcq %r11,%r9
sbbq $0,%rax
shlq $63,%rdi
shrq $1,%rsi
shrq $1,%rdx
orq %rdi,%rsi
subl $1,%ecx
jnz .Loop
leaq 1(%rax,%rax,1),%rcx
sarq $63,%rax
subq %rsi,%r8
sbbq %rdx,%r9
sbbq $0,%rcx
orq %rcx,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size div_3_limbs,.-div_3_limbs
.globl quot_rem_128
.hidden quot_rem_128
.type quot_rem_128,@function
.align 32
quot_rem_128:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq %rdx,%rax
movq %rdx,%rcx
mulq 0(%rsi)
movq %rax,%r8
movq %rcx,%rax
movq %rdx,%r9
mulq 8(%rsi)
addq %rax,%r9
adcq $0,%rdx
movq 0(%rdi),%r10
movq 8(%rdi),%r11
movq 16(%rdi),%rax
subq %r8,%r10
sbbq %r9,%r11
sbbq %rdx,%rax
sbbq %r8,%r8
addq %r8,%rcx
movq %r8,%r9
andq 0(%rsi),%r8
andq 8(%rsi),%r9
addq %r8,%r10
adcq %r9,%r11
movq %r10,0(%rdi)
movq %r11,8(%rdi)
movq %rcx,16(%rdi)
movq %rcx,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size quot_rem_128,.-quot_rem_128
.globl quot_rem_64
.hidden quot_rem_64
.type quot_rem_64,@function
.align 32
quot_rem_64:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq %rdx,%rax
imulq 0(%rsi),%rdx
movq 0(%rdi),%r10
subq %rdx,%r10
movq %r10,0(%rdi)
movq %rax,8(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size quot_rem_64,.-quot_rem_64
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

View file

@@ -0,0 +1,464 @@
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,%function
.align 5
mul_mont_sparse_256:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldp x10,x11,[x1]
ldr x9, [x2]
ldp x12,x13,[x1,#16]
mul x19,x10,x9
ldp x5,x6,[x3]
mul x20,x11,x9
ldp x7,x8,[x3,#16]
mul x21,x12,x9
mul x22,x13,x9
umulh x14,x10,x9
umulh x15,x11,x9
mul x3,x4,x19
umulh x16,x12,x9
umulh x17,x13,x9
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,xzr, x17
mul x17,x8,x3
ldr x9,[x2,8*1]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
ldr x9,[x2,8*2]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
ldr x9,[x2,8*3]
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
mul x14,x10,x9
adcs x20,x21,x15
mul x15,x11,x9
adcs x21,x22,x16
mul x16,x12,x9
adcs x22,x23,x17
mul x17,x13,x9
adc x23,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x9
adcs x20,x20,x15
umulh x15,x11,x9
adcs x21,x21,x16
mul x3,x4,x19
umulh x16,x12,x9
adcs x22,x22,x17
umulh x17,x13,x9
adc x23,x23,xzr
adds x20,x20,x14
//mul x14,x5,x3
adcs x21,x21,x15
mul x15,x6,x3
adcs x22,x22,x16
mul x16,x7,x3
adc x23,x23,x17
mul x17,x8,x3
subs xzr,x19,#1 //adds x19,x19,x14
umulh x14,x5,x3
adcs x20,x20,x15
umulh x15,x6,x3
adcs x21,x21,x16
umulh x16,x7,x3
adcs x22,x22,x17
umulh x17,x8,x3
adc x23,x23,xzr
adds x19,x20,x14
adcs x20,x21,x15
adcs x21,x22,x16
adcs x22,x23,x17
adc x23,xzr,xzr
subs x14,x19,x5
sbcs x15,x20,x6
sbcs x16,x21,x7
sbcs x17,x22,x8
sbcs xzr, x23,xzr
csel x19,x19,x14,lo
csel x20,x20,x15,lo
csel x21,x21,x16,lo
csel x22,x22,x17,lo
stp x19,x20,[x0]
stp x21,x22,[x0,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.size mul_mont_sparse_256,.-mul_mont_sparse_256
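mul_mont_sparse_256 follows the operand-scanning Montgomery multiplication pattern: for each word of b it accumulates a*b[i], derives m = n0*acc0 (the "mul x3,x4,x19" lines), and folds in m*p so the low word cancels — which is why the low-limb product is commented out and replaced by the "subs xzr,x19,#1" carry trick. A word-level Python sketch using the textbook convention n0 = -p^-1 mod 2^64 (the library's own constant may use a different sign convention; this is a model, not its API):

def mont_mul(a, b, p, n0, w=64, limbs=4):
    # Returns a * b * 2^(-w*limbs) mod p, assuming a, b < p < 2^(w*limbs)
    # and n0 = -p^(-1) mod 2^w.
    mask = (1 << w) - 1
    acc = 0
    for i in range(limbs):
        acc += a * ((b >> (w * i)) & mask)   # acc += a * b[i]
        m = (acc * n0) & mask                # chosen so the low word cancels
        acc = (acc + m * p) >> w             # exact shift: low word is zero
    return acc - p if acc >= p else acc      # final conditional subtraction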
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,%function
.align 5
sqr_mont_sparse_256:
.inst 0xd503233f
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
ldp x5,x6,[x1]
ldp x7,x8,[x1,#16]
mov x4,x3
////////////////////////////////////////////////////////////////
// | | | | | |a1*a0| |
// | | | | |a2*a0| | |
// | |a3*a2|a3*a0| | | |
// | | | |a2*a1| | | |
// | | |a3*a1| | | | |
// *| | | | | | | | 2|
// +|a3*a3|a2*a2|a1*a1|a0*a0|
// |--+--+--+--+--+--+--+--|
// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10
//
// "can't overflow" below mark carrying into high part of
// multiplication result, which can't overflow, because it
// can never be all ones.
mul x11,x6,x5 // a[1]*a[0]
umulh x15,x6,x5
mul x12,x7,x5 // a[2]*a[0]
umulh x16,x7,x5
mul x13,x8,x5 // a[3]*a[0]
umulh x19,x8,x5
adds x12,x12,x15 // accumulate high parts of multiplication
mul x14,x7,x6 // a[2]*a[1]
umulh x15,x7,x6
adcs x13,x13,x16
mul x16,x8,x6 // a[3]*a[1]
umulh x17,x8,x6
adc x19,x19,xzr // can't overflow
mul x20,x8,x7 // a[3]*a[2]
umulh x21,x8,x7
adds x15,x15,x16 // accumulate high parts of multiplication
mul x10,x5,x5 // a[0]*a[0]
adc x16,x17,xzr // can't overflow
adds x13,x13,x14 // accumulate low parts of multiplication
umulh x5,x5,x5
adcs x19,x19,x15
mul x15,x6,x6 // a[1]*a[1]
adcs x20,x20,x16
umulh x6,x6,x6
adc x21,x21,xzr // can't overflow
adds x11,x11,x11 // acc[1-6]*=2
mul x16,x7,x7 // a[2]*a[2]
adcs x12,x12,x12
umulh x7,x7,x7
adcs x13,x13,x13
mul x17,x8,x8 // a[3]*a[3]
adcs x19,x19,x19
umulh x8,x8,x8
adcs x20,x20,x20
adcs x21,x21,x21
adc x22,xzr,xzr
adds x11,x11,x5 // +a[i]*a[i]
adcs x12,x12,x15
adcs x13,x13,x6
adcs x19,x19,x16
adcs x20,x20,x7
adcs x21,x21,x17
adc x22,x22,x8
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
adds x10,x10,x19 // accumulate upper half
adcs x11,x11,x20
adcs x12,x12,x21
adcs x13,x13,x22
adc x19,xzr,xzr
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
sbcs xzr, x19,xzr
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldp x19,x20,[x29,#16]
ldp x21,x22,[x29,#32]
ldr x29,[sp],#48
.inst 0xd50323bf
ret
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
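The diagram in the comments of sqr_mont_sparse_256 is the usual squaring shortcut: form each off-diagonal product a[i]*a[j] once, double the partial sum, then add the squares a[i]^2 on the diagonal. A small Python check of that identity (illustrative only):

def square_via_diagram(a, w=64, limbs=4):
    mask = (1 << w) - 1
    aw = [(a >> (w * i)) & mask for i in range(limbs)]
    acc = 0
    for i in range(limbs):
        for j in range(i + 1, limbs):
            acc += (aw[i] * aw[j]) << (w * (i + j))   # each cross product once
    acc <<= 1                                         # * 2, as in the diagram
    for i in range(limbs):
        acc += (aw[i] * aw[i]) << (2 * w * i)         # + a[i]^2 terms
    return acc                                        # equals a*a

assert square_via_diagram(0x1234_5678_9ABC_DEF0_0FED_CBA9) == 0x1234_5678_9ABC_DEF0_0FED_CBA9 ** 2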
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,%function
.align 5
from_mont_256:
.inst 0xd503233f
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x4,x3
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldr x29,[sp],#16
.inst 0xd50323bf
ret
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,%function
.align 5
redc_mont_256:
.inst 0xd503233f
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x4,x3
ldp x10,x11,[x1]
ldp x12,x13,[x1,#16]
bl __mul_by_1_mont_256
ldr x30,[x29,#8]
ldp x14,x15,[x1,#32]
ldp x16,x17,[x1,#48]
adds x10,x10,x14
adcs x11,x11,x15
adcs x12,x12,x16
adcs x13,x13,x17
adc x9,xzr,xzr
subs x14,x10,x5
sbcs x15,x11,x6
sbcs x16,x12,x7
sbcs x17,x13,x8
sbcs xzr, x9,xzr
csel x10,x10,x14,lo
csel x11,x11,x15,lo
csel x12,x12,x16,lo
csel x13,x13,x17,lo
stp x10,x11,[x0]
stp x12,x13,[x0,#16]
ldr x29,[sp],#16
.inst 0xd50323bf
ret
.size redc_mont_256,.-redc_mont_256
.type __mul_by_1_mont_256,%function
.align 5
__mul_by_1_mont_256:
mul x3,x4,x10
ldp x5,x6,[x2]
ldp x7,x8,[x2,#16]
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
mul x3,x4,x10
adc x13,x9,x17
//mul x14,x5,x3
mul x15,x6,x3
mul x16,x7,x3
mul x17,x8,x3
subs xzr,x10,#1 //adds x10,x10,x14
umulh x14,x5,x3
adcs x11,x11,x15
umulh x15,x6,x3
adcs x12,x12,x16
umulh x16,x7,x3
adcs x13,x13,x17
umulh x17,x8,x3
adc x9,xzr,xzr
adds x10,x11,x14
adcs x11,x12,x15
adcs x12,x13,x16
adc x13,x9,x17
ret
.size __mul_by_1_mont_256,.-__mul_by_1_mont_256
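__mul_by_1_mont_256 is the reduction-only half of the same scheme: four rounds of "multiply the low word by n0, add that multiple of the modulus, shift one word out". In arbitrary-precision Python, with the same textbook n0 = -p^-1 mod 2^64 convention as the multiplication sketch above (the final conditional subtraction is left to the callers, as in from_mont_256/redc_mont_256):

def mont_reduce(t, p, n0, w=64, limbs=4):
    # Word-by-word Montgomery reduction: t * 2^(-w*limbs) mod p, up to one
    # conditional subtraction, valid for t < p * 2^(w*limbs).
    mask = (1 << w) - 1
    for _ in range(limbs):
        m = ((t & mask) * n0) & mask   # m = n0 * t[0] mod 2^w
        t = (t + m * p) >> w           # low word cancels; shift it out
    return t                           # caller subtracts p if t >= p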

File diff suppressed because it is too large

View file

@@ -0,0 +1,714 @@
.text
.globl mul_mont_sparse_256
.hidden mul_mont_sparse_256
.type mul_mont_sparse_256,@function
.align 32
mul_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
pushq %rdi
.cfi_adjust_cfa_offset 8
movq 0(%rdx),%rax
movq 0(%rsi),%r13
movq 8(%rsi),%r14
movq 16(%rsi),%r12
movq 24(%rsi),%rbp
movq %rdx,%rbx
movq %rax,%r15
mulq %r13
movq %rax,%r9
movq %r15,%rax
movq %rdx,%r10
call __mulq_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size mul_mont_sparse_256,.-mul_mont_sparse_256
.globl sqr_mont_sparse_256
.hidden sqr_mont_sparse_256
.type sqr_mont_sparse_256,@function
.align 32
sqr_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
pushq %rdi
.cfi_adjust_cfa_offset 8
movq 0(%rsi),%rax
movq %rcx,%r8
movq 8(%rsi),%r14
movq %rdx,%rcx
movq 16(%rsi),%r12
leaq (%rsi),%rbx
movq 24(%rsi),%rbp
movq %rax,%r15
mulq %rax
movq %rax,%r9
movq %r15,%rax
movq %rdx,%r10
call __mulq_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size sqr_mont_sparse_256,.-sqr_mont_sparse_256
.type __mulq_mont_sparse_256,@function
.align 32
__mulq_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
mulq %r14
addq %rax,%r10
movq %r15,%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %r12
addq %rax,%r11
movq %r15,%rax
adcq $0,%rdx
movq %rdx,%r12
mulq %rbp
addq %rax,%r12
movq 8(%rbx),%rax
adcq $0,%rdx
xorq %r14,%r14
movq %rdx,%r13
movq %r9,%rdi
imulq %r8,%r9
movq %rax,%r15
mulq 0(%rsi)
addq %rax,%r10
movq %r15,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq 8(%rsi)
addq %rax,%r11
movq %r15,%rax
adcq $0,%rdx
addq %rbp,%r11
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rsi)
addq %rax,%r12
movq %r15,%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rsi)
addq %rax,%r13
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq %rdx,%r14
xorq %r15,%r15
mulq 0(%rcx)
addq %rax,%rdi
movq %r9,%rax
adcq %rdx,%rdi
mulq 8(%rcx)
addq %rax,%r10
movq %r9,%rax
adcq $0,%rdx
addq %rdi,%r10
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r11
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r11
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
addq %rax,%r12
movq 16(%rbx),%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
addq %rdx,%r13
adcq $0,%r14
adcq $0,%r15
movq %r10,%rdi
imulq %r8,%r10
movq %rax,%r9
mulq 0(%rsi)
addq %rax,%r11
movq %r9,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq 8(%rsi)
addq %rax,%r12
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rsi)
addq %rax,%r13
movq %r9,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rsi)
addq %rax,%r14
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq %rdx,%r15
xorq %r9,%r9
mulq 0(%rcx)
addq %rax,%rdi
movq %r10,%rax
adcq %rdx,%rdi
mulq 8(%rcx)
addq %rax,%r11
movq %r10,%rax
adcq $0,%rdx
addq %rdi,%r11
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
addq %rax,%r13
movq 24(%rbx),%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
addq %rdx,%r14
adcq $0,%r15
adcq $0,%r9
movq %r11,%rdi
imulq %r8,%r11
movq %rax,%r10
mulq 0(%rsi)
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
movq %rdx,%rbp
mulq 8(%rsi)
addq %rax,%r13
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rsi)
addq %rax,%r14
movq %r10,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rsi)
addq %rax,%r15
movq %r11,%rax
adcq $0,%rdx
addq %rbp,%r15
adcq %rdx,%r9
xorq %r10,%r10
mulq 0(%rcx)
addq %rax,%rdi
movq %r11,%rax
adcq %rdx,%rdi
mulq 8(%rcx)
addq %rax,%r12
movq %r11,%rax
adcq $0,%rdx
addq %rdi,%r12
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %rbp,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq $0,%rdx
addq %rdx,%r15
adcq $0,%r9
adcq $0,%r10
imulq %r8,%rax
movq 8(%rsp),%rsi
movq %rax,%r11
mulq 0(%rcx)
addq %rax,%r12
movq %r11,%rax
adcq %rdx,%r12
mulq 8(%rcx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r12,%r13
adcq $0,%rdx
movq %rdx,%rbp
mulq 16(%rcx)
addq %rax,%r14
movq %r11,%rax
adcq $0,%rdx
addq %rbp,%r14
adcq $0,%rdx
movq %rdx,%rbp
mulq 24(%rcx)
movq %r14,%rbx
addq %rbp,%r15
adcq $0,%rdx
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %rdx,%r9
adcq $0,%r10
movq %r15,%r12
subq 0(%rcx),%r13
sbbq 8(%rcx),%r14
sbbq 16(%rcx),%r15
movq %r9,%rbp
sbbq 24(%rcx),%r9
sbbq $0,%r10
cmovcq %rax,%r13
cmovcq %rbx,%r14
cmovcq %r12,%r15
movq %r13,0(%rsi)
cmovcq %rbp,%r9
movq %r14,8(%rsi)
movq %r15,16(%rsi)
movq %r9,24(%rsi)
.byte 0xf3,0xc3
.cfi_endproc
.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256
.globl from_mont_256
.hidden from_mont_256
.type from_mont_256,@function
.align 32
from_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulq_by_1_mont_256
movq %r14,%r10
movq %r15,%r11
movq %r9,%r12
subq 0(%rbx),%r13
sbbq 8(%rbx),%r14
sbbq 16(%rbx),%r15
sbbq 24(%rbx),%r9
cmovncq %r13,%rax
cmovncq %r14,%r10
cmovncq %r15,%r11
movq %rax,0(%rdi)
cmovncq %r9,%r12
movq %r10,8(%rdi)
movq %r11,16(%rdi)
movq %r12,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size from_mont_256,.-from_mont_256
.globl redc_mont_256
.hidden redc_mont_256
.type redc_mont_256,@function
.align 32
redc_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulq_by_1_mont_256
addq 32(%rsi),%r13
adcq 40(%rsi),%r14
movq %r13,%rax
adcq 48(%rsi),%r15
movq %r14,%r10
adcq 56(%rsi),%r9
sbbq %rsi,%rsi
movq %r15,%r11
subq 0(%rbx),%r13
sbbq 8(%rbx),%r14
sbbq 16(%rbx),%r15
movq %r9,%r12
sbbq 24(%rbx),%r9
sbbq $0,%rsi
cmovncq %r13,%rax
cmovncq %r14,%r10
cmovncq %r15,%r11
movq %rax,0(%rdi)
cmovncq %r9,%r12
movq %r10,8(%rdi)
movq %r11,16(%rdi)
movq %r12,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size redc_mont_256,.-redc_mont_256
.type __mulq_by_1_mont_256,@function
.align 32
__mulq_by_1_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%rax
movq 8(%rsi),%r10
movq 16(%rsi),%r11
movq 24(%rsi),%r12
movq %rax,%r13
imulq %rcx,%rax
movq %rax,%r9
mulq 0(%rbx)
addq %rax,%r13
movq %r9,%rax
adcq %rdx,%r13
mulq 8(%rbx)
addq %rax,%r10
movq %r9,%rax
adcq $0,%rdx
addq %r13,%r10
adcq $0,%rdx
movq %rdx,%r13
mulq 16(%rbx)
movq %r10,%r14
imulq %rcx,%r10
addq %rax,%r11
movq %r9,%rax
adcq $0,%rdx
addq %r13,%r11
adcq $0,%rdx
movq %rdx,%r13
mulq 24(%rbx)
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %r13,%r12
adcq $0,%rdx
movq %rdx,%r13
mulq 0(%rbx)
addq %rax,%r14
movq %r10,%rax
adcq %rdx,%r14
mulq 8(%rbx)
addq %rax,%r11
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r11
adcq $0,%rdx
movq %rdx,%r14
mulq 16(%rbx)
movq %r11,%r15
imulq %rcx,%r11
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r12
adcq $0,%rdx
movq %rdx,%r14
mulq 24(%rbx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r14,%r13
adcq $0,%rdx
movq %rdx,%r14
mulq 0(%rbx)
addq %rax,%r15
movq %r11,%rax
adcq %rdx,%r15
mulq 8(%rbx)
addq %rax,%r12
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r12
adcq $0,%rdx
movq %rdx,%r15
mulq 16(%rbx)
movq %r12,%r9
imulq %rcx,%r12
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r13
adcq $0,%rdx
movq %rdx,%r15
mulq 24(%rbx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r15,%r14
adcq $0,%rdx
movq %rdx,%r15
mulq 0(%rbx)
addq %rax,%r9
movq %r12,%rax
adcq %rdx,%r9
mulq 8(%rbx)
addq %rax,%r13
movq %r12,%rax
adcq $0,%rdx
addq %r9,%r13
adcq $0,%rdx
movq %rdx,%r9
mulq 16(%rbx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r9,%r14
adcq $0,%rdx
movq %rdx,%r9
mulq 24(%rbx)
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %r9,%r15
adcq $0,%rdx
movq %rdx,%r9
.byte 0xf3,0xc3
.cfi_endproc
.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

File diff suppressed because it is too large

View file

@@ -0,0 +1,627 @@
.text
.globl mulx_mont_sparse_256
.hidden mulx_mont_sparse_256
.type mulx_mont_sparse_256,@function
.align 32
mulx_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
movq 0(%rdx),%rdx
movq 0(%rsi),%r14
movq 8(%rsi),%r15
movq 16(%rsi),%rbp
movq 24(%rsi),%r9
leaq -128(%rsi),%rsi
leaq -128(%rcx),%rcx
mulxq %r14,%rax,%r11
call __mulx_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size mulx_mont_sparse_256,.-mulx_mont_sparse_256
.globl sqrx_mont_sparse_256
.hidden sqrx_mont_sparse_256
.type sqrx_mont_sparse_256,@function
.align 32
sqrx_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rsi,%rbx
movq %rcx,%r8
movq %rdx,%rcx
movq 0(%rsi),%rdx
movq 8(%rsi),%r15
movq 16(%rsi),%rbp
movq 24(%rsi),%r9
leaq -128(%rbx),%rsi
leaq -128(%rcx),%rcx
mulxq %rdx,%rax,%r11
call __mulx_mont_sparse_256
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256
.type __mulx_mont_sparse_256,@function
.align 32
__mulx_mont_sparse_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
mulxq %r15,%r15,%r12
mulxq %rbp,%rbp,%r13
addq %r15,%r11
mulxq %r9,%r9,%r14
movq 8(%rbx),%rdx
adcq %rbp,%r12
adcq %r9,%r13
adcq $0,%r14
movq %rax,%r10
imulq %r8,%rax
xorq %r15,%r15
mulxq 0+128(%rsi),%rbp,%r9
adoxq %rbp,%r11
adcxq %r9,%r12
mulxq 8+128(%rsi),%rbp,%r9
adoxq %rbp,%r12
adcxq %r9,%r13
mulxq 16+128(%rsi),%rbp,%r9
adoxq %rbp,%r13
adcxq %r9,%r14
mulxq 24+128(%rsi),%rbp,%r9
movq %rax,%rdx
adoxq %rbp,%r14
adcxq %r15,%r9
adoxq %r9,%r15
mulxq 0+128(%rcx),%rbp,%rax
adcxq %rbp,%r10
adoxq %r11,%rax
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%rax
adoxq %r9,%r12
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r12
adoxq %r9,%r13
mulxq 24+128(%rcx),%rbp,%r9
movq 16(%rbx),%rdx
adcxq %rbp,%r13
adoxq %r9,%r14
adcxq %r10,%r14
adoxq %r10,%r15
adcxq %r10,%r15
adoxq %r10,%r10
adcq $0,%r10
movq %rax,%r11
imulq %r8,%rax
xorq %rbp,%rbp
mulxq 0+128(%rsi),%rbp,%r9
adoxq %rbp,%r12
adcxq %r9,%r13
mulxq 8+128(%rsi),%rbp,%r9
adoxq %rbp,%r13
adcxq %r9,%r14
mulxq 16+128(%rsi),%rbp,%r9
adoxq %rbp,%r14
adcxq %r9,%r15
mulxq 24+128(%rsi),%rbp,%r9
movq %rax,%rdx
adoxq %rbp,%r15
adcxq %r10,%r9
adoxq %r9,%r10
mulxq 0+128(%rcx),%rbp,%rax
adcxq %rbp,%r11
adoxq %r12,%rax
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%rax
adoxq %r9,%r13
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r13
adoxq %r9,%r14
mulxq 24+128(%rcx),%rbp,%r9
movq 24(%rbx),%rdx
adcxq %rbp,%r14
adoxq %r9,%r15
adcxq %r11,%r15
adoxq %r11,%r10
adcxq %r11,%r10
adoxq %r11,%r11
adcq $0,%r11
movq %rax,%r12
imulq %r8,%rax
xorq %rbp,%rbp
mulxq 0+128(%rsi),%rbp,%r9
adoxq %rbp,%r13
adcxq %r9,%r14
mulxq 8+128(%rsi),%rbp,%r9
adoxq %rbp,%r14
adcxq %r9,%r15
mulxq 16+128(%rsi),%rbp,%r9
adoxq %rbp,%r15
adcxq %r9,%r10
mulxq 24+128(%rsi),%rbp,%r9
movq %rax,%rdx
adoxq %rbp,%r10
adcxq %r11,%r9
adoxq %r9,%r11
mulxq 0+128(%rcx),%rbp,%rax
adcxq %rbp,%r12
adoxq %r13,%rax
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%rax
adoxq %r9,%r14
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r14
adoxq %r9,%r15
mulxq 24+128(%rcx),%rbp,%r9
movq %rax,%rdx
adcxq %rbp,%r15
adoxq %r9,%r10
adcxq %r12,%r10
adoxq %r12,%r11
adcxq %r12,%r11
adoxq %r12,%r12
adcq $0,%r12
imulq %r8,%rdx
xorq %rbp,%rbp
mulxq 0+128(%rcx),%r13,%r9
adcxq %rax,%r13
adoxq %r9,%r14
mulxq 8+128(%rcx),%rbp,%r9
adcxq %rbp,%r14
adoxq %r9,%r15
mulxq 16+128(%rcx),%rbp,%r9
adcxq %rbp,%r15
adoxq %r9,%r10
mulxq 24+128(%rcx),%rbp,%r9
movq %r14,%rdx
leaq 128(%rcx),%rcx
adcxq %rbp,%r10
adoxq %r9,%r11
movq %r15,%rax
adcxq %r13,%r11
adoxq %r13,%r12
adcq $0,%r12
movq %r10,%rbp
subq 0(%rcx),%r14
sbbq 8(%rcx),%r15
sbbq 16(%rcx),%r10
movq %r11,%r9
sbbq 24(%rcx),%r11
sbbq $0,%r12
cmovcq %rdx,%r14
cmovcq %rax,%r15
cmovcq %rbp,%r10
movq %r14,0(%rdi)
cmovcq %r9,%r11
movq %r15,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256
.globl fromx_mont_256
.hidden fromx_mont_256
.type fromx_mont_256,@function
.align 32
fromx_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulx_by_1_mont_256
movq %r15,%rdx
movq %r10,%r12
movq %r11,%r13
subq 0(%rbx),%r14
sbbq 8(%rbx),%r15
sbbq 16(%rbx),%r10
sbbq 24(%rbx),%r11
cmovncq %r14,%rax
cmovncq %r15,%rdx
cmovncq %r10,%r12
movq %rax,0(%rdi)
cmovncq %r11,%r13
movq %rdx,8(%rdi)
movq %r12,16(%rdi)
movq %r13,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size fromx_mont_256,.-fromx_mont_256
.globl redcx_mont_256
.hidden redcx_mont_256
.type redcx_mont_256,@function
.align 32
redcx_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $8,%rsp
.cfi_adjust_cfa_offset 8
movq %rdx,%rbx
call __mulx_by_1_mont_256
addq 32(%rsi),%r14
adcq 40(%rsi),%r15
movq %r14,%rax
adcq 48(%rsi),%r10
movq %r15,%rdx
adcq 56(%rsi),%r11
sbbq %rsi,%rsi
movq %r10,%r12
subq 0(%rbx),%r14
sbbq 8(%rbx),%r15
sbbq 16(%rbx),%r10
movq %r11,%r13
sbbq 24(%rbx),%r11
sbbq $0,%rsi
cmovncq %r14,%rax
cmovncq %r15,%rdx
cmovncq %r10,%r12
movq %rax,0(%rdi)
cmovncq %r11,%r13
movq %rdx,8(%rdi)
movq %r12,16(%rdi)
movq %r13,24(%rdi)
movq 8(%rsp),%r15
.cfi_restore %r15
movq 16(%rsp),%r14
.cfi_restore %r14
movq 24(%rsp),%r13
.cfi_restore %r13
movq 32(%rsp),%r12
.cfi_restore %r12
movq 40(%rsp),%rbx
.cfi_restore %rbx
movq 48(%rsp),%rbp
.cfi_restore %rbp
leaq 56(%rsp),%rsp
.cfi_adjust_cfa_offset -56
.byte 0xf3,0xc3
.cfi_endproc
.size redcx_mont_256,.-redcx_mont_256
.type __mulx_by_1_mont_256,@function
.align 32
__mulx_by_1_mont_256:
.cfi_startproc
.byte 0xf3,0x0f,0x1e,0xfa
movq 0(%rsi),%rax
movq 8(%rsi),%r11
movq 16(%rsi),%r12
movq 24(%rsi),%r13
movq %rax,%r14
imulq %rcx,%rax
movq %rax,%r10
mulq 0(%rbx)
addq %rax,%r14
movq %r10,%rax
adcq %rdx,%r14
mulq 8(%rbx)
addq %rax,%r11
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r11
adcq $0,%rdx
movq %rdx,%r14
mulq 16(%rbx)
movq %r11,%r15
imulq %rcx,%r11
addq %rax,%r12
movq %r10,%rax
adcq $0,%rdx
addq %r14,%r12
adcq $0,%rdx
movq %rdx,%r14
mulq 24(%rbx)
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r14,%r13
adcq $0,%rdx
movq %rdx,%r14
mulq 0(%rbx)
addq %rax,%r15
movq %r11,%rax
adcq %rdx,%r15
mulq 8(%rbx)
addq %rax,%r12
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r12
adcq $0,%rdx
movq %rdx,%r15
mulq 16(%rbx)
movq %r12,%r10
imulq %rcx,%r12
addq %rax,%r13
movq %r11,%rax
adcq $0,%rdx
addq %r15,%r13
adcq $0,%rdx
movq %rdx,%r15
mulq 24(%rbx)
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r15,%r14
adcq $0,%rdx
movq %rdx,%r15
mulq 0(%rbx)
addq %rax,%r10
movq %r12,%rax
adcq %rdx,%r10
mulq 8(%rbx)
addq %rax,%r13
movq %r12,%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdx,%r10
mulq 16(%rbx)
movq %r13,%r11
imulq %rcx,%r13
addq %rax,%r14
movq %r12,%rax
adcq $0,%rdx
addq %r10,%r14
adcq $0,%rdx
movq %rdx,%r10
mulq 24(%rbx)
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %r10,%r15
adcq $0,%rdx
movq %rdx,%r10
mulq 0(%rbx)
addq %rax,%r11
movq %r13,%rax
adcq %rdx,%r11
mulq 8(%rbx)
addq %rax,%r14
movq %r13,%rax
adcq $0,%rdx
addq %r11,%r14
adcq $0,%rdx
movq %rdx,%r11
mulq 16(%rbx)
addq %rax,%r15
movq %r13,%rax
adcq $0,%rdx
addq %r11,%r15
adcq $0,%rdx
movq %rdx,%r11
mulq 24(%rbx)
addq %rax,%r10
movq %r14,%rax
adcq $0,%rdx
addq %r11,%r10
adcq $0,%rdx
movq %rdx,%r11
.byte 0xf3,0xc3
.cfi_endproc
.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256
.section .note.GNU-stack,"",@progbits
.section .note.gnu.property,"a",@note
.long 4,2f-1f,5
.byte 0x47,0x4E,0x55,0
1: .long 0xc0000002,4,3
.align 8
2:

File diff suppressed because it is too large

1077
blst/elf/sha256-armv8.S Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

1446
blst/elf/sha256-x86_64.s Normal file

File diff suppressed because it is too large