calog/vendor/openssl/crypto/bn/rsaz-2k-avxifma.s

1168 lines
23 KiB
ArmAsm
Vendored

.text
.globl ossl_rsaz_avxifma_eligible
.type ossl_rsaz_avxifma_eligible,@function
.align 32
# ossl_rsaz_avxifma_eligible
# Takes no arguments. Returns in %eax the dword at OPENSSL_ia32cap_P+20
# reduced to bit 23: 0x800000 when that bit is set, 0 otherwise.
# NOTE(review): bit 23 of this capability word is presumably the
# AVX-IFMA feature flag, going by the function name - confirm against
# the OPENSSL_ia32cap documentation.
# Clobbers: %ecx, flags.
ossl_rsaz_avxifma_eligible:
movl OPENSSL_ia32cap_P+20(%rip),%ecx
# A single-bit mask leaves either 0x800000 or 0, which is exactly the
# value to return.
andl $0x800000,%ecx
movl %ecx,%eax
# Raw encoding of "rep ret".
.byte 0xf3,0xc3
.size ossl_rsaz_avxifma_eligible, .-ossl_rsaz_avxifma_eligible
.text
# ---------------------------------------------------------------------
# ossl_rsaz_amm52x20_x1_avxifma256
#
# Works on one set of twenty digits stored one per qword. For each
# qword B[i] of the %rdx operand it accumulates A[j]*B[i] + M[j]*y
# into a 20-lane accumulator with the VEX-encoded vpmadd52luq and
# vpmadd52huq instructions, shifts the accumulator down by one digit,
# and finally normalizes every lane to 52 bits before storing it.
# NOTE(review): the "amm" in the symbol name suggests Almost
# Montgomery Multiplication; the C prototype is not visible in this
# file - confirm against the caller.
#
# Inputs as consumed below (these are the SysV AMD64 argument regs):
#   %rdi - output; five 32-byte vectors are stored at offsets 0..159
#   %rsi - operand A, read as five 32-byte vectors (offsets 0..159)
#   %rdx - operand B, walked one qword at a time, twenty in total
#   %rcx - operand M, read as five 32-byte vectors (offsets 0..159);
#          presumably the modulus - TODO confirm
#   %r8  - scalar K used by value to derive the per-step factor y;
#          presumably the Montgomery constant - TODO confirm
#
# Register roles inside the routine:
#   %ymm3,%ymm5,%ymm6,%ymm7,%ymm8 - the 20-lane accumulator
#   %ymm1 - B[i] broadcast to all four lanes
#   %ymm2 - y broadcast to all four lanes
#   %r9   - scalar running value of the lowest digit
#   %r10,%r12,%r13 - scratch for the scalar multiply/carry work
#   %r11  - cursor into B (%rdx itself is the implicit mulx source)
#   %rax  - constant 2^52-1
#   %ebx  - loop counter: five trips of four unrolled steps
#
# Stack: six pushes, plus a 168-byte scratch area that is opened and
# closed inside every step.
# ---------------------------------------------------------------------
.globl ossl_rsaz_amm52x20_x1_avxifma256
.type ossl_rsaz_amm52x20_x1_avxifma256,@function
.align 32
ossl_rsaz_amm52x20_x1_avxifma256:
.cfi_startproc
# endbr64 (F3 0F 1E FA) emitted as raw bytes.
.byte 243,15,30,250
# Save the six callee-saved general-purpose registers; each push is
# mirrored by CFI directives.
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lossl_rsaz_amm52x20_x1_avxifma256_body:
# Clear the five accumulator vectors and the scalar running word.
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
xorl %r9d,%r9d
# Move the B pointer out of %rdx, which mulx needs as implicit source.
movq %rdx,%r11
# %rax = 2^52-1, the digit mask.
movq $0xfffffffffffff,%rax
movl $5,%ebx
.align 32
.Lloop5:
# ---- step 1 of 4: B qword at 0(%r11) --------------------------------
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
# %r12:%r13 = A[0] * B[i]; add the low half to %r9 and keep the high
# half plus the carry in %r10.
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
# y = (K * %r9) mod 2^52, then broadcast y into %ymm2.
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
# %r10:%r9 += M[0] * y
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
# %r9 = (%r9 >> 52) | (%r10 << 12): discard the lowest 52 bits of
# the two-word sum.
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
# Open the 168-byte scratch area (lea leaves the flags untouched).
leaq -168(%rsp),%rsp
# Add the low 52 bits of every A[j]*B[i] product to the accumulator.
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
# Add the low 52 bits of every M[j]*y product to the accumulator.
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
# Spill the accumulator, write a zero qword just above it, and reload
# from one qword higher: the 20 lanes move down by one position and a
# zero enters at the top.
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
# The qword that just became lane 0 is also folded into %r9.
addq 8(%rsp),%r9
# Add the high 52 bits of the same products to the shifted lanes.
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
# Close the scratch area.
leaq 168(%rsp),%rsp
# ---- step 2 of 4: same sequence for the B qword at 8(%r11) ----------
movq 8(%r11),%r13
vpbroadcastq 8(%r11),%ymm1
# Scalar part: %r10:%r9 accumulates A[0]*B[i].
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
# y = (K * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
# %r10:%r9 += M[0] * y, then drop the lowest 52 bits.
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
# Low halves of A[j]*B[i] and M[j]*y.
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
# Shift the accumulator down one lane through the scratch area.
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
# High halves of the same products.
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
# ---- step 3 of 4: same sequence for the B qword at 16(%r11) ---------
movq 16(%r11),%r13
vpbroadcastq 16(%r11),%ymm1
# Scalar part: %r10:%r9 accumulates A[0]*B[i].
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
# y = (K * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
# %r10:%r9 += M[0] * y, then drop the lowest 52 bits.
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
# Low halves of A[j]*B[i] and M[j]*y.
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
# Shift the accumulator down one lane through the scratch area.
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
# High halves of the same products.
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
# ---- step 4 of 4: same sequence for the B qword at 24(%r11) ---------
movq 24(%r11),%r13
vpbroadcastq 24(%r11),%ymm1
# Scalar part: %r10:%r9 accumulates A[0]*B[i].
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
# y = (K * %r9) mod 2^52, broadcast into %ymm2.
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
# %r10:%r9 += M[0] * y, then drop the lowest 52 bits.
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
# Low halves of A[j]*B[i] and M[j]*y.
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
# Shift the accumulator down one lane through the scratch area.
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
# High halves of the same products.
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
# Advance the B cursor by the four qwords just consumed and loop.
leaq 32(%r11),%r11
decl %ebx
jne .Lloop5
# ---- normalization to 52-bit digits ---------------------------------
# Overwrite lane 0 of %ymm3 with the scalar running word %r9.
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
# Per lane, extract everything above bit 51 (the excess of each digit).
vpsrlq $52,%ymm3,%ymm0
vpsrlq $52,%ymm5,%ymm1
vpsrlq $52,%ymm6,%ymm2
vpsrlq $52,%ymm7,%ymm13
vpsrlq $52,%ymm8,%ymm14
# Move the excess values up by one lane across the whole 20-lane
# vector: vpermq $144 shifts lanes up inside a register, vpermq $3
# fetches the top lane of the previous register, and vblendpd $1
# drops it into lane 0.
vpermq $144,%ymm14,%ymm14
vpermq $3,%ymm13,%ymm15
vblendpd $1,%ymm15,%ymm14,%ymm14
vpermq $144,%ymm13,%ymm13
vpermq $3,%ymm2,%ymm15
vblendpd $1,%ymm15,%ymm13,%ymm13
vpermq $144,%ymm2,%ymm2
vpermq $3,%ymm1,%ymm15
vblendpd $1,%ymm15,%ymm2,%ymm2
vpermq $144,%ymm1,%ymm1
vpermq $3,%ymm0,%ymm15
vblendpd $1,%ymm15,%ymm1,%ymm1
vpermq $144,%ymm0,%ymm0
# Nothing flows into the very first lane, so clear it.
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
# Keep only the low 52 bits of each lane ...
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
# ... and add the excess that came from the lane below.
vpaddq %ymm0,%ymm3,%ymm3
vpaddq %ymm1,%ymm5,%ymm5
vpaddq %ymm2,%ymm6,%ymm6
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm8,%ymm8
# One bit per lane: lane value is greater than 2^52-1 (it produces a
# carry). The five 4-bit masks land in r14d,r13d,r12d,r11d,r10d.
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r14d
vmovmskpd %ymm1,%r13d
vmovmskpd %ymm2,%r12d
vmovmskpd %ymm13,%r11d
vmovmskpd %ymm14,%r10d
# One bit per lane: lane value equals 2^52-1 (an incoming carry would
# pass through it). The masks land in r9d,r8d,ebx,ecx,edx.
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r9d
vmovmskpd %ymm1,%r8d
vmovmskpd %ymm2,%ebx
vmovmskpd %ymm13,%ecx
vmovmskpd %ymm14,%edx
# Pack the "greater" nibbles into bytes: r14b covers lanes 0-7, r12b
# lanes 8-15, r10b lanes 16-19.
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
# Shift that 20-bit mask left by one bit (through the carry flag) so
# every bit lines up with the lane receiving the carry.
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
# Pack the "equal" nibbles the same way into r9b, bl and dl.
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
# Adding the "equal" mask lets a carry ripple through runs of lanes
# holding 2^52-1; the xor with the same mask then leaves a set bit
# for each lane that has to be incremented.
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
# For each group of four lanes: use the 4-bit nibble as an index into
# .Lkmasklut (32 bytes per entry) to obtain a lane mask, and replace
# the selected lanes x by x - (2^52-1). After the final AND with
# 2^52-1 that equals (x + 1) mod 2^52.
leaq .Lkmasklut(%rip),%rdx
# Lanes 0-3 (low nibble of r14b).
movb %r14b,%r13b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
shlq $5,%r14
vmovapd (%rdx,%r14,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
# Lanes 4-7 (high nibble of the same byte, saved in r13b).
shrb $4,%r13b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
shlq $5,%r13
vmovapd (%rdx,%r13,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
# Lanes 8-11 (low nibble of r12b).
movb %r12b,%r11b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
shlq $5,%r12
vmovapd (%rdx,%r12,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
# Lanes 12-15 (high nibble, saved in r11b).
shrb $4,%r11b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
shlq $5,%r11
vmovapd (%rdx,%r11,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
# Lanes 16-19 (low nibble of r10b).
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
shlq $5,%r10
vmovapd (%rdx,%r10,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
# Final mask back down to 52 bits per lane.
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
# Store the twenty result digits.
vmovdqu %ymm3,0(%rdi)
vmovdqu %ymm5,32(%rdi)
vmovdqu %ymm6,64(%rdi)
vmovdqu %ymm7,96(%rdi)
vmovdqu %ymm8,128(%rdi)
# Clear the upper halves of the ymm registers before returning.
vzeroupper
# Reload the six saved registers straight from the stack (reverse of
# the push order), then release their 48 bytes in one step.
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lossl_rsaz_amm52x20_x1_avxifma256_epilogue:
# Raw encoding of "rep ret".
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_rsaz_amm52x20_x1_avxifma256, .-ossl_rsaz_amm52x20_x1_avxifma256
# Read-only constants shared by the two amm52x20 routines. The block
# is 32-byte aligned and each of the first two tables is 32 bytes, so
# .Lkmasklut also starts on a 32-byte boundary; its entries are
# fetched with vmovapd, which requires that alignment.
.section .rodata
.align 32
# Four lanes of 2^52-1: the per-digit mask, also used as the compare
# and subtract operand during normalization.
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
# AND mask that clears lane 0 and keeps lanes 1 to 3.
.Lhigh64x3:
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
# Sixteen 32-byte entries. Entry i has lane j set to all ones exactly
# when bit j of i is set, turning a 4-bit mask into a vblendvpd
# selector. Callers index it with (nibble << 5).
.Lkmasklut:
# entry 0 (binary 0000)
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
# entry 1 (binary 0001)
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
# entry 2 (binary 0010)
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
# entry 3 (binary 0011)
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
# entry 4 (binary 0100)
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
# entry 5 (binary 0101)
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
# entry 6 (binary 0110)
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
# entry 7 (binary 0111)
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
# entry 8 (binary 1000)
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
# entry 9 (binary 1001)
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
# entry 10 (binary 1010)
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
# entry 11 (binary 1011)
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
# entry 12 (binary 1100)
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
# entry 13 (binary 1101)
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
# entry 14 (binary 1110)
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
# entry 15 (binary 1111)
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.text
# ---------------------------------------------------------------------
# ossl_rsaz_amm52x20_x2_avxifma256
#
# Runs two independent 20-digit computations side by side. Each one
# accumulates A[j]*B[i] + M[j]*y per step with vpmadd52luq/huq, shifts
# its accumulator down one digit, and is normalized to 52-bit digits
# at the end. The second set lives 160 bytes after the first in every
# buffer.
# NOTE(review): the "amm" in the symbol name suggests Almost
# Montgomery Multiplication; the C prototype is not visible in this
# file - confirm against the caller.
#
# Inputs as consumed below (these are the SysV AMD64 argument regs):
#   %rdi - output; ten 32-byte vectors are stored at offsets 0..319
#   %rsi - operand A; set one at 0..159, set two at 160..319
#   %rdx - operand B; one qword per trip is read at offset 0 and at
#          offset 160 from the moving cursor
#   %rcx - operand M; set one at 0..159, set two at 160..319;
#          presumably the moduli - TODO confirm
#   %r8  - pointer to two qwords: (%r8) is the scalar K for set one
#          and 8(%r8) the scalar for set two
#
# Register roles inside the routine:
#   %ymm3,%ymm5,%ymm6,%ymm7,%ymm8    - accumulator of set one
#   %ymm4,%ymm9,%ymm10,%ymm11,%ymm12 - accumulator of set two
#   %ymm1,%ymm2 - broadcast B[i] and broadcast y of the current half
#   %r9,%r15    - scalar running low words of set one and set two
#   %r11        - cursor into B (%rdx is the implicit mulx source)
#   %rax        - constant 2^52-1
#   %ebx        - loop counter, twenty trips
#
# Stack: six pushes, plus a 168-byte scratch area that is opened and
# closed inside every half-step.
# ---------------------------------------------------------------------
.globl ossl_rsaz_amm52x20_x2_avxifma256
.type ossl_rsaz_amm52x20_x2_avxifma256,@function
.align 32
ossl_rsaz_amm52x20_x2_avxifma256:
.cfi_startproc
# endbr64 (F3 0F 1E FA) emitted as raw bytes.
.byte 243,15,30,250
# Save the six callee-saved general-purpose registers; each push is
# mirrored by CFI directives.
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lossl_rsaz_amm52x20_x2_avxifma256_body:
# Clear both accumulators and both scalar running words.
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm10
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm12
xorl %r9d,%r9d
xorl %r15d,%r15d
# Move the B pointer out of %rdx, which mulx needs as implicit source.
movq %rdx,%r11
# %rax = 2^52-1, the digit mask.
movq $0xfffffffffffff,%rax
movl $20,%ebx
.align 32
.Lloop20:
# ---- set one: B qword at 0(%r11) ------------------------------------
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
# %r12:%r13 = A[0] * B[i]; add the low half to %r9 and keep the high
# half plus the carry in %r10.
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
# y = (K * %r9) mod 2^52 with K loaded from (%r8); broadcast to %ymm2.
movq (%r8),%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
# %r10:%r9 += M[0] * y
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
# %r9 = (%r9 >> 52) | (%r10 << 12): discard the lowest 52 bits of
# the two-word sum.
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
# Open the 168-byte scratch area (lea leaves the flags untouched).
leaq -168(%rsp),%rsp
# Add the low 52 bits of every A[j]*B[i] product to the accumulator.
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
# Add the low 52 bits of every M[j]*y product to the accumulator.
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
# Spill the accumulator, write a zero qword just above it, and reload
# from one qword higher: the 20 lanes move down by one position and a
# zero enters at the top.
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
# The qword that just became lane 0 is also folded into %r9.
addq 8(%rsp),%r9
# Add the high 52 bits of the same products to the shifted lanes.
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
# Close the scratch area.
leaq 168(%rsp),%rsp
# ---- set two: same sequence, all buffers at +160, scalar in %r15 ----
movq 160(%r11),%r13
vpbroadcastq 160(%r11),%ymm1
# %r10:%r15 accumulates the product of the first digit of A (set two)
# and the current B qword.
movq 160(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r15
movq %r12,%r10
adcq $0,%r10
# y = (K2 * %r15) mod 2^52 with K2 loaded from 8(%r8).
movq 8(%r8),%r13
imulq %r15,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
# %r10:%r15 += first digit of M (set two) * y, then drop 52 bits.
movq 160(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r15
adcq %r12,%r10
shrq $52,%r15
salq $12,%r10
orq %r10,%r15
leaq -168(%rsp),%rsp
# Low halves of the A*B and M*y products for set two.
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
# Shift the set-two accumulator down one lane through the scratch area.
vmovdqu %ymm4,0(%rsp)
vmovdqu %ymm9,32(%rsp)
vmovdqu %ymm10,64(%rsp)
vmovdqu %ymm11,96(%rsp)
vmovdqu %ymm12,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm4
vmovdqu 40(%rsp),%ymm9
vmovdqu 72(%rsp),%ymm10
vmovdqu 104(%rsp),%ymm11
vmovdqu 136(%rsp),%ymm12
addq 8(%rsp),%r15
# High halves of the same products.
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 168(%rsp),%rsp
# Advance the B cursor by one qword and loop.
leaq 8(%r11),%r11
decl %ebx
jne .Lloop20
# ---- normalization of set one ---------------------------------------
# Overwrite lane 0 of %ymm3 with the scalar running word %r9.
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
# Per lane, extract everything above bit 51 (the excess of each digit).
vpsrlq $52,%ymm3,%ymm0
vpsrlq $52,%ymm5,%ymm1
vpsrlq $52,%ymm6,%ymm2
vpsrlq $52,%ymm7,%ymm13
vpsrlq $52,%ymm8,%ymm14
# Move the excess values up by one lane across the whole 20-lane
# vector: vpermq $144 shifts lanes up inside a register, vpermq $3
# fetches the top lane of the previous register, and vblendpd $1
# drops it into lane 0.
vpermq $144,%ymm14,%ymm14
vpermq $3,%ymm13,%ymm15
vblendpd $1,%ymm15,%ymm14,%ymm14
vpermq $144,%ymm13,%ymm13
vpermq $3,%ymm2,%ymm15
vblendpd $1,%ymm15,%ymm13,%ymm13
vpermq $144,%ymm2,%ymm2
vpermq $3,%ymm1,%ymm15
vblendpd $1,%ymm15,%ymm2,%ymm2
vpermq $144,%ymm1,%ymm1
vpermq $3,%ymm0,%ymm15
vblendpd $1,%ymm15,%ymm1,%ymm1
vpermq $144,%ymm0,%ymm0
# Nothing flows into the very first lane, so clear it.
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
# Keep only the low 52 bits of each lane ...
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
# ... and add the excess that came from the lane below.
vpaddq %ymm0,%ymm3,%ymm3
vpaddq %ymm1,%ymm5,%ymm5
vpaddq %ymm2,%ymm6,%ymm6
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm8,%ymm8
# One bit per lane: lane value is greater than 2^52-1 (it produces a
# carry). The five 4-bit masks land in r14d,r13d,r12d,r11d,r10d.
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r14d
vmovmskpd %ymm1,%r13d
vmovmskpd %ymm2,%r12d
vmovmskpd %ymm13,%r11d
vmovmskpd %ymm14,%r10d
# One bit per lane: lane value equals 2^52-1 (an incoming carry would
# pass through it). The masks land in r9d,r8d,ebx,ecx,edx; the loop is
# finished, so the argument registers are free to be overwritten.
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r9d
vmovmskpd %ymm1,%r8d
vmovmskpd %ymm2,%ebx
vmovmskpd %ymm13,%ecx
vmovmskpd %ymm14,%edx
# Pack the "greater" nibbles into bytes: r14b covers lanes 0-7, r12b
# lanes 8-15, r10b lanes 16-19.
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
# Shift that 20-bit mask left by one bit (through the carry flag) so
# every bit lines up with the lane receiving the carry.
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
# Pack the "equal" nibbles the same way into r9b, bl and dl.
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
# Adding the "equal" mask lets a carry ripple through runs of lanes
# holding 2^52-1; the xor with the same mask then leaves a set bit
# for each lane that has to be incremented.
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
# For each group of four lanes: use the 4-bit nibble as an index into
# .Lkmasklut (32 bytes per entry) to obtain a lane mask, and replace
# the selected lanes x by x - (2^52-1). After the final AND with
# 2^52-1 that equals (x + 1) mod 2^52.
leaq .Lkmasklut(%rip),%rdx
# Lanes 0-3 (low nibble of r14b).
movb %r14b,%r13b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
shlq $5,%r14
vmovapd (%rdx,%r14,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
# Lanes 4-7 (high nibble of the same byte, saved in r13b).
shrb $4,%r13b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
shlq $5,%r13
vmovapd (%rdx,%r13,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
# Lanes 8-11 (low nibble of r12b).
movb %r12b,%r11b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
shlq $5,%r12
vmovapd (%rdx,%r12,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
# Lanes 12-15 (high nibble, saved in r11b).
shrb $4,%r11b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
shlq $5,%r11
vmovapd (%rdx,%r11,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
# Lanes 16-19 (low nibble of r10b).
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
shlq $5,%r10
vmovapd (%rdx,%r10,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
# Final mask back down to 52 bits per lane.
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
# ---- normalization of set two: identical procedure on ---------------
# ---- %ymm4,%ymm9,%ymm10,%ymm11,%ymm12 with scalar %r15 --------------
vmovq %r15,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm4,%ymm4
# Excess above bit 51 of every lane.
vpsrlq $52,%ymm4,%ymm0
vpsrlq $52,%ymm9,%ymm1
vpsrlq $52,%ymm10,%ymm2
vpsrlq $52,%ymm11,%ymm13
vpsrlq $52,%ymm12,%ymm14
# Move the excess values up by one lane across the 20-lane vector.
vpermq $144,%ymm14,%ymm14
vpermq $3,%ymm13,%ymm15
vblendpd $1,%ymm15,%ymm14,%ymm14
vpermq $144,%ymm13,%ymm13
vpermq $3,%ymm2,%ymm15
vblendpd $1,%ymm15,%ymm13,%ymm13
vpermq $144,%ymm2,%ymm2
vpermq $3,%ymm1,%ymm15
vblendpd $1,%ymm15,%ymm2,%ymm2
vpermq $144,%ymm1,%ymm1
vpermq $3,%ymm0,%ymm15
vblendpd $1,%ymm15,%ymm1,%ymm1
vpermq $144,%ymm0,%ymm0
# Nothing flows into the very first lane, so clear it.
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
# Mask to 52 bits and add the excess from the lane below.
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vpaddq %ymm0,%ymm4,%ymm4
vpaddq %ymm1,%ymm9,%ymm9
vpaddq %ymm2,%ymm10,%ymm10
vpaddq %ymm13,%ymm11,%ymm11
vpaddq %ymm14,%ymm12,%ymm12
# "Greater than 2^52-1" bits per lane.
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm0
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm1
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm2
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm14
vmovmskpd %ymm0,%r14d
vmovmskpd %ymm1,%r13d
vmovmskpd %ymm2,%r12d
vmovmskpd %ymm13,%r11d
vmovmskpd %ymm14,%r10d
# "Equal to 2^52-1" bits per lane.
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm0
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm1
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm2
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm14
vmovmskpd %ymm0,%r9d
vmovmskpd %ymm1,%r8d
vmovmskpd %ymm2,%ebx
vmovmskpd %ymm13,%ecx
vmovmskpd %ymm14,%edx
# Pack the nibbles, shift the "greater" mask left by one bit, add the
# "equal" mask so carries ripple, and xor to get the lanes that must
# be incremented.
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
# Expand each nibble through .Lkmasklut and blend in x - (2^52-1) for
# the selected lanes, four lanes at a time.
leaq .Lkmasklut(%rip),%rdx
movb %r14b,%r13b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
shlq $5,%r14
vmovapd (%rdx,%r14,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
shrb $4,%r13b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
shlq $5,%r13
vmovapd (%rdx,%r13,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
movb %r12b,%r11b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
shlq $5,%r12
vmovapd (%rdx,%r12,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
shrb $4,%r11b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm11,%ymm0
shlq $5,%r11
vmovapd (%rdx,%r11,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm11,%ymm11
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm12,%ymm0
shlq $5,%r10
vmovapd (%rdx,%r10,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm12,%ymm12
# Final mask back down to 52 bits per lane.
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
# Store set one at offsets 0..159 and set two at offsets 160..319.
vmovdqu %ymm3,0(%rdi)
vmovdqu %ymm5,32(%rdi)
vmovdqu %ymm6,64(%rdi)
vmovdqu %ymm7,96(%rdi)
vmovdqu %ymm8,128(%rdi)
vmovdqu %ymm4,160(%rdi)
vmovdqu %ymm9,192(%rdi)
vmovdqu %ymm10,224(%rdi)
vmovdqu %ymm11,256(%rdi)
vmovdqu %ymm12,288(%rdi)
# Clear the upper halves of the ymm registers before returning.
vzeroupper
# Reload the six saved registers straight from the stack (reverse of
# the push order), then release their 48 bytes in one step.
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lossl_rsaz_amm52x20_x2_avxifma256_epilogue:
# Raw encoding of "rep ret".
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_rsaz_amm52x20_x2_avxifma256, .-ossl_rsaz_amm52x20_x2_avxifma256
.text
# ---------------------------------------------------------------------
# ossl_extract_multiplier_2x20_win5_avx
#
# Copies one 320-byte table entry to the output, choosing the first
# 160 bytes by one index and the last 160 bytes by another. The loop
# reads all 32 entries (10240 / 320) and picks the wanted one with
# vector compares and blends, so neither branches nor load addresses
# depend on the indices.
#
# Inputs as consumed below (these are the SysV AMD64 argument regs):
#   %rdi - output, 320 bytes written as ten 32-byte vectors
#   %rsi - table base; advanced by 320 bytes per trip
#   %rdx - index selecting bytes 0..159 of the result
#   %rcx - index selecting bytes 160..319 of the result
# An index that matches no entry (32 or above) leaves its half of the
# result all zero, because the accumulators start at zero.
#
# Register roles:
#   %ymm0-%ymm4  - selected first half
#   %ymm5-%ymm9  - selected second half
#   %ymm11       - current entry number, replicated in four lanes
#   %ymm12,%ymm13 - the two indices, each replicated in four lanes
#   %ymm14       - constant vector of ones (counter increment)
#   %ymm10,%ymm15 - scratch: loaded table data and compare mask
#   %rax         - end-of-table address
# ---------------------------------------------------------------------
.align 32
.globl ossl_extract_multiplier_2x20_win5_avx
.type ossl_extract_multiplier_2x20_win5_avx,@function
ossl_extract_multiplier_2x20_win5_avx:
.cfi_startproc
# endbr64 (F3 0F 1E FA) emitted as raw bytes.
.byte 243,15,30,250
vmovapd .Lones(%rip),%ymm14
# Broadcast both indices.
vmovq %rdx,%xmm10
vpbroadcastq %xmm10,%ymm12
vmovq %rcx,%xmm10
vpbroadcastq %xmm10,%ymm13
# %rax = address just past the last of the 32 entries.
leaq 10240(%rsi),%rax
# Zero the entry counter and all ten result accumulators (the VEX
# xmm form of vpxor also clears the upper half of %ymm0).
vpxor %xmm0,%xmm0,%xmm0
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm1
vmovapd %ymm0,%ymm2
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm9
.align 32
.Lloop:
# Mask is all ones when the current entry number equals the first
# index; blend the entry's first five vectors in under that mask.
vpcmpeqq %ymm11,%ymm12,%ymm15
vmovdqu 0(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm0,%ymm0
vmovdqu 32(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm1,%ymm1
vmovdqu 64(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm2,%ymm2
vmovdqu 96(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm3,%ymm3
vmovdqu 128(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm4,%ymm4
# Same for the second index and the entry's last five vectors.
vpcmpeqq %ymm11,%ymm13,%ymm15
vmovdqu 160(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm5,%ymm5
vmovdqu 192(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm6,%ymm6
vmovdqu 224(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm7,%ymm7
vmovdqu 256(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm8,%ymm8
vmovdqu 288(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm9,%ymm9
# Next entry: bump the counter and the table pointer.
vpaddq %ymm14,%ymm11,%ymm11
addq $320,%rsi
cmpq %rsi,%rax
jne .Lloop
# Write out the selected 320 bytes.
vmovdqu %ymm0,0(%rdi)
vmovdqu %ymm1,32(%rdi)
vmovdqu %ymm2,64(%rdi)
vmovdqu %ymm3,96(%rdi)
vmovdqu %ymm4,128(%rdi)
vmovdqu %ymm5,160(%rdi)
vmovdqu %ymm6,192(%rdi)
vmovdqu %ymm7,224(%rdi)
vmovdqu %ymm8,256(%rdi)
vmovdqu %ymm9,288(%rdi)
# Clear the upper halves of the ymm registers before returning.
vzeroupper
# Raw encoding of "rep ret".
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_extract_multiplier_2x20_win5_avx, .-ossl_extract_multiplier_2x20_win5_avx
# Constants for ossl_extract_multiplier_2x20_win5_avx. The block is
# 32-byte aligned because .Lones is loaded with vmovapd.
.section .rodata
.align 32
# Four qword ones: the per-trip increment of the entry counter.
.Lones:
.quad 1,1,1,1
# Four qword zeros; not referenced anywhere in the visible part of
# this file.
.Lzeros:
.quad 0,0,0,0
# ELF note in the allocated section .note.gnu.property. Layout: name
# size, descriptor size, note type, name, then one property record.
# Type 5 is NT_GNU_PROPERTY_TYPE_0; property 0xc0000002 is
# GNU_PROPERTY_X86_FEATURE_1_AND and its value 3 sets the IBT and
# SHSTK bits, per the x86-64 psABI. The numeric local labels let the
# assembler compute the sizes.
.section ".note.gnu.property", "a"
.p2align 3
# Name size (bytes between labels 0 and 1).
.long 1f - 0f
# Descriptor size (bytes between labels 1 and 4).
.long 4f - 1f
# Note type.
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
# Property type.
.long 0xc0000002
# Property data size (bytes between labels 2 and 3).
.long 3f - 2f
2:
# Property data: feature bit mask.
.long 3
3:
.p2align 3
4: