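# Source-viewer metadata (not part of the assembly): file size 1923 lines, 42 KiB; marked "Vendored".
# NOTE: the viewer labelled this file "ArmAsm", but the content below is x86-64 assembly in AT&T (GAS) syntax.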
.text
|
|
|
|
#-----------------------------------------------------------------------
# ossl_rsaz_amm52x40_x1_avxifma256
#
# Syntax: AT&T (GAS), x86-64.  Uses VEX-encoded AVX-IFMA (vpmadd52luq /
# vpmadd52huq), AVX2 and BMI2 (mulx).
#
# Register usage on entry, as established by the code below:
#   %rdi  destination: ten 32-byte vectors (40 qwords) are stored here
#   %rsi  first 40-qword operand  (read as 0..288(%rsi) and scalar 0(%rsi))
#   %rdx  second operand; one qword is consumed per step, 40 steps total
#   %rcx  third 40-qword operand  (read as 0..288(%rcx) and scalar 0(%rcx))
#   %r8   64-bit value multiplied into the running accumulator to derive
#         the per-step factor (used by value, not dereferenced)
# NOTE(review): by name this is presumably an "almost Montgomery
# multiplication" of 40 x 52-bit digits, i.e. (res, a, b, m, k0) --
# confirm against the C prototype.
#
# Saved/restored: %rbx, %rbp, %r12-%r15 (pushed on entry, reloaded on exit).
# Overwritten:    %rax, %rcx, %rdx, %r9, %r10, %r11, %ymm0-%ymm14, flags.
# Stack:          48 bytes of saved registers, plus transient scratch of
#                 328 bytes (per multiply step) or 640+16 bytes
#                 (normalization) below them.
# NOTE(review): the transient `leaq +-N(%rsp),%rsp` adjustments are not
# described by CFI directives.
#-----------------------------------------------------------------------
.globl  ossl_rsaz_amm52x40_x1_avxifma256
.type   ossl_rsaz_amm52x40_x1_avxifma256,@function
.align  32
ossl_rsaz_amm52x40_x1_avxifma256:
.cfi_startproc
    .byte   243,15,30,250                   # endbr64 (F3 0F 1E FA)
    pushq   %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
    pushq   %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
    pushq   %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
    pushq   %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
    pushq   %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
    pushq   %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56

    # Zero the ten vector accumulators %ymm3..%ymm12 (40 qword lanes).
    vpxor   %ymm0,%ymm0,%ymm0
    vmovapd %ymm0,%ymm3
    vmovapd %ymm0,%ymm4
    vmovapd %ymm0,%ymm5
    vmovapd %ymm0,%ymm6
    vmovapd %ymm0,%ymm7
    vmovapd %ymm0,%ymm8
    vmovapd %ymm0,%ymm9
    vmovapd %ymm0,%ymm10
    vmovapd %ymm0,%ymm11
    vmovapd %ymm0,%ymm12

    xorl    %r9d,%r9d                       # scalar accumulator = 0

    movq    %rdx,%r11                       # %r11 walks the %rdx operand (%rdx is needed by mulx)
    movq    $0xfffffffffffff,%rax           # %rax = 2^52 - 1 (52-bit digit mask)

    movl    $10,%ebx                        # 10 iterations x 4 unrolled steps = 40 digits

.align  32
.Lloop10:
    # ---- step 0: digit at 0(%r11) ---------------------------------
    movq    0(%r11),%r13
    vpbroadcastq 0(%r11),%ymm1              # %ymm1 = current digit in all lanes
    movq    0(%rsi),%rdx
    mulxq   %r13,%r13,%r12                  # %r12:%r13 = 0(%rsi) * digit
    addq    %r13,%r9
    movq    %r12,%r10
    adcq    $0,%r10                         # %r10:%r9 = accumulator + product

    movq    %r8,%r13
    imulq   %r9,%r13
    andq    %rax,%r13                       # factor = (%r8 * acc) mod 2^52

    vmovq   %r13,%xmm2
    vpbroadcastq %xmm2,%ymm2                # %ymm2 = factor in all lanes
    movq    0(%rcx),%rdx
    mulxq   %r13,%r13,%r12                  # %r12:%r13 = 0(%rcx) * factor
    addq    %r13,%r9
    adcq    %r12,%r10

    shrq    $52,%r9                         # acc = (%r10:%r9) >> 52
    salq    $12,%r10
    orq     %r10,%r9

    leaq    -328(%rsp),%rsp                 # scratch: 10 vectors + 1 qword

    # Accumulate low 52 bits of the digit products.
    {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
    # Shift the 40-lane accumulator down by one qword: spill, append a
    # zero qword, reload from an offset of 8 bytes.
    vmovdqu %ymm3,0(%rsp)
    vmovdqu %ymm4,32(%rsp)
    vmovdqu %ymm5,64(%rsp)
    vmovdqu %ymm6,96(%rsp)
    vmovdqu %ymm7,128(%rsp)
    vmovdqu %ymm8,160(%rsp)
    vmovdqu %ymm9,192(%rsp)
    vmovdqu %ymm10,224(%rsp)
    vmovdqu %ymm11,256(%rsp)
    vmovdqu %ymm12,288(%rsp)
    movq    $0,320(%rsp)

    vmovdqu 8(%rsp),%ymm3
    vmovdqu 40(%rsp),%ymm4
    vmovdqu 72(%rsp),%ymm5
    vmovdqu 104(%rsp),%ymm6
    vmovdqu 136(%rsp),%ymm7
    vmovdqu 168(%rsp),%ymm8
    vmovdqu 200(%rsp),%ymm9
    vmovdqu 232(%rsp),%ymm10
    vmovdqu 264(%rsp),%ymm11
    vmovdqu 296(%rsp),%ymm12

    addq    8(%rsp),%r9                     # fold the new lowest lane into the scalar accumulator

    # Accumulate high 52 bits of the digit products.
    {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
    leaq    328(%rsp),%rsp                  # release scratch

    # ---- step 1: digit at 8(%r11); same sequence as step 0 --------
    movq    8(%r11),%r13
    vpbroadcastq 8(%r11),%ymm1
    movq    0(%rsi),%rdx
    mulxq   %r13,%r13,%r12
    addq    %r13,%r9
    movq    %r12,%r10
    adcq    $0,%r10

    movq    %r8,%r13
    imulq   %r9,%r13
    andq    %rax,%r13

    vmovq   %r13,%xmm2
    vpbroadcastq %xmm2,%ymm2
    movq    0(%rcx),%rdx
    mulxq   %r13,%r13,%r12
    addq    %r13,%r9
    adcq    %r12,%r10

    shrq    $52,%r9
    salq    $12,%r10
    orq     %r10,%r9

    leaq    -328(%rsp),%rsp

    {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
    vmovdqu %ymm3,0(%rsp)
    vmovdqu %ymm4,32(%rsp)
    vmovdqu %ymm5,64(%rsp)
    vmovdqu %ymm6,96(%rsp)
    vmovdqu %ymm7,128(%rsp)
    vmovdqu %ymm8,160(%rsp)
    vmovdqu %ymm9,192(%rsp)
    vmovdqu %ymm10,224(%rsp)
    vmovdqu %ymm11,256(%rsp)
    vmovdqu %ymm12,288(%rsp)
    movq    $0,320(%rsp)

    vmovdqu 8(%rsp),%ymm3
    vmovdqu 40(%rsp),%ymm4
    vmovdqu 72(%rsp),%ymm5
    vmovdqu 104(%rsp),%ymm6
    vmovdqu 136(%rsp),%ymm7
    vmovdqu 168(%rsp),%ymm8
    vmovdqu 200(%rsp),%ymm9
    vmovdqu 232(%rsp),%ymm10
    vmovdqu 264(%rsp),%ymm11
    vmovdqu 296(%rsp),%ymm12

    addq    8(%rsp),%r9

    {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
    leaq    328(%rsp),%rsp

    # ---- step 2: digit at 16(%r11); same sequence as step 0 -------
    movq    16(%r11),%r13
    vpbroadcastq 16(%r11),%ymm1
    movq    0(%rsi),%rdx
    mulxq   %r13,%r13,%r12
    addq    %r13,%r9
    movq    %r12,%r10
    adcq    $0,%r10

    movq    %r8,%r13
    imulq   %r9,%r13
    andq    %rax,%r13

    vmovq   %r13,%xmm2
    vpbroadcastq %xmm2,%ymm2
    movq    0(%rcx),%rdx
    mulxq   %r13,%r13,%r12
    addq    %r13,%r9
    adcq    %r12,%r10

    shrq    $52,%r9
    salq    $12,%r10
    orq     %r10,%r9

    leaq    -328(%rsp),%rsp

    {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
    vmovdqu %ymm3,0(%rsp)
    vmovdqu %ymm4,32(%rsp)
    vmovdqu %ymm5,64(%rsp)
    vmovdqu %ymm6,96(%rsp)
    vmovdqu %ymm7,128(%rsp)
    vmovdqu %ymm8,160(%rsp)
    vmovdqu %ymm9,192(%rsp)
    vmovdqu %ymm10,224(%rsp)
    vmovdqu %ymm11,256(%rsp)
    vmovdqu %ymm12,288(%rsp)
    movq    $0,320(%rsp)

    vmovdqu 8(%rsp),%ymm3
    vmovdqu 40(%rsp),%ymm4
    vmovdqu 72(%rsp),%ymm5
    vmovdqu 104(%rsp),%ymm6
    vmovdqu 136(%rsp),%ymm7
    vmovdqu 168(%rsp),%ymm8
    vmovdqu 200(%rsp),%ymm9
    vmovdqu 232(%rsp),%ymm10
    vmovdqu 264(%rsp),%ymm11
    vmovdqu 296(%rsp),%ymm12

    addq    8(%rsp),%r9

    {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
    leaq    328(%rsp),%rsp

    # ---- step 3: digit at 24(%r11); same sequence as step 0 -------
    movq    24(%r11),%r13
    vpbroadcastq 24(%r11),%ymm1
    movq    0(%rsi),%rdx
    mulxq   %r13,%r13,%r12
    addq    %r13,%r9
    movq    %r12,%r10
    adcq    $0,%r10

    movq    %r8,%r13
    imulq   %r9,%r13
    andq    %rax,%r13

    vmovq   %r13,%xmm2
    vpbroadcastq %xmm2,%ymm2
    movq    0(%rcx),%rdx
    mulxq   %r13,%r13,%r12
    addq    %r13,%r9
    adcq    %r12,%r10

    shrq    $52,%r9
    salq    $12,%r10
    orq     %r10,%r9

    leaq    -328(%rsp),%rsp

    {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
    vmovdqu %ymm3,0(%rsp)
    vmovdqu %ymm4,32(%rsp)
    vmovdqu %ymm5,64(%rsp)
    vmovdqu %ymm6,96(%rsp)
    vmovdqu %ymm7,128(%rsp)
    vmovdqu %ymm8,160(%rsp)
    vmovdqu %ymm9,192(%rsp)
    vmovdqu %ymm10,224(%rsp)
    vmovdqu %ymm11,256(%rsp)
    vmovdqu %ymm12,288(%rsp)
    movq    $0,320(%rsp)

    vmovdqu 8(%rsp),%ymm3
    vmovdqu 40(%rsp),%ymm4
    vmovdqu 72(%rsp),%ymm5
    vmovdqu 104(%rsp),%ymm6
    vmovdqu 136(%rsp),%ymm7
    vmovdqu 168(%rsp),%ymm8
    vmovdqu 200(%rsp),%ymm9
    vmovdqu 232(%rsp),%ymm10
    vmovdqu 264(%rsp),%ymm11
    vmovdqu 296(%rsp),%ymm12

    addq    8(%rsp),%r9

    {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
    {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
    {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
    {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
    {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
    {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
    {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
    {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
    {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
    {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12

    {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
    {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
    {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
    {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
    {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
    {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
    {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
    {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
    {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
    {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
    leaq    328(%rsp),%rsp
    leaq    32(%r11),%r11                   # advance by the 4 digits just consumed
    decl    %ebx
    jne     .Lloop10

    # ---- normalization to 52-bit digits ---------------------------
    # Put the scalar accumulator into the lowest qword lane of %ymm3.
    vmovq   %r9,%xmm0
    vpbroadcastq %xmm0,%ymm0
    vpblendd $3,%ymm0,%ymm3,%ymm3

    # Scratch: 0..319 = raw lanes, 320..639 = per-lane carries moved up one lane.
    leaq    -640(%rsp),%rsp
    vmovupd %ymm3,0(%rsp)
    vmovupd %ymm4,32(%rsp)
    vmovupd %ymm5,64(%rsp)
    vmovupd %ymm6,96(%rsp)
    vmovupd %ymm7,128(%rsp)
    vmovupd %ymm8,160(%rsp)
    vmovupd %ymm9,192(%rsp)
    vmovupd %ymm10,224(%rsp)
    vmovupd %ymm11,256(%rsp)
    vmovupd %ymm12,288(%rsp)

    # carry[i] = lane[i] >> 52
    vpsrlq  $52,%ymm3,%ymm3
    vpsrlq  $52,%ymm4,%ymm4
    vpsrlq  $52,%ymm5,%ymm5
    vpsrlq  $52,%ymm6,%ymm6
    vpsrlq  $52,%ymm7,%ymm7
    vpsrlq  $52,%ymm8,%ymm8
    vpsrlq  $52,%ymm9,%ymm9
    vpsrlq  $52,%ymm10,%ymm10
    vpsrlq  $52,%ymm11,%ymm11
    vpsrlq  $52,%ymm12,%ymm12

    # Move every carry up one lane: vpermq $144 shifts lanes within a
    # register, vpermq $3 + vblendpd $1 bring in the top lane of the
    # previous register as the new lane 0.
    vpermq  $144,%ymm12,%ymm12
    vpermq  $3,%ymm11,%ymm13
    vblendpd $1,%ymm13,%ymm12,%ymm12

    vpermq  $144,%ymm11,%ymm11
    vpermq  $3,%ymm10,%ymm13
    vblendpd $1,%ymm13,%ymm11,%ymm11

    vpermq  $144,%ymm10,%ymm10
    vpermq  $3,%ymm9,%ymm13
    vblendpd $1,%ymm13,%ymm10,%ymm10

    vpermq  $144,%ymm9,%ymm9
    vpermq  $3,%ymm8,%ymm13
    vblendpd $1,%ymm13,%ymm9,%ymm9

    vpermq  $144,%ymm8,%ymm8
    vpermq  $3,%ymm7,%ymm13
    vblendpd $1,%ymm13,%ymm8,%ymm8

    vpermq  $144,%ymm7,%ymm7
    vpermq  $3,%ymm6,%ymm13
    vblendpd $1,%ymm13,%ymm7,%ymm7

    vpermq  $144,%ymm6,%ymm6
    vpermq  $3,%ymm5,%ymm13
    vblendpd $1,%ymm13,%ymm6,%ymm6

    vpermq  $144,%ymm5,%ymm5
    vpermq  $3,%ymm4,%ymm13
    vblendpd $1,%ymm13,%ymm5,%ymm5

    vpermq  $144,%ymm4,%ymm4
    vpermq  $3,%ymm3,%ymm13
    vblendpd $1,%ymm13,%ymm4,%ymm4

    vpermq  $144,%ymm3,%ymm3
    vpand   .Lhigh64x3(%rip),%ymm3,%ymm3    # nothing carries into lane 0

    vmovupd %ymm3,320(%rsp)
    vmovupd %ymm4,352(%rsp)
    vmovupd %ymm5,384(%rsp)
    vmovupd %ymm6,416(%rsp)
    vmovupd %ymm7,448(%rsp)
    vmovupd %ymm8,480(%rsp)
    vmovupd %ymm9,512(%rsp)
    vmovupd %ymm10,544(%rsp)
    vmovupd %ymm11,576(%rsp)
    vmovupd %ymm12,608(%rsp)

    vmovupd 0(%rsp),%ymm3
    vmovupd 32(%rsp),%ymm4
    vmovupd 64(%rsp),%ymm5
    vmovupd 96(%rsp),%ymm6
    vmovupd 128(%rsp),%ymm7
    vmovupd 160(%rsp),%ymm8
    vmovupd 192(%rsp),%ymm9
    vmovupd 224(%rsp),%ymm10
    vmovupd 256(%rsp),%ymm11
    vmovupd 288(%rsp),%ymm12

    # lane = (lane & mask52) + carry from the lane below
    vpand   .Lmask52x4(%rip),%ymm3,%ymm3
    vpand   .Lmask52x4(%rip),%ymm4,%ymm4
    vpand   .Lmask52x4(%rip),%ymm5,%ymm5
    vpand   .Lmask52x4(%rip),%ymm6,%ymm6
    vpand   .Lmask52x4(%rip),%ymm7,%ymm7
    vpand   .Lmask52x4(%rip),%ymm8,%ymm8
    vpand   .Lmask52x4(%rip),%ymm9,%ymm9
    vpand   .Lmask52x4(%rip),%ymm10,%ymm10
    vpand   .Lmask52x4(%rip),%ymm11,%ymm11
    vpand   .Lmask52x4(%rip),%ymm12,%ymm12

    vpaddq  320(%rsp),%ymm3,%ymm3
    vpaddq  352(%rsp),%ymm4,%ymm4
    vpaddq  384(%rsp),%ymm5,%ymm5
    vpaddq  416(%rsp),%ymm6,%ymm6
    vpaddq  448(%rsp),%ymm7,%ymm7
    vpaddq  480(%rsp),%ymm8,%ymm8
    vpaddq  512(%rsp),%ymm9,%ymm9
    vpaddq  544(%rsp),%ymm10,%ymm10
    vpaddq  576(%rsp),%ymm11,%ymm11
    vpaddq  608(%rsp),%ymm12,%ymm12

    leaq    640(%rsp),%rsp

    # Bitmask of lanes that exceed mask52, one bit per lane, packed
    # 8 lanes per byte into %r14b,%r13b,%r12b,%r11b,%r10b (low to high).
    vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
    vmovmskpd %ymm13,%r14d
    vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
    vmovmskpd %ymm13,%r13d
    shlb    $4,%r13b
    orb     %r13b,%r14b

    vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
    vmovmskpd %ymm13,%r13d
    vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
    vmovmskpd %ymm13,%r12d
    shlb    $4,%r12b
    orb     %r12b,%r13b

    vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
    vmovmskpd %ymm13,%r12d
    vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
    vmovmskpd %ymm13,%r11d
    shlb    $4,%r11b
    orb     %r11b,%r12b

    vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
    vmovmskpd %ymm13,%r11d
    vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
    vmovmskpd %ymm13,%r10d
    shlb    $4,%r10b
    orb     %r10b,%r11b

    vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
    vmovmskpd %ymm13,%r10d
    vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
    vmovmskpd %ymm13,%r9d
    shlb    $4,%r9b
    orb     %r9b,%r10b

    # Shift the 40-bit "exceeds" mask left by one bit (a carry affects the next lane).
    addb    %r14b,%r14b
    adcb    %r13b,%r13b
    adcb    %r12b,%r12b
    adcb    %r11b,%r11b
    adcb    %r10b,%r10b

    # Bitmask of lanes equal to mask52 (a carry-in would ripple through
    # them), packed into %r9b,%r8b,%dl,%cl,%bl (low to high).
    vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
    vmovmskpd %ymm13,%r9d
    vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
    vmovmskpd %ymm13,%r8d
    shlb    $4,%r8b
    orb     %r8b,%r9b

    vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
    vmovmskpd %ymm13,%r8d
    vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
    vmovmskpd %ymm13,%edx
    shlb    $4,%dl
    orb     %dl,%r8b

    vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
    vmovmskpd %ymm13,%edx
    vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
    vmovmskpd %ymm13,%ecx
    shlb    $4,%cl
    orb     %cl,%dl

    vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
    vmovmskpd %ymm13,%ecx
    vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
    vmovmskpd %ymm13,%ebx
    shlb    $4,%bl
    orb     %bl,%cl

    vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
    vmovmskpd %ymm13,%ebx
    vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
    vmovmskpd %ymm13,%eax
    shlb    $4,%al
    orb     %al,%bl

    # Ripple the carries through the "equal" lanes with one multi-byte
    # add, then XOR with the "equal" mask to get the lanes to adjust.
    addb    %r9b,%r14b
    adcb    %r8b,%r13b
    adcb    %dl,%r12b
    adcb    %cl,%r11b
    adcb    %bl,%r10b

    xorb    %r9b,%r14b
    xorb    %r8b,%r13b
    xorb    %dl,%r12b
    xorb    %cl,%r11b
    xorb    %bl,%r10b

    pushq   %r9                             # %r8/%r9 are reused as LUT base / nibble index below
    pushq   %r8

    leaq    .Lkmasklut(%rip),%r8

    # For each vector: pick the 4-bit lane mask, expand it through the
    # LUT (32 bytes per entry, hence the shift by 5) and replace the
    # selected lanes by lane - mask52.
    movb    %r14b,%r9b
    andq    $0xf,%r14
    vpsubq  .Lmask52x4(%rip),%ymm3,%ymm13
    shlq    $5,%r14
    vmovapd (%r8,%r14,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm3,%ymm3

    shrb    $4,%r9b
    andq    $0xf,%r9
    vpsubq  .Lmask52x4(%rip),%ymm4,%ymm13
    shlq    $5,%r9
    vmovapd (%r8,%r9,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm4,%ymm4

    movb    %r13b,%r9b
    andq    $0xf,%r13
    vpsubq  .Lmask52x4(%rip),%ymm5,%ymm13
    shlq    $5,%r13
    vmovapd (%r8,%r13,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm5,%ymm5

    shrb    $4,%r9b
    andq    $0xf,%r9
    vpsubq  .Lmask52x4(%rip),%ymm6,%ymm13
    shlq    $5,%r9
    vmovapd (%r8,%r9,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm6,%ymm6

    movb    %r12b,%r9b
    andq    $0xf,%r12
    vpsubq  .Lmask52x4(%rip),%ymm7,%ymm13
    shlq    $5,%r12
    vmovapd (%r8,%r12,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm7,%ymm7

    shrb    $4,%r9b
    andq    $0xf,%r9
    vpsubq  .Lmask52x4(%rip),%ymm8,%ymm13
    shlq    $5,%r9
    vmovapd (%r8,%r9,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm8,%ymm8

    movb    %r11b,%r9b
    andq    $0xf,%r11
    vpsubq  .Lmask52x4(%rip),%ymm9,%ymm13
    shlq    $5,%r11
    vmovapd (%r8,%r11,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm9,%ymm9

    shrb    $4,%r9b
    andq    $0xf,%r9
    vpsubq  .Lmask52x4(%rip),%ymm10,%ymm13
    shlq    $5,%r9
    vmovapd (%r8,%r9,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm10,%ymm10

    movb    %r10b,%r9b
    andq    $0xf,%r10
    vpsubq  .Lmask52x4(%rip),%ymm11,%ymm13
    shlq    $5,%r10
    vmovapd (%r8,%r10,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm11,%ymm11

    shrb    $4,%r9b
    andq    $0xf,%r9
    vpsubq  .Lmask52x4(%rip),%ymm12,%ymm13
    shlq    $5,%r9
    vmovapd (%r8,%r9,1),%ymm14
    vblendvpd %ymm14,%ymm13,%ymm12,%ymm12

    popq    %r8
    popq    %r9

    # Final reduction of every lane to 52 bits.
    vpand   .Lmask52x4(%rip),%ymm3,%ymm3
    vpand   .Lmask52x4(%rip),%ymm4,%ymm4
    vpand   .Lmask52x4(%rip),%ymm5,%ymm5
    vpand   .Lmask52x4(%rip),%ymm6,%ymm6
    vpand   .Lmask52x4(%rip),%ymm7,%ymm7
    vpand   .Lmask52x4(%rip),%ymm8,%ymm8
    vpand   .Lmask52x4(%rip),%ymm9,%ymm9

    vpand   .Lmask52x4(%rip),%ymm10,%ymm10
    vpand   .Lmask52x4(%rip),%ymm11,%ymm11
    vpand   .Lmask52x4(%rip),%ymm12,%ymm12

    # Store the 40 result digits to the destination.
    vmovdqu %ymm3,0(%rdi)
    vmovdqu %ymm4,32(%rdi)
    vmovdqu %ymm5,64(%rdi)
    vmovdqu %ymm6,96(%rdi)
    vmovdqu %ymm7,128(%rdi)
    vmovdqu %ymm8,160(%rdi)
    vmovdqu %ymm9,192(%rdi)
    vmovdqu %ymm10,224(%rdi)
    vmovdqu %ymm11,256(%rdi)
    vmovdqu %ymm12,288(%rdi)

    vzeroupper                              # clear upper YMM halves before returning
    # Reload the six saved registers (pushed in order rbx..r15, so r15 is at 0).
    leaq    (%rsp),%rax
.cfi_def_cfa_register %rax
    movq    0(%rax),%r15
.cfi_restore %r15
    movq    8(%rax),%r14
.cfi_restore %r14
    movq    16(%rax),%r13
.cfi_restore %r13
    movq    24(%rax),%r12
.cfi_restore %r12
    movq    32(%rax),%rbp
.cfi_restore %rbp
    movq    40(%rax),%rbx
.cfi_restore %rbx
    leaq    48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lossl_rsaz_amm52x40_x1_avxifma256_epilogue:

    .byte   0xf3,0xc3                       # rep ret
.cfi_endproc
.size   ossl_rsaz_amm52x40_x1_avxifma256, .-ossl_rsaz_amm52x40_x1_avxifma256
|
|
.section .rodata
.align  32
# Four qword lanes, each 2^52 - 1: the 52-bit digit mask used with
# vpand / vpcmpgtq / vpcmpeqq / vpsubq in the normalization code.
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
|
|
# Lane mask that clears qword lane 0 and keeps lanes 1..3; applied with
# vpand to the lowest carry vector so that nothing is carried into lane 0.
.Lhigh64x3:
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
|
|
# 16-entry lookup table that expands a 4-bit lane mask into a 4 x 64-bit
# vector mask: entry i (32 bytes, at offset i << 5) has qword lane j set
# to all-ones iff bit j of i is set.  It is read with vmovapd, which
# requires 32-byte alignment; the explicit .align makes that independent
# of the size of the tables placed before it.
.align  32
.Lkmasklut:

    # 0b0000
    .quad   0x0
    .quad   0x0
    .quad   0x0
    .quad   0x0

    # 0b0001
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0x0
    .quad   0x0

    # 0b0010
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0x0

    # 0b0011
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0x0

    # 0b0100
    .quad   0x0
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0x0

    # 0b0101
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0x0

    # 0b0110
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0x0

    # 0b0111
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0x0

    # 0b1000
    .quad   0x0
    .quad   0x0
    .quad   0x0
    .quad   0xffffffffffffffff

    # 0b1001
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0x0
    .quad   0xffffffffffffffff

    # 0b1010
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0xffffffffffffffff

    # 0b1011
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0xffffffffffffffff

    # 0b1100
    .quad   0x0
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff

    # 0b1101
    .quad   0xffffffffffffffff
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff

    # 0b1110
    .quad   0x0
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff

    # 0b1111
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
    .quad   0xffffffffffffffff
|
|
.text
|
|
|
|
.globl ossl_rsaz_amm52x40_x2_avxifma256
|
|
.type ossl_rsaz_amm52x40_x2_avxifma256,@function
|
|
.align 32
|
|
ossl_rsaz_amm52x40_x2_avxifma256:
|
|
.cfi_startproc
|
|
.byte 243,15,30,250
|
|
pushq %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbx,-16
|
|
pushq %rbp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbp,-24
|
|
pushq %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r12,-32
|
|
pushq %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r13,-40
|
|
pushq %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r14,-48
|
|
pushq %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r15,-56
|
|
|
|
vpxor %ymm0,%ymm0,%ymm0
|
|
vmovapd %ymm0,%ymm3
|
|
vmovapd %ymm0,%ymm4
|
|
vmovapd %ymm0,%ymm5
|
|
vmovapd %ymm0,%ymm6
|
|
vmovapd %ymm0,%ymm7
|
|
vmovapd %ymm0,%ymm8
|
|
vmovapd %ymm0,%ymm9
|
|
vmovapd %ymm0,%ymm10
|
|
vmovapd %ymm0,%ymm11
|
|
vmovapd %ymm0,%ymm12
|
|
|
|
xorl %r9d,%r9d
|
|
|
|
movq %rdx,%r11
|
|
movq $0xfffffffffffff,%rax
|
|
|
|
movl $40,%ebx
|
|
|
|
.align 32
|
|
.Lloop40:
|
|
movq 0(%r11),%r13
|
|
|
|
vpbroadcastq 0(%r11),%ymm1
|
|
movq 0(%rsi),%rdx
|
|
mulxq %r13,%r13,%r12
|
|
addq %r13,%r9
|
|
movq %r12,%r10
|
|
adcq $0,%r10
|
|
|
|
movq (%r8),%r13
|
|
imulq %r9,%r13
|
|
andq %rax,%r13
|
|
|
|
vmovq %r13,%xmm2
|
|
vpbroadcastq %xmm2,%ymm2
|
|
movq 0(%rcx),%rdx
|
|
mulxq %r13,%r13,%r12
|
|
addq %r13,%r9
|
|
adcq %r12,%r10
|
|
|
|
shrq $52,%r9
|
|
salq $12,%r10
|
|
orq %r10,%r9
|
|
|
|
leaq -328(%rsp),%rsp
|
|
|
|
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
|
|
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
|
|
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
|
|
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
|
|
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
|
|
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
|
|
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
|
|
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
|
|
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
|
|
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
|
|
|
|
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
|
|
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
|
|
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
|
|
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
|
|
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
|
|
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
|
|
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
|
|
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
|
|
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
|
|
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
|
|
vmovdqu %ymm3,0(%rsp)
|
|
vmovdqu %ymm4,32(%rsp)
|
|
vmovdqu %ymm5,64(%rsp)
|
|
vmovdqu %ymm6,96(%rsp)
|
|
vmovdqu %ymm7,128(%rsp)
|
|
vmovdqu %ymm8,160(%rsp)
|
|
vmovdqu %ymm9,192(%rsp)
|
|
vmovdqu %ymm10,224(%rsp)
|
|
vmovdqu %ymm11,256(%rsp)
|
|
vmovdqu %ymm12,288(%rsp)
|
|
movq $0,320(%rsp)
|
|
|
|
vmovdqu 8(%rsp),%ymm3
|
|
vmovdqu 40(%rsp),%ymm4
|
|
vmovdqu 72(%rsp),%ymm5
|
|
vmovdqu 104(%rsp),%ymm6
|
|
vmovdqu 136(%rsp),%ymm7
|
|
vmovdqu 168(%rsp),%ymm8
|
|
vmovdqu 200(%rsp),%ymm9
|
|
vmovdqu 232(%rsp),%ymm10
|
|
vmovdqu 264(%rsp),%ymm11
|
|
vmovdqu 296(%rsp),%ymm12
|
|
|
|
addq 8(%rsp),%r9
|
|
|
|
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
|
|
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
|
|
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
|
|
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
|
|
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
|
|
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
|
|
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
|
|
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
|
|
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
|
|
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
|
|
|
|
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
|
|
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
|
|
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
|
|
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
|
|
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
|
|
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
|
|
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
|
|
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
|
|
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
|
|
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
|
|
leaq 328(%rsp),%rsp
|
|
leaq 8(%r11),%r11
|
|
decl %ebx
|
|
jne .Lloop40
|
|
|
|
pushq %r11
|
|
pushq %rsi
|
|
pushq %rcx
|
|
pushq %r8
|
|
|
|
vmovq %r9,%xmm0
|
|
vpbroadcastq %xmm0,%ymm0
|
|
vpblendd $3,%ymm0,%ymm3,%ymm3
|
|
|
|
leaq -640(%rsp),%rsp
|
|
vmovupd %ymm3,0(%rsp)
|
|
vmovupd %ymm4,32(%rsp)
|
|
vmovupd %ymm5,64(%rsp)
|
|
vmovupd %ymm6,96(%rsp)
|
|
vmovupd %ymm7,128(%rsp)
|
|
vmovupd %ymm8,160(%rsp)
|
|
vmovupd %ymm9,192(%rsp)
|
|
vmovupd %ymm10,224(%rsp)
|
|
vmovupd %ymm11,256(%rsp)
|
|
vmovupd %ymm12,288(%rsp)
|
|
|
|
|
|
|
|
vpsrlq $52,%ymm3,%ymm3
|
|
vpsrlq $52,%ymm4,%ymm4
|
|
vpsrlq $52,%ymm5,%ymm5
|
|
vpsrlq $52,%ymm6,%ymm6
|
|
vpsrlq $52,%ymm7,%ymm7
|
|
vpsrlq $52,%ymm8,%ymm8
|
|
vpsrlq $52,%ymm9,%ymm9
|
|
vpsrlq $52,%ymm10,%ymm10
|
|
vpsrlq $52,%ymm11,%ymm11
|
|
vpsrlq $52,%ymm12,%ymm12
|
|
|
|
|
|
vpermq $144,%ymm12,%ymm12
|
|
vpermq $3,%ymm11,%ymm13
|
|
vblendpd $1,%ymm13,%ymm12,%ymm12
|
|
|
|
vpermq $144,%ymm11,%ymm11
|
|
vpermq $3,%ymm10,%ymm13
|
|
vblendpd $1,%ymm13,%ymm11,%ymm11
|
|
|
|
vpermq $144,%ymm10,%ymm10
|
|
vpermq $3,%ymm9,%ymm13
|
|
vblendpd $1,%ymm13,%ymm10,%ymm10
|
|
|
|
vpermq $144,%ymm9,%ymm9
|
|
vpermq $3,%ymm8,%ymm13
|
|
vblendpd $1,%ymm13,%ymm9,%ymm9
|
|
|
|
vpermq $144,%ymm8,%ymm8
|
|
vpermq $3,%ymm7,%ymm13
|
|
vblendpd $1,%ymm13,%ymm8,%ymm8
|
|
|
|
vpermq $144,%ymm7,%ymm7
|
|
vpermq $3,%ymm6,%ymm13
|
|
vblendpd $1,%ymm13,%ymm7,%ymm7
|
|
|
|
vpermq $144,%ymm6,%ymm6
|
|
vpermq $3,%ymm5,%ymm13
|
|
vblendpd $1,%ymm13,%ymm6,%ymm6
|
|
|
|
vpermq $144,%ymm5,%ymm5
|
|
vpermq $3,%ymm4,%ymm13
|
|
vblendpd $1,%ymm13,%ymm5,%ymm5
|
|
|
|
vpermq $144,%ymm4,%ymm4
|
|
vpermq $3,%ymm3,%ymm13
|
|
vblendpd $1,%ymm13,%ymm4,%ymm4
|
|
|
|
vpermq $144,%ymm3,%ymm3
|
|
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
|
|
|
|
vmovupd %ymm3,320(%rsp)
|
|
vmovupd %ymm4,352(%rsp)
|
|
vmovupd %ymm5,384(%rsp)
|
|
vmovupd %ymm6,416(%rsp)
|
|
vmovupd %ymm7,448(%rsp)
|
|
vmovupd %ymm8,480(%rsp)
|
|
vmovupd %ymm9,512(%rsp)
|
|
vmovupd %ymm10,544(%rsp)
|
|
vmovupd %ymm11,576(%rsp)
|
|
vmovupd %ymm12,608(%rsp)
|
|
|
|
vmovupd 0(%rsp),%ymm3
|
|
vmovupd 32(%rsp),%ymm4
|
|
vmovupd 64(%rsp),%ymm5
|
|
vmovupd 96(%rsp),%ymm6
|
|
vmovupd 128(%rsp),%ymm7
|
|
vmovupd 160(%rsp),%ymm8
|
|
vmovupd 192(%rsp),%ymm9
|
|
vmovupd 224(%rsp),%ymm10
|
|
vmovupd 256(%rsp),%ymm11
|
|
vmovupd 288(%rsp),%ymm12
|
|
|
|
|
|
vpand .Lmask52x4(%rip),%ymm3,%ymm3
|
|
vpand .Lmask52x4(%rip),%ymm4,%ymm4
|
|
vpand .Lmask52x4(%rip),%ymm5,%ymm5
|
|
vpand .Lmask52x4(%rip),%ymm6,%ymm6
|
|
vpand .Lmask52x4(%rip),%ymm7,%ymm7
|
|
vpand .Lmask52x4(%rip),%ymm8,%ymm8
|
|
vpand .Lmask52x4(%rip),%ymm9,%ymm9
|
|
vpand .Lmask52x4(%rip),%ymm10,%ymm10
|
|
vpand .Lmask52x4(%rip),%ymm11,%ymm11
|
|
vpand .Lmask52x4(%rip),%ymm12,%ymm12
|
|
|
|
|
|
vpaddq 320(%rsp),%ymm3,%ymm3
|
|
vpaddq 352(%rsp),%ymm4,%ymm4
|
|
vpaddq 384(%rsp),%ymm5,%ymm5
|
|
vpaddq 416(%rsp),%ymm6,%ymm6
|
|
vpaddq 448(%rsp),%ymm7,%ymm7
|
|
vpaddq 480(%rsp),%ymm8,%ymm8
|
|
vpaddq 512(%rsp),%ymm9,%ymm9
|
|
vpaddq 544(%rsp),%ymm10,%ymm10
|
|
vpaddq 576(%rsp),%ymm11,%ymm11
|
|
vpaddq 608(%rsp),%ymm12,%ymm12
|
|
|
|
leaq 640(%rsp),%rsp
|
|
|
|
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
|
|
vmovmskpd %ymm13,%r14d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
|
|
vmovmskpd %ymm13,%r13d
|
|
shlb $4,%r13b
|
|
orb %r13b,%r14b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
|
|
vmovmskpd %ymm13,%r13d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
|
|
vmovmskpd %ymm13,%r12d
|
|
shlb $4,%r12b
|
|
orb %r12b,%r13b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
|
|
vmovmskpd %ymm13,%r12d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
|
|
vmovmskpd %ymm13,%r11d
|
|
shlb $4,%r11b
|
|
orb %r11b,%r12b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
|
|
vmovmskpd %ymm13,%r11d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
|
|
vmovmskpd %ymm13,%r10d
|
|
shlb $4,%r10b
|
|
orb %r10b,%r11b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
|
|
vmovmskpd %ymm13,%r10d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
|
|
vmovmskpd %ymm13,%r9d
|
|
shlb $4,%r9b
|
|
orb %r9b,%r10b
|
|
|
|
addb %r14b,%r14b
|
|
adcb %r13b,%r13b
|
|
adcb %r12b,%r12b
|
|
adcb %r11b,%r11b
|
|
adcb %r10b,%r10b
|
|
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
|
|
vmovmskpd %ymm13,%r9d
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
|
|
vmovmskpd %ymm13,%r8d
|
|
shlb $4,%r8b
|
|
orb %r8b,%r9b
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
|
|
vmovmskpd %ymm13,%r8d
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
|
|
vmovmskpd %ymm13,%edx
|
|
shlb $4,%dl
|
|
orb %dl,%r8b
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
|
|
vmovmskpd %ymm13,%edx
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
|
|
vmovmskpd %ymm13,%ecx
|
|
shlb $4,%cl
|
|
orb %cl,%dl
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
|
|
vmovmskpd %ymm13,%ecx
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
|
|
vmovmskpd %ymm13,%ebx
|
|
shlb $4,%bl
|
|
orb %bl,%cl
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
|
|
vmovmskpd %ymm13,%ebx
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
|
|
vmovmskpd %ymm13,%eax
|
|
shlb $4,%al
|
|
orb %al,%bl
|
|
|
|
addb %r9b,%r14b
|
|
adcb %r8b,%r13b
|
|
adcb %dl,%r12b
|
|
adcb %cl,%r11b
|
|
adcb %bl,%r10b
|
|
|
|
xorb %r9b,%r14b
|
|
xorb %r8b,%r13b
|
|
xorb %dl,%r12b
|
|
xorb %cl,%r11b
|
|
xorb %bl,%r10b
|
|
|
|
pushq %r9
|
|
pushq %r8
|
|
|
|
leaq .Lkmasklut(%rip),%r8
|
|
|
|
movb %r14b,%r9b
|
|
andq $0xf,%r14
|
|
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
|
|
shlq $5,%r14
|
|
vmovapd (%r8,%r14,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
|
|
|
|
movb %r13b,%r9b
|
|
andq $0xf,%r13
|
|
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
|
|
shlq $5,%r13
|
|
vmovapd (%r8,%r13,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
|
|
|
|
movb %r12b,%r9b
|
|
andq $0xf,%r12
|
|
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
|
|
shlq $5,%r12
|
|
vmovapd (%r8,%r12,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
|
|
|
|
movb %r11b,%r9b
|
|
andq $0xf,%r11
|
|
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
|
|
shlq $5,%r11
|
|
vmovapd (%r8,%r11,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
|
|
|
|
movb %r10b,%r9b
|
|
andq $0xf,%r10
|
|
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
|
|
shlq $5,%r10
|
|
vmovapd (%r8,%r10,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
|
|
|
|
popq %r8
|
|
popq %r9
|
|
|
|
vpand .Lmask52x4(%rip),%ymm3,%ymm3
|
|
vpand .Lmask52x4(%rip),%ymm4,%ymm4
|
|
vpand .Lmask52x4(%rip),%ymm5,%ymm5
|
|
vpand .Lmask52x4(%rip),%ymm6,%ymm6
|
|
vpand .Lmask52x4(%rip),%ymm7,%ymm7
|
|
vpand .Lmask52x4(%rip),%ymm8,%ymm8
|
|
vpand .Lmask52x4(%rip),%ymm9,%ymm9
|
|
|
|
vpand .Lmask52x4(%rip),%ymm10,%ymm10
|
|
vpand .Lmask52x4(%rip),%ymm11,%ymm11
|
|
vpand .Lmask52x4(%rip),%ymm12,%ymm12
|
|
|
|
popq %r8
|
|
popq %rcx
|
|
popq %rsi
|
|
popq %r11
|
|
|
|
vmovdqu %ymm3,0(%rdi)
|
|
vmovdqu %ymm4,32(%rdi)
|
|
vmovdqu %ymm5,64(%rdi)
|
|
vmovdqu %ymm6,96(%rdi)
|
|
vmovdqu %ymm7,128(%rdi)
|
|
vmovdqu %ymm8,160(%rdi)
|
|
vmovdqu %ymm9,192(%rdi)
|
|
vmovdqu %ymm10,224(%rdi)
|
|
vmovdqu %ymm11,256(%rdi)
|
|
vmovdqu %ymm12,288(%rdi)
|
|
|
|
xorl %r9d,%r9d
|
|
|
|
movq $0xfffffffffffff,%rax
|
|
|
|
movl $40,%ebx
|
|
|
|
vpxor %ymm0,%ymm0,%ymm0
|
|
vmovapd %ymm0,%ymm3
|
|
vmovapd %ymm0,%ymm4
|
|
vmovapd %ymm0,%ymm5
|
|
vmovapd %ymm0,%ymm6
|
|
vmovapd %ymm0,%ymm7
|
|
vmovapd %ymm0,%ymm8
|
|
vmovapd %ymm0,%ymm9
|
|
vmovapd %ymm0,%ymm10
|
|
vmovapd %ymm0,%ymm11
|
|
vmovapd %ymm0,%ymm12
|
|
.align 32
|
|
.Lloop40_1:
|
|
movq 0(%r11),%r13
|
|
|
|
vpbroadcastq 0(%r11),%ymm1
|
|
movq 320(%rsi),%rdx
|
|
mulxq %r13,%r13,%r12
|
|
addq %r13,%r9
|
|
movq %r12,%r10
|
|
adcq $0,%r10
|
|
|
|
movq 8(%r8),%r13
|
|
imulq %r9,%r13
|
|
andq %rax,%r13
|
|
|
|
vmovq %r13,%xmm2
|
|
vpbroadcastq %xmm2,%ymm2
|
|
movq 320(%rcx),%rdx
|
|
mulxq %r13,%r13,%r12
|
|
addq %r13,%r9
|
|
adcq %r12,%r10
|
|
|
|
shrq $52,%r9
|
|
salq $12,%r10
|
|
orq %r10,%r9
|
|
|
|
leaq -328(%rsp),%rsp
|
|
|
|
{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm3
|
|
{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm4
|
|
{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm5
|
|
{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm6
|
|
{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm7
|
|
{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm8
|
|
{vex} vpmadd52luq 512(%rsi),%ymm1,%ymm9
|
|
{vex} vpmadd52luq 544(%rsi),%ymm1,%ymm10
|
|
{vex} vpmadd52luq 576(%rsi),%ymm1,%ymm11
|
|
{vex} vpmadd52luq 608(%rsi),%ymm1,%ymm12
|
|
|
|
{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm3
|
|
{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm4
|
|
{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm5
|
|
{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm6
|
|
{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm7
|
|
{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm8
|
|
{vex} vpmadd52luq 512(%rcx),%ymm2,%ymm9
|
|
{vex} vpmadd52luq 544(%rcx),%ymm2,%ymm10
|
|
{vex} vpmadd52luq 576(%rcx),%ymm2,%ymm11
|
|
{vex} vpmadd52luq 608(%rcx),%ymm2,%ymm12
|
|
vmovdqu %ymm3,0(%rsp)
|
|
vmovdqu %ymm4,32(%rsp)
|
|
vmovdqu %ymm5,64(%rsp)
|
|
vmovdqu %ymm6,96(%rsp)
|
|
vmovdqu %ymm7,128(%rsp)
|
|
vmovdqu %ymm8,160(%rsp)
|
|
vmovdqu %ymm9,192(%rsp)
|
|
vmovdqu %ymm10,224(%rsp)
|
|
vmovdqu %ymm11,256(%rsp)
|
|
vmovdqu %ymm12,288(%rsp)
|
|
movq $0,320(%rsp)
|
|
|
|
vmovdqu 8(%rsp),%ymm3
|
|
vmovdqu 40(%rsp),%ymm4
|
|
vmovdqu 72(%rsp),%ymm5
|
|
vmovdqu 104(%rsp),%ymm6
|
|
vmovdqu 136(%rsp),%ymm7
|
|
vmovdqu 168(%rsp),%ymm8
|
|
vmovdqu 200(%rsp),%ymm9
|
|
vmovdqu 232(%rsp),%ymm10
|
|
vmovdqu 264(%rsp),%ymm11
|
|
vmovdqu 296(%rsp),%ymm12
|
|
|
|
addq 8(%rsp),%r9
|
|
|
|
{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm3
|
|
{vex} vpmadd52huq 352(%rsi),%ymm1,%ymm4
|
|
{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm5
|
|
{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm6
|
|
{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm7
|
|
{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm8
|
|
{vex} vpmadd52huq 512(%rsi),%ymm1,%ymm9
|
|
{vex} vpmadd52huq 544(%rsi),%ymm1,%ymm10
|
|
{vex} vpmadd52huq 576(%rsi),%ymm1,%ymm11
|
|
{vex} vpmadd52huq 608(%rsi),%ymm1,%ymm12
|
|
|
|
{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm3
|
|
{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm4
|
|
{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm5
|
|
{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm6
|
|
{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm7
|
|
{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm8
|
|
{vex} vpmadd52huq 512(%rcx),%ymm2,%ymm9
|
|
{vex} vpmadd52huq 544(%rcx),%ymm2,%ymm10
|
|
{vex} vpmadd52huq 576(%rcx),%ymm2,%ymm11
|
|
{vex} vpmadd52huq 608(%rcx),%ymm2,%ymm12
|
|
leaq 328(%rsp),%rsp
|
|
leaq 8(%r11),%r11
|
|
decl %ebx
|
|
jne .Lloop40_1
|
|
|
|
vmovq %r9,%xmm0
|
|
vpbroadcastq %xmm0,%ymm0
|
|
vpblendd $3,%ymm0,%ymm3,%ymm3
|
|
|
|
leaq -640(%rsp),%rsp
|
|
vmovupd %ymm3,0(%rsp)
|
|
vmovupd %ymm4,32(%rsp)
|
|
vmovupd %ymm5,64(%rsp)
|
|
vmovupd %ymm6,96(%rsp)
|
|
vmovupd %ymm7,128(%rsp)
|
|
vmovupd %ymm8,160(%rsp)
|
|
vmovupd %ymm9,192(%rsp)
|
|
vmovupd %ymm10,224(%rsp)
|
|
vmovupd %ymm11,256(%rsp)
|
|
vmovupd %ymm12,288(%rsp)
|
|
|
|
|
|
|
|
vpsrlq $52,%ymm3,%ymm3
|
|
vpsrlq $52,%ymm4,%ymm4
|
|
vpsrlq $52,%ymm5,%ymm5
|
|
vpsrlq $52,%ymm6,%ymm6
|
|
vpsrlq $52,%ymm7,%ymm7
|
|
vpsrlq $52,%ymm8,%ymm8
|
|
vpsrlq $52,%ymm9,%ymm9
|
|
vpsrlq $52,%ymm10,%ymm10
|
|
vpsrlq $52,%ymm11,%ymm11
|
|
vpsrlq $52,%ymm12,%ymm12
|
|
|
|
|
|
vpermq $144,%ymm12,%ymm12
|
|
vpermq $3,%ymm11,%ymm13
|
|
vblendpd $1,%ymm13,%ymm12,%ymm12
|
|
|
|
vpermq $144,%ymm11,%ymm11
|
|
vpermq $3,%ymm10,%ymm13
|
|
vblendpd $1,%ymm13,%ymm11,%ymm11
|
|
|
|
vpermq $144,%ymm10,%ymm10
|
|
vpermq $3,%ymm9,%ymm13
|
|
vblendpd $1,%ymm13,%ymm10,%ymm10
|
|
|
|
vpermq $144,%ymm9,%ymm9
|
|
vpermq $3,%ymm8,%ymm13
|
|
vblendpd $1,%ymm13,%ymm9,%ymm9
|
|
|
|
vpermq $144,%ymm8,%ymm8
|
|
vpermq $3,%ymm7,%ymm13
|
|
vblendpd $1,%ymm13,%ymm8,%ymm8
|
|
|
|
vpermq $144,%ymm7,%ymm7
|
|
vpermq $3,%ymm6,%ymm13
|
|
vblendpd $1,%ymm13,%ymm7,%ymm7
|
|
|
|
vpermq $144,%ymm6,%ymm6
|
|
vpermq $3,%ymm5,%ymm13
|
|
vblendpd $1,%ymm13,%ymm6,%ymm6
|
|
|
|
vpermq $144,%ymm5,%ymm5
|
|
vpermq $3,%ymm4,%ymm13
|
|
vblendpd $1,%ymm13,%ymm5,%ymm5
|
|
|
|
vpermq $144,%ymm4,%ymm4
|
|
vpermq $3,%ymm3,%ymm13
|
|
vblendpd $1,%ymm13,%ymm4,%ymm4
|
|
|
|
vpermq $144,%ymm3,%ymm3
|
|
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
|
|
|
|
vmovupd %ymm3,320(%rsp)
|
|
vmovupd %ymm4,352(%rsp)
|
|
vmovupd %ymm5,384(%rsp)
|
|
vmovupd %ymm6,416(%rsp)
|
|
vmovupd %ymm7,448(%rsp)
|
|
vmovupd %ymm8,480(%rsp)
|
|
vmovupd %ymm9,512(%rsp)
|
|
vmovupd %ymm10,544(%rsp)
|
|
vmovupd %ymm11,576(%rsp)
|
|
vmovupd %ymm12,608(%rsp)
|
|
|
|
vmovupd 0(%rsp),%ymm3
|
|
vmovupd 32(%rsp),%ymm4
|
|
vmovupd 64(%rsp),%ymm5
|
|
vmovupd 96(%rsp),%ymm6
|
|
vmovupd 128(%rsp),%ymm7
|
|
vmovupd 160(%rsp),%ymm8
|
|
vmovupd 192(%rsp),%ymm9
|
|
vmovupd 224(%rsp),%ymm10
|
|
vmovupd 256(%rsp),%ymm11
|
|
vmovupd 288(%rsp),%ymm12
|
|
|
|
|
|
vpand .Lmask52x4(%rip),%ymm3,%ymm3
|
|
vpand .Lmask52x4(%rip),%ymm4,%ymm4
|
|
vpand .Lmask52x4(%rip),%ymm5,%ymm5
|
|
vpand .Lmask52x4(%rip),%ymm6,%ymm6
|
|
vpand .Lmask52x4(%rip),%ymm7,%ymm7
|
|
vpand .Lmask52x4(%rip),%ymm8,%ymm8
|
|
vpand .Lmask52x4(%rip),%ymm9,%ymm9
|
|
vpand .Lmask52x4(%rip),%ymm10,%ymm10
|
|
vpand .Lmask52x4(%rip),%ymm11,%ymm11
|
|
vpand .Lmask52x4(%rip),%ymm12,%ymm12
|
|
|
|
|
|
vpaddq 320(%rsp),%ymm3,%ymm3
|
|
vpaddq 352(%rsp),%ymm4,%ymm4
|
|
vpaddq 384(%rsp),%ymm5,%ymm5
|
|
vpaddq 416(%rsp),%ymm6,%ymm6
|
|
vpaddq 448(%rsp),%ymm7,%ymm7
|
|
vpaddq 480(%rsp),%ymm8,%ymm8
|
|
vpaddq 512(%rsp),%ymm9,%ymm9
|
|
vpaddq 544(%rsp),%ymm10,%ymm10
|
|
vpaddq 576(%rsp),%ymm11,%ymm11
|
|
vpaddq 608(%rsp),%ymm12,%ymm12
|
|
|
|
leaq 640(%rsp),%rsp
|
|
|
|
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
|
|
vmovmskpd %ymm13,%r14d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
|
|
vmovmskpd %ymm13,%r13d
|
|
shlb $4,%r13b
|
|
orb %r13b,%r14b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
|
|
vmovmskpd %ymm13,%r13d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
|
|
vmovmskpd %ymm13,%r12d
|
|
shlb $4,%r12b
|
|
orb %r12b,%r13b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
|
|
vmovmskpd %ymm13,%r12d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
|
|
vmovmskpd %ymm13,%r11d
|
|
shlb $4,%r11b
|
|
orb %r11b,%r12b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
|
|
vmovmskpd %ymm13,%r11d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
|
|
vmovmskpd %ymm13,%r10d
|
|
shlb $4,%r10b
|
|
orb %r10b,%r11b
|
|
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
|
|
vmovmskpd %ymm13,%r10d
|
|
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
|
|
vmovmskpd %ymm13,%r9d
|
|
shlb $4,%r9b
|
|
orb %r9b,%r10b
|
|
|
|
addb %r14b,%r14b
|
|
adcb %r13b,%r13b
|
|
adcb %r12b,%r12b
|
|
adcb %r11b,%r11b
|
|
adcb %r10b,%r10b
|
|
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
|
|
vmovmskpd %ymm13,%r9d
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
|
|
vmovmskpd %ymm13,%r8d
|
|
shlb $4,%r8b
|
|
orb %r8b,%r9b
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
|
|
vmovmskpd %ymm13,%r8d
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
|
|
vmovmskpd %ymm13,%edx
|
|
shlb $4,%dl
|
|
orb %dl,%r8b
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
|
|
vmovmskpd %ymm13,%edx
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
|
|
vmovmskpd %ymm13,%ecx
|
|
shlb $4,%cl
|
|
orb %cl,%dl
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
|
|
vmovmskpd %ymm13,%ecx
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
|
|
vmovmskpd %ymm13,%ebx
|
|
shlb $4,%bl
|
|
orb %bl,%cl
|
|
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
|
|
vmovmskpd %ymm13,%ebx
|
|
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
|
|
vmovmskpd %ymm13,%eax
|
|
shlb $4,%al
|
|
orb %al,%bl
|
|
|
|
addb %r9b,%r14b
|
|
adcb %r8b,%r13b
|
|
adcb %dl,%r12b
|
|
adcb %cl,%r11b
|
|
adcb %bl,%r10b
|
|
|
|
xorb %r9b,%r14b
|
|
xorb %r8b,%r13b
|
|
xorb %dl,%r12b
|
|
xorb %cl,%r11b
|
|
xorb %bl,%r10b
|
|
|
|
pushq %r9
|
|
pushq %r8
|
|
|
|
leaq .Lkmasklut(%rip),%r8
|
|
|
|
movb %r14b,%r9b
|
|
andq $0xf,%r14
|
|
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
|
|
shlq $5,%r14
|
|
vmovapd (%r8,%r14,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
|
|
|
|
movb %r13b,%r9b
|
|
andq $0xf,%r13
|
|
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
|
|
shlq $5,%r13
|
|
vmovapd (%r8,%r13,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
|
|
|
|
movb %r12b,%r9b
|
|
andq $0xf,%r12
|
|
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
|
|
shlq $5,%r12
|
|
vmovapd (%r8,%r12,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
|
|
|
|
movb %r11b,%r9b
|
|
andq $0xf,%r11
|
|
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
|
|
shlq $5,%r11
|
|
vmovapd (%r8,%r11,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
|
|
|
|
movb %r10b,%r9b
|
|
andq $0xf,%r10
|
|
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
|
|
shlq $5,%r10
|
|
vmovapd (%r8,%r10,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
|
|
|
|
shrb $4,%r9b
|
|
andq $0xf,%r9
|
|
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
|
|
shlq $5,%r9
|
|
vmovapd (%r8,%r9,1),%ymm14
|
|
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
|
|
|
|
popq %r8
|
|
popq %r9
|
|
|
|
vpand .Lmask52x4(%rip),%ymm3,%ymm3
|
|
vpand .Lmask52x4(%rip),%ymm4,%ymm4
|
|
vpand .Lmask52x4(%rip),%ymm5,%ymm5
|
|
vpand .Lmask52x4(%rip),%ymm6,%ymm6
|
|
vpand .Lmask52x4(%rip),%ymm7,%ymm7
|
|
vpand .Lmask52x4(%rip),%ymm8,%ymm8
|
|
vpand .Lmask52x4(%rip),%ymm9,%ymm9
|
|
|
|
vpand .Lmask52x4(%rip),%ymm10,%ymm10
|
|
vpand .Lmask52x4(%rip),%ymm11,%ymm11
|
|
vpand .Lmask52x4(%rip),%ymm12,%ymm12
|
|
|
|
vmovdqu %ymm3,320(%rdi)
|
|
vmovdqu %ymm4,352(%rdi)
|
|
vmovdqu %ymm5,384(%rdi)
|
|
vmovdqu %ymm6,416(%rdi)
|
|
vmovdqu %ymm7,448(%rdi)
|
|
vmovdqu %ymm8,480(%rdi)
|
|
vmovdqu %ymm9,512(%rdi)
|
|
vmovdqu %ymm10,544(%rdi)
|
|
vmovdqu %ymm11,576(%rdi)
|
|
vmovdqu %ymm12,608(%rdi)
|
|
|
|
vzeroupper
|
|
leaq (%rsp),%rax
|
|
.cfi_def_cfa_register %rax
|
|
movq 0(%rax),%r15
|
|
.cfi_restore %r15
|
|
movq 8(%rax),%r14
|
|
.cfi_restore %r14
|
|
movq 16(%rax),%r13
|
|
.cfi_restore %r13
|
|
movq 24(%rax),%r12
|
|
.cfi_restore %r12
|
|
movq 32(%rax),%rbp
|
|
.cfi_restore %rbp
|
|
movq 40(%rax),%rbx
|
|
.cfi_restore %rbx
|
|
leaq 48(%rax),%rsp
|
|
.cfi_def_cfa %rsp,8
|
|
.Lossl_rsaz_amm52x40_x2_avxifma256_epilogue:
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size ossl_rsaz_amm52x40_x2_avxifma256, .-ossl_rsaz_amm52x40_x2_avxifma256
|
|
.text

#-----------------------------------------------------------------------
# ossl_extract_multiplier_2x40_win5_avx
#
# Copies two 320-byte halves out of a table of 640-byte entries into the
# output buffer, each half picked by its own index.
#
# Register use follows the System V AMD64 integer-argument order
# (presumably called from C; confirm against the prototype):
#   In:   %rdi   output buffer, 640 bytes are written
#         %rsi   table base, 32 entries x 640 bytes = 20480 bytes are read
#         %rdx   index of the entry whose bytes 0..319 go to out[0..319]
#         %rcx   index of the entry whose bytes 320..639 go to out[320..639]
#   Clobbers: %rax, %rsi, %r10, %ymm0-%ymm15, flags
#
# Vector register roles:
#   ymm0-ymm9   accumulators holding the half selected so far
#   ymm10       scratch (index broadcast source, then table loads)
#   ymm11       running entry number, replicated in all four lanes
#   ymm12/13    first / second index, replicated in all four lanes
#   ymm14       the constant 1 in every lane (from .Lones)
#   ymm15       all-ones mask when ymm11 equals the wanted index
#
# Every entry of the table is loaded whatever the index values are; the
# wanted entry is kept with a blend mask, not with an index-dependent
# branch or address.
#
# The accumulators are cleared only once.  If the first index matches no
# entry number in 0..31 the first half of the output is all zero; if the
# second index matches none, the second half repeats the first half.
#
# NOTE(review): the indices are compared as full 64-bit values, so the
# caller must pass them with clean upper bits; confirm at the call site.
#-----------------------------------------------------------------------
.align	32
.globl	ossl_extract_multiplier_2x40_win5_avx
.type	ossl_extract_multiplier_2x40_win5_avx,@function
ossl_extract_multiplier_2x40_win5_avx:
.cfi_startproc
	.byte	0xf3,0x0f,0x1e,0xfa		# endbr64
	vmovapd	.Lones(%rip),%ymm14		# aligned load of the lane increment
	vmovq	%rdx,%xmm10
	vpbroadcastq	%xmm10,%ymm12		# ymm12 = first index in every lane
	vmovq	%rcx,%xmm10
	vpbroadcastq	%xmm10,%ymm13		# ymm13 = second index in every lane
	leaq	32*640(%rsi),%rax		# rax = one past the last table entry
	movq	%rsi,%r10			# keep the table base for the second pass

	# Clear the accumulators; the VEX-encoded xmm0 write also zeroes the
	# upper half of ymm0, which is then copied into ymm1..ymm9.
	vpxor	%xmm0,%xmm0,%xmm0
.irp	acc,1,2,3,4,5,6,7,8,9
	vmovapd	%ymm0,%ymm\acc
.endr
	vpxor	%ymm11,%ymm11,%ymm11		# entry number starts at zero

	# First pass: bytes 0..319 of each entry, selected by ymm12.
.align	32
.Lloop_0:
	vpcmpeqq	%ymm11,%ymm12,%ymm15	# mask = (entry number == first index)
.irp	acc,0,1,2,3,4,5,6,7,8,9
	vmovdqu	32*\acc(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm\acc,%ymm\acc
.endr
	vpaddq	%ymm14,%ymm11,%ymm11		# next entry number
	addq	$640,%rsi			# next table entry
	cmpq	%rsi,%rax
	jne	.Lloop_0

.irp	acc,0,1,2,3,4,5,6,7,8,9
	vmovdqu	%ymm\acc,32*\acc(%rdi)
.endr

	# Second pass: rewind to the table base and restart the entry number,
	# then take bytes 320..639 of each entry, selected by ymm13.
	movq	%r10,%rsi
	vpxor	%ymm11,%ymm11,%ymm11
.align	32
.Lloop_320:
	vpcmpeqq	%ymm11,%ymm13,%ymm15	# mask = (entry number == second index)
.irp	acc,0,1,2,3,4,5,6,7,8,9
	vmovdqu	320+32*\acc(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm\acc,%ymm\acc
.endr
	vpaddq	%ymm14,%ymm11,%ymm11		# next entry number
	addq	$640,%rsi			# next table entry
	cmpq	%rsi,%rax
	jne	.Lloop_320

.irp	acc,0,1,2,3,4,5,6,7,8,9
	vmovdqu	%ymm\acc,320+32*\acc(%rdi)
.endr

	vzeroupper				# clear upper ymm halves before returning
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x40_win5_avx, .-ossl_extract_multiplier_2x40_win5_avx
|
|
.section	.rodata
.align	32
# Read-only 32-byte vector constants, four qword lanes each.
# .Lones is fetched with an aligned vmovapd by
# ossl_extract_multiplier_2x40_win5_avx, so the 32-byte alignment above
# is required.  .Lzeros directly follows it and is therefore aligned too.
.Lones:
.rept	4
	.quad	1
.endr
.Lzeros:
.rept	4
	.quad	0
.endr
|
|
# GNU property note.  The numeric labels delimit the variable-length
# fields so that the sizes in the header are computed by the assembler.
# NOTE(review): the meanings given for the constants 5, 0xc0000002 and 3
# follow the x86-64 psABI GNU-property scheme; verify against that
# document before changing them.
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f			# name size: bytes between labels 0 and 1
.long 4f - 1f			# descriptor size: bytes between labels 1 and 4
.long 5				# note type (presumably NT_GNU_PROPERTY_TYPE_0)
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47,0x4e,0x55,0		# 'G', 'N', 'U', NUL
1:
.p2align 3
.long 0xc0000002		# property type (presumably GNU_PROPERTY_X86_FEATURE_1_AND)
.long 3f - 2f			# property data size: bytes between labels 2 and 3
2:
.long 3				# property data (presumably the IBT and SHSTK bits)
3:
.p2align 3
4:
|